433 lines
12 KiB
Rust
433 lines
12 KiB
Rust
use std::{cmp, collections::HashMap, convert::TryFrom};
|
|
|
|
use conduwuit::{err, utils, Config, Result};
|
|
use rocksdb::{
|
|
statistics::StatsLevel, BlockBasedOptions, Cache, DBCompactionStyle, DBCompressionType,
|
|
DBRecoveryMode, Env, LogLevel, LruCacheOptions, Options, UniversalCompactOptions,
|
|
UniversalCompactionStopStyle,
|
|
};
|
|
|
|
/// Create database-wide options suitable for opening the database. This also
|
|
/// sets our default column options in case of opening a column with the same
|
|
/// resulting value. Note that we require special per-column options on some
|
|
/// columns, therefor columns should only be opened after passing this result
|
|
/// through cf_options().
|
|
pub(crate) fn db_options(
|
|
config: &Config,
|
|
env: &mut Env,
|
|
row_cache: &Cache,
|
|
col_cache: &Cache,
|
|
) -> Result<Options> {
|
|
const DEFAULT_STATS_LEVEL: StatsLevel = if cfg!(debug_assertions) {
|
|
StatsLevel::ExceptDetailedTimers
|
|
} else {
|
|
StatsLevel::DisableAll
|
|
};
|
|
|
|
let mut opts = Options::default();
|
|
|
|
// Logging
|
|
set_logging_defaults(&mut opts, config);
|
|
|
|
// Processing
|
|
opts.set_max_background_jobs(num_threads::<i32>(config)?);
|
|
opts.set_max_subcompactions(num_threads::<u32>(config)?);
|
|
opts.set_avoid_unnecessary_blocking_io(true);
|
|
opts.set_max_file_opening_threads(0);
|
|
if config.rocksdb_compaction_prio_idle {
|
|
env.lower_thread_pool_cpu_priority();
|
|
}
|
|
|
|
// IO
|
|
opts.set_atomic_flush(true);
|
|
opts.set_manual_wal_flush(true);
|
|
opts.set_enable_pipelined_write(false);
|
|
if config.rocksdb_direct_io {
|
|
opts.set_use_direct_reads(true);
|
|
opts.set_use_direct_io_for_flush_and_compaction(true);
|
|
}
|
|
if config.rocksdb_optimize_for_spinning_disks {
|
|
// speeds up opening DB on hard drives
|
|
opts.set_skip_checking_sst_file_sizes_on_db_open(true);
|
|
opts.set_skip_stats_update_on_db_open(true);
|
|
//opts.set_max_file_opening_threads(threads.try_into().unwrap());
|
|
}
|
|
if config.rocksdb_compaction_ioprio_idle {
|
|
env.lower_thread_pool_io_priority();
|
|
}
|
|
|
|
// Blocks
|
|
let mut table_opts = table_options(config);
|
|
table_opts.set_block_cache(col_cache);
|
|
opts.set_block_based_table_factory(&table_opts);
|
|
opts.set_row_cache(row_cache);
|
|
|
|
// Buffers
|
|
opts.set_write_buffer_size(2 * 1024 * 1024);
|
|
opts.set_max_write_buffer_number(2);
|
|
opts.set_min_write_buffer_number(1);
|
|
|
|
// Files
|
|
opts.set_table_cache_num_shard_bits(7);
|
|
opts.set_max_total_wal_size(96 * 1024 * 1024);
|
|
set_level_defaults(&mut opts, config);
|
|
|
|
// Compression
|
|
set_compression_defaults(&mut opts, config);
|
|
|
|
// Misc
|
|
opts.create_if_missing(true);
|
|
opts.set_disable_auto_compactions(!config.rocksdb_compaction);
|
|
|
|
opts.set_statistics_level(match config.rocksdb_stats_level {
|
|
| 0 => StatsLevel::DisableAll,
|
|
| 1 => DEFAULT_STATS_LEVEL,
|
|
| 2 => StatsLevel::ExceptHistogramOrTimers,
|
|
| 3 => StatsLevel::ExceptTimers,
|
|
| 4 => StatsLevel::ExceptDetailedTimers,
|
|
| 5 => StatsLevel::ExceptTimeForMutex,
|
|
| 6_u8..=u8::MAX => StatsLevel::All,
|
|
});
|
|
|
|
// Default: https://github.com/facebook/rocksdb/wiki/WAL-Recovery-Modes#ktoleratecorruptedtailrecords
|
|
//
|
|
// Unclean shutdowns of a Matrix homeserver are likely to be fine when
|
|
// recovered in this manner as it's likely any lost information will be
|
|
// restored via federation.
|
|
opts.set_wal_recovery_mode(match config.rocksdb_recovery_mode {
|
|
| 0 => DBRecoveryMode::AbsoluteConsistency,
|
|
| 1 => DBRecoveryMode::TolerateCorruptedTailRecords,
|
|
| 2 => DBRecoveryMode::PointInTime,
|
|
| 3 => DBRecoveryMode::SkipAnyCorruptedRecord,
|
|
| 4_u8..=u8::MAX => unimplemented!(),
|
|
});
|
|
|
|
// <https://github.com/facebook/rocksdb/wiki/Track-WAL-in-MANIFEST>
|
|
// "We recommend to set track_and_verify_wals_in_manifest to true for
|
|
// production, it has been enabled in production for the entire database cluster
|
|
// serving the social graph for all Meta apps."
|
|
opts.set_track_and_verify_wals_in_manifest(true);
|
|
|
|
opts.set_paranoid_checks(config.rocksdb_paranoid_file_checks);
|
|
|
|
opts.set_env(env);
|
|
Ok(opts)
|
|
}
|
|
|
|
/// Adjust options for the specific column by name. Provide the result of
|
|
/// db_options() as the argument to this function and use the return value in
|
|
/// the arguments to open the specific column.
|
|
pub(crate) fn cf_options(
|
|
cfg: &Config,
|
|
name: &str,
|
|
mut opts: Options,
|
|
cache: &mut HashMap<String, Cache>,
|
|
) -> Result<Options> {
|
|
// Columns with non-default compaction options
|
|
match name {
|
|
| "backupid_algorithm"
|
|
| "backupid_etag"
|
|
| "backupkeyid_backup"
|
|
| "roomid_shortroomid"
|
|
| "shorteventid_shortstatehash"
|
|
| "shorteventid_eventid"
|
|
| "shortstatekey_statekey"
|
|
| "shortstatehash_statediff"
|
|
| "userdevicetxnid_response"
|
|
| "userfilterid_filter" => set_for_sequential_small_uc(&mut opts, cfg),
|
|
| &_ => {},
|
|
}
|
|
|
|
// Columns with non-default table/cache configs
|
|
match name {
|
|
| "shorteventid_eventid" => set_table_with_new_cache(
|
|
&mut opts,
|
|
cfg,
|
|
cache,
|
|
name,
|
|
cache_size(cfg, cfg.shorteventid_cache_capacity, 64)?,
|
|
),
|
|
|
|
| "eventid_shorteventid" => set_table_with_new_cache(
|
|
&mut opts,
|
|
cfg,
|
|
cache,
|
|
name,
|
|
cache_size(cfg, cfg.eventidshort_cache_capacity, 64)?,
|
|
),
|
|
|
|
| "eventid_pduid" => set_table_with_new_cache(
|
|
&mut opts,
|
|
cfg,
|
|
cache,
|
|
name,
|
|
cache_size(cfg, cfg.eventid_pdu_cache_capacity, 64)?,
|
|
),
|
|
|
|
| "shorteventid_authchain" => {
|
|
set_table_with_new_cache(
|
|
&mut opts,
|
|
cfg,
|
|
cache,
|
|
name,
|
|
cache_size(cfg, cfg.auth_chain_cache_capacity, 192)?,
|
|
);
|
|
},
|
|
|
|
| "shortstatekey_statekey" => set_table_with_new_cache(
|
|
&mut opts,
|
|
cfg,
|
|
cache,
|
|
name,
|
|
cache_size(cfg, cfg.shortstatekey_cache_capacity, 1024)?,
|
|
),
|
|
|
|
| "statekey_shortstatekey" => set_table_with_new_cache(
|
|
&mut opts,
|
|
cfg,
|
|
cache,
|
|
name,
|
|
cache_size(cfg, cfg.statekeyshort_cache_capacity, 1024)?,
|
|
),
|
|
|
|
| "servernameevent_data" => set_table_with_new_cache(
|
|
&mut opts,
|
|
cfg,
|
|
cache,
|
|
name,
|
|
cache_size(cfg, cfg.servernameevent_data_cache_capacity, 128)?, /* Raw average
|
|
* value size =
|
|
* 102, key
|
|
* size = 34 */
|
|
),
|
|
|
|
| "eventid_outlierpdu" => {
|
|
set_table_with_new_cache(
|
|
&mut opts,
|
|
cfg,
|
|
cache,
|
|
name,
|
|
cache_size(cfg, cfg.pdu_cache_capacity, 1536)?,
|
|
);
|
|
},
|
|
|
|
| "pduid_pdu" => {
|
|
set_table_with_shared_cache(&mut opts, cfg, cache, name, "eventid_outlierpdu");
|
|
},
|
|
|
|
| &_ => {},
|
|
}
|
|
|
|
Ok(opts)
|
|
}
|
|
|
|
fn set_logging_defaults(opts: &mut Options, config: &Config) {
|
|
let rocksdb_log_level = match config.rocksdb_log_level.as_ref() {
|
|
| "debug" => LogLevel::Debug,
|
|
| "info" => LogLevel::Info,
|
|
| "warn" => LogLevel::Warn,
|
|
| "fatal" => LogLevel::Fatal,
|
|
| _ => LogLevel::Error,
|
|
};
|
|
|
|
opts.set_log_level(rocksdb_log_level);
|
|
opts.set_max_log_file_size(config.rocksdb_max_log_file_size);
|
|
opts.set_log_file_time_to_roll(config.rocksdb_log_time_to_roll);
|
|
opts.set_keep_log_file_num(config.rocksdb_max_log_files);
|
|
opts.set_stats_dump_period_sec(0);
|
|
|
|
if config.rocksdb_log_stderr {
|
|
opts.set_stderr_logger(rocksdb_log_level, "rocksdb");
|
|
} else {
|
|
opts.set_callback_logger(rocksdb_log_level, &super::engine::handle_log);
|
|
}
|
|
}
|
|
|
|
fn set_compression_defaults(opts: &mut Options, config: &Config) {
|
|
let rocksdb_compression_algo = match config.rocksdb_compression_algo.as_ref() {
|
|
| "snappy" => DBCompressionType::Snappy,
|
|
| "zlib" => DBCompressionType::Zlib,
|
|
| "bz2" => DBCompressionType::Bz2,
|
|
| "lz4" => DBCompressionType::Lz4,
|
|
| "lz4hc" => DBCompressionType::Lz4hc,
|
|
| "none" => DBCompressionType::None,
|
|
| _ => DBCompressionType::Zstd,
|
|
};
|
|
|
|
if config.rocksdb_bottommost_compression {
|
|
opts.set_bottommost_compression_type(rocksdb_compression_algo);
|
|
opts.set_bottommost_zstd_max_train_bytes(0, true);
|
|
|
|
// -14 w_bits is only read by zlib.
|
|
opts.set_bottommost_compression_options(
|
|
-14,
|
|
config.rocksdb_bottommost_compression_level,
|
|
0,
|
|
0,
|
|
true,
|
|
);
|
|
}
|
|
|
|
// -14 w_bits is only read by zlib.
|
|
opts.set_compression_options(-14, config.rocksdb_compression_level, 0, 0);
|
|
opts.set_compression_type(rocksdb_compression_algo);
|
|
}
|
|
|
|
#[allow(dead_code)]
|
|
fn set_for_random_small_uc(opts: &mut Options, config: &Config) {
|
|
let uco = uc_options(config);
|
|
set_for_random_small(opts, config);
|
|
opts.set_universal_compaction_options(&uco);
|
|
opts.set_compaction_style(DBCompactionStyle::Universal);
|
|
}
|
|
|
|
fn set_for_sequential_small_uc(opts: &mut Options, config: &Config) {
|
|
let uco = uc_options(config);
|
|
set_for_sequential_small(opts, config);
|
|
opts.set_universal_compaction_options(&uco);
|
|
opts.set_compaction_style(DBCompactionStyle::Universal);
|
|
}
|
|
|
|
#[allow(dead_code)]
|
|
fn set_for_random_small(opts: &mut Options, config: &Config) {
|
|
set_for_random(opts, config);
|
|
|
|
opts.set_write_buffer_size(1024 * 128);
|
|
opts.set_target_file_size_base(1024 * 128);
|
|
opts.set_target_file_size_multiplier(2);
|
|
opts.set_max_bytes_for_level_base(1024 * 512);
|
|
opts.set_max_bytes_for_level_multiplier(2.0);
|
|
}
|
|
|
|
fn set_for_sequential_small(opts: &mut Options, config: &Config) {
|
|
set_for_sequential(opts, config);
|
|
|
|
opts.set_write_buffer_size(1024 * 512);
|
|
opts.set_target_file_size_base(1024 * 512);
|
|
opts.set_target_file_size_multiplier(2);
|
|
opts.set_max_bytes_for_level_base(1024 * 1024);
|
|
opts.set_max_bytes_for_level_multiplier(2.0);
|
|
}
|
|
|
|
fn set_for_random(opts: &mut Options, config: &Config) {
|
|
set_level_defaults(opts, config);
|
|
|
|
let pri = "compaction_pri=kOldestSmallestSeqFirst";
|
|
opts.set_options_from_string(pri)
|
|
.expect("set compaction priority string");
|
|
|
|
opts.set_max_bytes_for_level_base(8 * 1024 * 1024);
|
|
opts.set_max_bytes_for_level_multiplier(1.0);
|
|
opts.set_max_bytes_for_level_multiplier_additional(&[0, 1, 1, 3, 7, 15, 31]);
|
|
}
|
|
|
|
fn set_for_sequential(opts: &mut Options, config: &Config) {
|
|
set_level_defaults(opts, config);
|
|
|
|
let pri = "compaction_pri=kOldestLargestSeqFirst";
|
|
opts.set_options_from_string(pri)
|
|
.expect("set compaction priority string");
|
|
|
|
opts.set_target_file_size_base(2 * 1024 * 1024);
|
|
opts.set_target_file_size_multiplier(2);
|
|
|
|
opts.set_max_bytes_for_level_base(32 * 1024 * 1024);
|
|
opts.set_max_bytes_for_level_multiplier(1.0);
|
|
opts.set_max_bytes_for_level_multiplier_additional(&[0, 1, 1, 3, 7, 15, 31]);
|
|
}
|
|
|
|
fn set_level_defaults(opts: &mut Options, _config: &Config) {
|
|
opts.set_level_zero_file_num_compaction_trigger(2);
|
|
|
|
opts.set_target_file_size_base(1024 * 1024);
|
|
opts.set_target_file_size_multiplier(2);
|
|
|
|
opts.set_level_compaction_dynamic_level_bytes(false);
|
|
opts.set_max_bytes_for_level_base(16 * 1024 * 1024);
|
|
opts.set_max_bytes_for_level_multiplier(2.0);
|
|
|
|
opts.set_ttl(21 * 24 * 60 * 60);
|
|
}
|
|
|
|
fn uc_options(_config: &Config) -> UniversalCompactOptions {
|
|
let mut opts = UniversalCompactOptions::default();
|
|
|
|
opts.set_stop_style(UniversalCompactionStopStyle::Total);
|
|
opts.set_max_size_amplification_percent(10000);
|
|
opts.set_compression_size_percent(-1);
|
|
opts.set_size_ratio(1);
|
|
|
|
opts.set_min_merge_width(2);
|
|
opts.set_max_merge_width(16);
|
|
|
|
opts
|
|
}
|
|
|
|
fn set_table_with_new_cache(
|
|
opts: &mut Options,
|
|
config: &Config,
|
|
caches: &mut HashMap<String, Cache>,
|
|
name: &str,
|
|
size: usize,
|
|
) {
|
|
let mut cache_opts = LruCacheOptions::default();
|
|
cache_opts.set_capacity(size);
|
|
cache_opts.set_num_shard_bits(7);
|
|
|
|
let cache = Cache::new_lru_cache_opts(&cache_opts);
|
|
caches.insert(name.into(), cache);
|
|
|
|
set_table_with_shared_cache(opts, config, caches, name, name);
|
|
}
|
|
|
|
fn set_table_with_shared_cache(
|
|
opts: &mut Options,
|
|
config: &Config,
|
|
cache: &HashMap<String, Cache>,
|
|
_name: &str,
|
|
cache_name: &str,
|
|
) {
|
|
let mut table = table_options(config);
|
|
table.set_block_cache(
|
|
cache
|
|
.get(cache_name)
|
|
.expect("existing cache to share with this column"),
|
|
);
|
|
|
|
opts.set_block_based_table_factory(&table);
|
|
}
|
|
|
|
fn cache_size(config: &Config, base_size: u32, entity_size: usize) -> Result<usize> {
|
|
let ents = f64::from(base_size) * config.cache_capacity_modifier;
|
|
|
|
#[allow(clippy::as_conversions, clippy::cast_sign_loss, clippy::cast_possible_truncation)]
|
|
(ents as usize)
|
|
.checked_mul(entity_size)
|
|
.ok_or_else(|| err!(Config("cache_capacity_modifier", "Cache size is too large.")))
|
|
}
|
|
|
|
fn table_options(_config: &Config) -> BlockBasedOptions {
|
|
let mut opts = BlockBasedOptions::default();
|
|
|
|
opts.set_block_size(4 * 1024);
|
|
opts.set_metadata_block_size(4 * 1024);
|
|
|
|
opts.set_use_delta_encoding(false);
|
|
opts.set_optimize_filters_for_memory(true);
|
|
opts.set_cache_index_and_filter_blocks(true);
|
|
opts.set_pin_top_level_index_and_filter(true);
|
|
|
|
opts
|
|
}
|
|
|
|
fn num_threads<T: TryFrom<usize>>(config: &Config) -> Result<T> {
|
|
const MIN_PARALLELISM: usize = 2;
|
|
|
|
let requested = if config.rocksdb_parallelism_threads != 0 {
|
|
config.rocksdb_parallelism_threads
|
|
} else {
|
|
utils::available_parallelism()
|
|
};
|
|
|
|
utils::math::try_into::<T, usize>(cmp::max(MIN_PARALLELISM, requested))
|
|
}
|