refactor rocksdb opts; split kvtree
Signed-off-by: Jason Volk <jason@zemos.net>
This commit is contained in:
parent
bade4ed17f
commit
a83da4f17b
3 changed files with 449 additions and 308 deletions
197
src/database/rocksdb/kvtree.rs
Normal file
197
src/database/rocksdb/kvtree.rs
Normal file
|
@ -0,0 +1,197 @@
|
||||||
|
use std::{future::Future, pin::Pin, sync::Arc};
|
||||||
|
|
||||||
|
use rust_rocksdb::WriteBatchWithTransaction;
|
||||||
|
|
||||||
|
use super::{watchers::Watchers, Engine, KeyValueDatabaseEngine, KvTree};
|
||||||
|
use crate::{utils, Result};
|
||||||
|
|
||||||
|
pub(super) struct RocksDbEngineTree<'a> {
|
||||||
|
pub db: Arc<Engine>,
|
||||||
|
pub name: &'a str,
|
||||||
|
pub watchers: Watchers,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RocksDbEngineTree<'_> {
|
||||||
|
fn cf(&self) -> Arc<rust_rocksdb::BoundColumnFamily<'_>> { self.db.rocks.cf_handle(self.name).unwrap() }
|
||||||
|
}
|
||||||
|
|
||||||
|
impl KvTree for RocksDbEngineTree<'_> {
|
||||||
|
fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
|
||||||
|
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
||||||
|
readoptions.set_total_order_seek(true);
|
||||||
|
|
||||||
|
Ok(self.db.rocks.get_cf_opt(&self.cf(), key, &readoptions)?)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn multi_get(
|
||||||
|
&self, iter: Vec<(&Arc<rust_rocksdb::BoundColumnFamily<'_>>, Vec<u8>)>,
|
||||||
|
) -> Vec<Result<Option<Vec<u8>>, rust_rocksdb::Error>> {
|
||||||
|
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
||||||
|
readoptions.set_total_order_seek(true);
|
||||||
|
|
||||||
|
self.db.rocks.multi_get_cf_opt(iter, &readoptions)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn insert(&self, key: &[u8], value: &[u8]) -> Result<()> {
|
||||||
|
let writeoptions = rust_rocksdb::WriteOptions::default();
|
||||||
|
|
||||||
|
self.db
|
||||||
|
.rocks
|
||||||
|
.put_cf_opt(&self.cf(), key, value, &writeoptions)?;
|
||||||
|
|
||||||
|
if !self.db.corked() {
|
||||||
|
self.db.flush()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
self.watchers.wake(key);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn insert_batch(&self, iter: &mut dyn Iterator<Item = (Vec<u8>, Vec<u8>)>) -> Result<()> {
|
||||||
|
let writeoptions = rust_rocksdb::WriteOptions::default();
|
||||||
|
|
||||||
|
let mut batch = WriteBatchWithTransaction::<false>::default();
|
||||||
|
|
||||||
|
for (key, value) in iter {
|
||||||
|
batch.put_cf(&self.cf(), key, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
let result = self.db.rocks.write_opt(batch, &writeoptions);
|
||||||
|
|
||||||
|
if !self.db.corked() {
|
||||||
|
self.db.flush()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(result?)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn remove(&self, key: &[u8]) -> Result<()> {
|
||||||
|
let writeoptions = rust_rocksdb::WriteOptions::default();
|
||||||
|
|
||||||
|
let result = self.db.rocks.delete_cf_opt(&self.cf(), key, &writeoptions);
|
||||||
|
|
||||||
|
if !self.db.corked() {
|
||||||
|
self.db.flush()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(result?)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn remove_batch(&self, iter: &mut dyn Iterator<Item = Vec<u8>>) -> Result<()> {
|
||||||
|
let writeoptions = rust_rocksdb::WriteOptions::default();
|
||||||
|
|
||||||
|
let mut batch = WriteBatchWithTransaction::<false>::default();
|
||||||
|
|
||||||
|
for key in iter {
|
||||||
|
batch.delete_cf(&self.cf(), key);
|
||||||
|
}
|
||||||
|
|
||||||
|
let result = self.db.rocks.write_opt(batch, &writeoptions);
|
||||||
|
|
||||||
|
if !self.db.corked() {
|
||||||
|
self.db.flush()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(result?)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
|
||||||
|
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
||||||
|
readoptions.set_total_order_seek(true);
|
||||||
|
|
||||||
|
Box::new(
|
||||||
|
self.db
|
||||||
|
.rocks
|
||||||
|
.iterator_cf_opt(&self.cf(), readoptions, rust_rocksdb::IteratorMode::Start)
|
||||||
|
.map(Result::unwrap)
|
||||||
|
.map(|(k, v)| (Vec::from(k), Vec::from(v))),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn iter_from<'a>(&'a self, from: &[u8], backwards: bool) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
|
||||||
|
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
||||||
|
readoptions.set_total_order_seek(true);
|
||||||
|
|
||||||
|
Box::new(
|
||||||
|
self.db
|
||||||
|
.rocks
|
||||||
|
.iterator_cf_opt(
|
||||||
|
&self.cf(),
|
||||||
|
readoptions,
|
||||||
|
rust_rocksdb::IteratorMode::From(
|
||||||
|
from,
|
||||||
|
if backwards {
|
||||||
|
rust_rocksdb::Direction::Reverse
|
||||||
|
} else {
|
||||||
|
rust_rocksdb::Direction::Forward
|
||||||
|
},
|
||||||
|
),
|
||||||
|
)
|
||||||
|
.map(Result::unwrap)
|
||||||
|
.map(|(k, v)| (Vec::from(k), Vec::from(v))),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn increment(&self, key: &[u8]) -> Result<Vec<u8>> {
|
||||||
|
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
||||||
|
readoptions.set_total_order_seek(true);
|
||||||
|
let writeoptions = rust_rocksdb::WriteOptions::default();
|
||||||
|
|
||||||
|
let old = self.db.rocks.get_cf_opt(&self.cf(), key, &readoptions)?;
|
||||||
|
let new = utils::increment(old.as_deref());
|
||||||
|
self.db
|
||||||
|
.rocks
|
||||||
|
.put_cf_opt(&self.cf(), key, &new, &writeoptions)?;
|
||||||
|
|
||||||
|
if !self.db.corked() {
|
||||||
|
self.db.flush()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(new)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn increment_batch(&self, iter: &mut dyn Iterator<Item = Vec<u8>>) -> Result<()> {
|
||||||
|
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
||||||
|
readoptions.set_total_order_seek(true);
|
||||||
|
let writeoptions = rust_rocksdb::WriteOptions::default();
|
||||||
|
|
||||||
|
let mut batch = WriteBatchWithTransaction::<false>::default();
|
||||||
|
|
||||||
|
for key in iter {
|
||||||
|
let old = self.db.rocks.get_cf_opt(&self.cf(), &key, &readoptions)?;
|
||||||
|
let new = utils::increment(old.as_deref());
|
||||||
|
batch.put_cf(&self.cf(), key, new);
|
||||||
|
}
|
||||||
|
|
||||||
|
self.db.rocks.write_opt(batch, &writeoptions)?;
|
||||||
|
|
||||||
|
if !self.db.corked() {
|
||||||
|
self.db.flush()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn scan_prefix<'a>(&'a self, prefix: Vec<u8>) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
|
||||||
|
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
||||||
|
readoptions.set_total_order_seek(true);
|
||||||
|
|
||||||
|
Box::new(
|
||||||
|
self.db
|
||||||
|
.rocks
|
||||||
|
.iterator_cf_opt(
|
||||||
|
&self.cf(),
|
||||||
|
readoptions,
|
||||||
|
rust_rocksdb::IteratorMode::From(&prefix, rust_rocksdb::Direction::Forward),
|
||||||
|
)
|
||||||
|
.map(Result::unwrap)
|
||||||
|
.map(|(k, v)| (Vec::from(k), Vec::from(v)))
|
||||||
|
.take_while(move |(k, _)| k.starts_with(&prefix)),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn watch_prefix<'a>(&'a self, prefix: &[u8]) -> Pin<Box<dyn Future<Output = ()> + Send + 'a>> {
|
||||||
|
self.watchers.watch(prefix)
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,20 +1,22 @@
|
||||||
use std::{
|
use std::sync::{atomic::AtomicU32, Arc};
|
||||||
future::Future,
|
|
||||||
pin::Pin,
|
|
||||||
sync::{atomic::AtomicU32, Arc},
|
|
||||||
};
|
|
||||||
|
|
||||||
use chrono::{DateTime, Utc};
|
use chrono::{DateTime, Utc};
|
||||||
use rust_rocksdb::{
|
use rust_rocksdb::{
|
||||||
backup::{BackupEngine, BackupEngineOptions},
|
backup::{BackupEngine, BackupEngineOptions},
|
||||||
DBWithThreadMode as Db,
|
DBWithThreadMode as Db, MultiThreaded,
|
||||||
LogLevel::{Debug, Error, Fatal, Info, Warn},
|
|
||||||
MultiThreaded, WriteBatchWithTransaction,
|
|
||||||
};
|
};
|
||||||
use tracing::{debug, error, info, warn};
|
use tracing::{debug, error, info, warn};
|
||||||
|
|
||||||
use super::{super::Config, watchers::Watchers, KeyValueDatabaseEngine, KvTree};
|
use super::{super::Config, watchers::Watchers, KeyValueDatabaseEngine, KvTree};
|
||||||
use crate::{utils, Result};
|
use crate::Result;
|
||||||
|
|
||||||
|
pub(crate) mod kvtree;
|
||||||
|
pub(crate) mod opts;
|
||||||
|
|
||||||
|
use kvtree::RocksDbEngineTree;
|
||||||
|
use opts::{cf_options, db_options};
|
||||||
|
|
||||||
|
use super::watchers;
|
||||||
|
|
||||||
pub(crate) struct Engine {
|
pub(crate) struct Engine {
|
||||||
rocks: Db<MultiThreaded>,
|
rocks: Db<MultiThreaded>,
|
||||||
|
@ -27,113 +29,6 @@ pub(crate) struct Engine {
|
||||||
corks: AtomicU32,
|
corks: AtomicU32,
|
||||||
}
|
}
|
||||||
|
|
||||||
struct RocksDbEngineTree<'a> {
|
|
||||||
db: Arc<Engine>,
|
|
||||||
name: &'a str,
|
|
||||||
watchers: Watchers,
|
|
||||||
}
|
|
||||||
|
|
||||||
fn db_options(
|
|
||||||
config: &Config, env: &rust_rocksdb::Env, row_cache: &rust_rocksdb::Cache, col_cache: &rust_rocksdb::Cache,
|
|
||||||
) -> rust_rocksdb::Options {
|
|
||||||
// database options: https://docs.rs/rocksdb/latest/rocksdb/struct.Options.html#
|
|
||||||
let mut db_opts = rust_rocksdb::Options::default();
|
|
||||||
|
|
||||||
// Logging
|
|
||||||
let rocksdb_log_level = match config.rocksdb_log_level.as_ref() {
|
|
||||||
"debug" => Debug,
|
|
||||||
"info" => Info,
|
|
||||||
"warn" => Warn,
|
|
||||||
"fatal" => Fatal,
|
|
||||||
_ => Error,
|
|
||||||
};
|
|
||||||
db_opts.set_log_level(rocksdb_log_level);
|
|
||||||
db_opts.set_max_log_file_size(config.rocksdb_max_log_file_size);
|
|
||||||
db_opts.set_log_file_time_to_roll(config.rocksdb_log_time_to_roll);
|
|
||||||
db_opts.set_keep_log_file_num(config.rocksdb_max_log_files);
|
|
||||||
|
|
||||||
// Processing
|
|
||||||
let threads = if config.rocksdb_parallelism_threads == 0 {
|
|
||||||
num_cpus::get_physical() // max cores if user specified 0
|
|
||||||
} else {
|
|
||||||
config.rocksdb_parallelism_threads
|
|
||||||
};
|
|
||||||
|
|
||||||
db_opts.set_max_background_jobs(threads.try_into().unwrap());
|
|
||||||
db_opts.set_max_subcompactions(threads.try_into().unwrap());
|
|
||||||
|
|
||||||
// IO
|
|
||||||
db_opts.set_manual_wal_flush(true);
|
|
||||||
db_opts.set_use_direct_reads(true);
|
|
||||||
db_opts.set_use_direct_io_for_flush_and_compaction(true);
|
|
||||||
if config.rocksdb_optimize_for_spinning_disks {
|
|
||||||
db_opts.set_skip_stats_update_on_db_open(true); // speeds up opening DB on hard
|
|
||||||
// drives
|
|
||||||
}
|
|
||||||
|
|
||||||
// Blocks
|
|
||||||
let mut block_based_options = rust_rocksdb::BlockBasedOptions::default();
|
|
||||||
block_based_options.set_block_size(4 * 1024);
|
|
||||||
block_based_options.set_metadata_block_size(4 * 1024);
|
|
||||||
block_based_options.set_bloom_filter(9.6, true);
|
|
||||||
block_based_options.set_optimize_filters_for_memory(true);
|
|
||||||
block_based_options.set_cache_index_and_filter_blocks(true);
|
|
||||||
block_based_options.set_pin_top_level_index_and_filter(true);
|
|
||||||
block_based_options.set_block_cache(col_cache);
|
|
||||||
db_opts.set_row_cache(row_cache);
|
|
||||||
|
|
||||||
// Buffers
|
|
||||||
db_opts.set_write_buffer_size(2 * 1024 * 1024);
|
|
||||||
db_opts.set_max_write_buffer_number(2);
|
|
||||||
db_opts.set_min_write_buffer_number(1);
|
|
||||||
|
|
||||||
// Files
|
|
||||||
db_opts.set_level_zero_file_num_compaction_trigger(1);
|
|
||||||
db_opts.set_target_file_size_base(64 * 1024 * 1024);
|
|
||||||
db_opts.set_max_bytes_for_level_base(128 * 1024 * 1024);
|
|
||||||
db_opts.set_ttl(14 * 24 * 60 * 60);
|
|
||||||
|
|
||||||
// Compression
|
|
||||||
let rocksdb_compression_algo = match config.rocksdb_compression_algo.as_ref() {
|
|
||||||
"zlib" => rust_rocksdb::DBCompressionType::Zlib,
|
|
||||||
"lz4" => rust_rocksdb::DBCompressionType::Lz4,
|
|
||||||
"bz2" => rust_rocksdb::DBCompressionType::Bz2,
|
|
||||||
_ => rust_rocksdb::DBCompressionType::Zstd,
|
|
||||||
};
|
|
||||||
|
|
||||||
if config.rocksdb_bottommost_compression {
|
|
||||||
db_opts.set_bottommost_compression_type(rocksdb_compression_algo);
|
|
||||||
db_opts.set_bottommost_zstd_max_train_bytes(0, true);
|
|
||||||
|
|
||||||
// -14 w_bits is only read by zlib.
|
|
||||||
db_opts.set_bottommost_compression_options(-14, config.rocksdb_bottommost_compression_level, 0, 0, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
// -14 w_bits is only read by zlib.
|
|
||||||
db_opts.set_compression_options(-14, config.rocksdb_compression_level, 0, 0);
|
|
||||||
db_opts.set_compression_type(rocksdb_compression_algo);
|
|
||||||
|
|
||||||
// Misc
|
|
||||||
db_opts.create_if_missing(true);
|
|
||||||
|
|
||||||
// Default: https://github.com/facebook/rocksdb/wiki/WAL-Recovery-Modes#ktoleratecorruptedtailrecords
|
|
||||||
//
|
|
||||||
// Unclean shutdowns of a Matrix homeserver are likely to be fine when
|
|
||||||
// recovered in this manner as it's likely any lost information will be
|
|
||||||
// restored via federation.
|
|
||||||
db_opts.set_wal_recovery_mode(match config.rocksdb_recovery_mode {
|
|
||||||
0 => rust_rocksdb::DBRecoveryMode::AbsoluteConsistency,
|
|
||||||
1 => rust_rocksdb::DBRecoveryMode::TolerateCorruptedTailRecords,
|
|
||||||
2 => rust_rocksdb::DBRecoveryMode::PointInTime,
|
|
||||||
3 => rust_rocksdb::DBRecoveryMode::SkipAnyCorruptedRecord,
|
|
||||||
4_u8..=u8::MAX => unimplemented!(),
|
|
||||||
});
|
|
||||||
|
|
||||||
db_opts.set_block_based_table_factory(&block_based_options);
|
|
||||||
db_opts.set_env(env);
|
|
||||||
db_opts
|
|
||||||
}
|
|
||||||
|
|
||||||
impl KeyValueDatabaseEngine for Arc<Engine> {
|
impl KeyValueDatabaseEngine for Arc<Engine> {
|
||||||
fn open(config: &Config) -> Result<Self> {
|
fn open(config: &Config) -> Result<Self> {
|
||||||
let cache_capacity_bytes = config.db_cache_capacity_mb * 1024.0 * 1024.0;
|
let cache_capacity_bytes = config.db_cache_capacity_mb * 1024.0 * 1024.0;
|
||||||
|
@ -145,9 +40,6 @@ impl KeyValueDatabaseEngine for Arc<Engine> {
|
||||||
let col_cache = rust_rocksdb::Cache::new_lru_cache(col_cache_capacity_bytes);
|
let col_cache = rust_rocksdb::Cache::new_lru_cache(col_cache_capacity_bytes);
|
||||||
let db_opts = db_options(config, &db_env, &row_cache, &col_cache);
|
let db_opts = db_options(config, &db_env, &row_cache, &col_cache);
|
||||||
|
|
||||||
debug!("Listing column families in database");
|
|
||||||
let cfs = Db::<MultiThreaded>::list_cf(&db_opts, &config.database_path).unwrap_or_default();
|
|
||||||
|
|
||||||
if config.rocksdb_repair {
|
if config.rocksdb_repair {
|
||||||
warn!("Starting database repair. This may take a long time...");
|
warn!("Starting database repair. This may take a long time...");
|
||||||
if let Err(e) = Db::<MultiThreaded>::repair(&db_opts, &config.database_path) {
|
if let Err(e) = Db::<MultiThreaded>::repair(&db_opts, &config.database_path) {
|
||||||
|
@ -155,21 +47,23 @@ impl KeyValueDatabaseEngine for Arc<Engine> {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
debug!("Opening {} column family descriptors in database", cfs.len());
|
debug!("Listing column families in database");
|
||||||
info!("RocksDB database compaction will take place now, a delay in startup is expected");
|
let cfs = Db::<MultiThreaded>::list_cf(&db_opts, &config.database_path).unwrap_or_default();
|
||||||
|
|
||||||
|
debug!("Opening {} column family descriptors in database", cfs.len());
|
||||||
let cfds = cfs
|
let cfds = cfs
|
||||||
.iter()
|
.iter()
|
||||||
.map(|name| rust_rocksdb::ColumnFamilyDescriptor::new(name, db_opts.clone()))
|
.map(|name| rust_rocksdb::ColumnFamilyDescriptor::new(name, cf_options(name, db_opts.clone(), config)))
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
debug!("Opening database...");
|
||||||
let db = if config.rocksdb_read_only {
|
let db = if config.rocksdb_read_only {
|
||||||
Db::<MultiThreaded>::open_cf_for_read_only(&db_opts, &config.database_path, cfs.clone(), false)?
|
Db::<MultiThreaded>::open_cf_for_read_only(&db_opts, &config.database_path, cfs.clone(), false)?
|
||||||
} else {
|
} else {
|
||||||
Db::<MultiThreaded>::open_cf_descriptors(&db_opts, &config.database_path, cfds)?
|
Db::<MultiThreaded>::open_cf_descriptors(&db_opts, &config.database_path, cfds)?
|
||||||
};
|
};
|
||||||
|
|
||||||
debug!("Opened database at sequence number {}", db.latest_sequence_number());
|
info!("Opened database at sequence number {}", db.latest_sequence_number());
|
||||||
Ok(Arc::new(Engine {
|
Ok(Arc::new(Engine {
|
||||||
rocks: db,
|
rocks: db,
|
||||||
row_cache,
|
row_cache,
|
||||||
|
@ -346,188 +240,3 @@ impl KeyValueDatabaseEngine for Arc<Engine> {
|
||||||
#[allow(dead_code)]
|
#[allow(dead_code)]
|
||||||
fn clear_caches(&self) {}
|
fn clear_caches(&self) {}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl RocksDbEngineTree<'_> {
|
|
||||||
fn cf(&self) -> Arc<rust_rocksdb::BoundColumnFamily<'_>> { self.db.rocks.cf_handle(self.name).unwrap() }
|
|
||||||
}
|
|
||||||
|
|
||||||
impl KvTree for RocksDbEngineTree<'_> {
|
|
||||||
fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
|
|
||||||
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
|
||||||
readoptions.set_total_order_seek(true);
|
|
||||||
|
|
||||||
Ok(self.db.rocks.get_cf_opt(&self.cf(), key, &readoptions)?)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn multi_get(
|
|
||||||
&self, iter: Vec<(&Arc<rust_rocksdb::BoundColumnFamily<'_>>, Vec<u8>)>,
|
|
||||||
) -> Vec<Result<Option<Vec<u8>>, rust_rocksdb::Error>> {
|
|
||||||
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
|
||||||
readoptions.set_total_order_seek(true);
|
|
||||||
|
|
||||||
self.db.rocks.multi_get_cf_opt(iter, &readoptions)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn insert(&self, key: &[u8], value: &[u8]) -> Result<()> {
|
|
||||||
let writeoptions = rust_rocksdb::WriteOptions::default();
|
|
||||||
|
|
||||||
self.db
|
|
||||||
.rocks
|
|
||||||
.put_cf_opt(&self.cf(), key, value, &writeoptions)?;
|
|
||||||
|
|
||||||
if !self.db.corked() {
|
|
||||||
self.db.flush()?;
|
|
||||||
}
|
|
||||||
|
|
||||||
self.watchers.wake(key);
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn insert_batch(&self, iter: &mut dyn Iterator<Item = (Vec<u8>, Vec<u8>)>) -> Result<()> {
|
|
||||||
let writeoptions = rust_rocksdb::WriteOptions::default();
|
|
||||||
|
|
||||||
let mut batch = WriteBatchWithTransaction::<false>::default();
|
|
||||||
|
|
||||||
for (key, value) in iter {
|
|
||||||
batch.put_cf(&self.cf(), key, value);
|
|
||||||
}
|
|
||||||
|
|
||||||
let result = self.db.rocks.write_opt(batch, &writeoptions);
|
|
||||||
|
|
||||||
if !self.db.corked() {
|
|
||||||
self.db.flush()?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(result?)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn remove(&self, key: &[u8]) -> Result<()> {
|
|
||||||
let writeoptions = rust_rocksdb::WriteOptions::default();
|
|
||||||
|
|
||||||
let result = self.db.rocks.delete_cf_opt(&self.cf(), key, &writeoptions);
|
|
||||||
|
|
||||||
if !self.db.corked() {
|
|
||||||
self.db.flush()?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(result?)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn remove_batch(&self, iter: &mut dyn Iterator<Item = Vec<u8>>) -> Result<()> {
|
|
||||||
let writeoptions = rust_rocksdb::WriteOptions::default();
|
|
||||||
|
|
||||||
let mut batch = WriteBatchWithTransaction::<false>::default();
|
|
||||||
|
|
||||||
for key in iter {
|
|
||||||
batch.delete_cf(&self.cf(), key);
|
|
||||||
}
|
|
||||||
|
|
||||||
let result = self.db.rocks.write_opt(batch, &writeoptions);
|
|
||||||
|
|
||||||
if !self.db.corked() {
|
|
||||||
self.db.flush()?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(result?)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
|
|
||||||
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
|
||||||
readoptions.set_total_order_seek(true);
|
|
||||||
|
|
||||||
Box::new(
|
|
||||||
self.db
|
|
||||||
.rocks
|
|
||||||
.iterator_cf_opt(&self.cf(), readoptions, rust_rocksdb::IteratorMode::Start)
|
|
||||||
.map(Result::unwrap)
|
|
||||||
.map(|(k, v)| (Vec::from(k), Vec::from(v))),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn iter_from<'a>(&'a self, from: &[u8], backwards: bool) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
|
|
||||||
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
|
||||||
readoptions.set_total_order_seek(true);
|
|
||||||
|
|
||||||
Box::new(
|
|
||||||
self.db
|
|
||||||
.rocks
|
|
||||||
.iterator_cf_opt(
|
|
||||||
&self.cf(),
|
|
||||||
readoptions,
|
|
||||||
rust_rocksdb::IteratorMode::From(
|
|
||||||
from,
|
|
||||||
if backwards {
|
|
||||||
rust_rocksdb::Direction::Reverse
|
|
||||||
} else {
|
|
||||||
rust_rocksdb::Direction::Forward
|
|
||||||
},
|
|
||||||
),
|
|
||||||
)
|
|
||||||
.map(Result::unwrap)
|
|
||||||
.map(|(k, v)| (Vec::from(k), Vec::from(v))),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn increment(&self, key: &[u8]) -> Result<Vec<u8>> {
|
|
||||||
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
|
||||||
readoptions.set_total_order_seek(true);
|
|
||||||
let writeoptions = rust_rocksdb::WriteOptions::default();
|
|
||||||
|
|
||||||
let old = self.db.rocks.get_cf_opt(&self.cf(), key, &readoptions)?;
|
|
||||||
let new = utils::increment(old.as_deref());
|
|
||||||
self.db
|
|
||||||
.rocks
|
|
||||||
.put_cf_opt(&self.cf(), key, &new, &writeoptions)?;
|
|
||||||
|
|
||||||
if !self.db.corked() {
|
|
||||||
self.db.flush()?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(new)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn increment_batch(&self, iter: &mut dyn Iterator<Item = Vec<u8>>) -> Result<()> {
|
|
||||||
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
|
||||||
readoptions.set_total_order_seek(true);
|
|
||||||
let writeoptions = rust_rocksdb::WriteOptions::default();
|
|
||||||
|
|
||||||
let mut batch = WriteBatchWithTransaction::<false>::default();
|
|
||||||
|
|
||||||
for key in iter {
|
|
||||||
let old = self.db.rocks.get_cf_opt(&self.cf(), &key, &readoptions)?;
|
|
||||||
let new = utils::increment(old.as_deref());
|
|
||||||
batch.put_cf(&self.cf(), key, new);
|
|
||||||
}
|
|
||||||
|
|
||||||
self.db.rocks.write_opt(batch, &writeoptions)?;
|
|
||||||
|
|
||||||
if !self.db.corked() {
|
|
||||||
self.db.flush()?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
fn scan_prefix<'a>(&'a self, prefix: Vec<u8>) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
|
|
||||||
let mut readoptions = rust_rocksdb::ReadOptions::default();
|
|
||||||
readoptions.set_total_order_seek(true);
|
|
||||||
|
|
||||||
Box::new(
|
|
||||||
self.db
|
|
||||||
.rocks
|
|
||||||
.iterator_cf_opt(
|
|
||||||
&self.cf(),
|
|
||||||
readoptions,
|
|
||||||
rust_rocksdb::IteratorMode::From(&prefix, rust_rocksdb::Direction::Forward),
|
|
||||||
)
|
|
||||||
.map(Result::unwrap)
|
|
||||||
.map(|(k, v)| (Vec::from(k), Vec::from(v)))
|
|
||||||
.take_while(move |(k, _)| k.starts_with(&prefix)),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn watch_prefix<'a>(&'a self, prefix: &[u8]) -> Pin<Box<dyn Future<Output = ()> + Send + 'a>> {
|
|
||||||
self.watchers.watch(prefix)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
235
src/database/rocksdb/opts.rs
Normal file
235
src/database/rocksdb/opts.rs
Normal file
|
@ -0,0 +1,235 @@
|
||||||
|
#![allow(dead_code)]
|
||||||
|
|
||||||
|
use rust_rocksdb::{
|
||||||
|
BlockBasedOptions, Cache, DBCompactionStyle, DBCompressionType, DBRecoveryMode, Env, LogLevel, Options,
|
||||||
|
UniversalCompactOptions, UniversalCompactionStopStyle,
|
||||||
|
};
|
||||||
|
|
||||||
|
use super::Config;
|
||||||
|
|
||||||
|
/// Create database-wide options suitable for opening the database. This also
|
||||||
|
/// sets our default column options in case of opening a column with the same
|
||||||
|
/// resulting value. Note that we require special per-column options on some
|
||||||
|
/// columns, therefor columns should only be opened after passing this result
|
||||||
|
/// through cf_options().
|
||||||
|
pub(crate) fn db_options(config: &Config, env: &Env, row_cache: &Cache, col_cache: &Cache) -> Options {
|
||||||
|
let mut opts = Options::default();
|
||||||
|
|
||||||
|
// Logging
|
||||||
|
set_logging_defaults(&mut opts, config);
|
||||||
|
|
||||||
|
// Processing
|
||||||
|
let threads = if config.rocksdb_parallelism_threads == 0 {
|
||||||
|
num_cpus::get_physical() // max cores if user specified 0
|
||||||
|
} else {
|
||||||
|
config.rocksdb_parallelism_threads
|
||||||
|
};
|
||||||
|
|
||||||
|
opts.set_max_background_jobs(threads.try_into().unwrap());
|
||||||
|
opts.set_max_subcompactions(threads.try_into().unwrap());
|
||||||
|
opts.set_max_file_opening_threads(0);
|
||||||
|
|
||||||
|
// IO
|
||||||
|
opts.set_manual_wal_flush(true);
|
||||||
|
opts.set_use_direct_reads(true);
|
||||||
|
opts.set_use_direct_io_for_flush_and_compaction(true);
|
||||||
|
if config.rocksdb_optimize_for_spinning_disks {
|
||||||
|
// speeds up opening DB on hard drives
|
||||||
|
opts.set_skip_checking_sst_file_sizes_on_db_open(true);
|
||||||
|
opts.set_skip_stats_update_on_db_open(true);
|
||||||
|
//opts.set_max_file_opening_threads(threads.try_into().unwrap());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Blocks
|
||||||
|
let mut table_opts = table_options(config);
|
||||||
|
table_opts.set_block_cache(col_cache);
|
||||||
|
opts.set_row_cache(row_cache);
|
||||||
|
|
||||||
|
// Buffers
|
||||||
|
opts.set_write_buffer_size(2 * 1024 * 1024);
|
||||||
|
opts.set_max_write_buffer_number(2);
|
||||||
|
opts.set_min_write_buffer_number(1);
|
||||||
|
|
||||||
|
// Files
|
||||||
|
opts.set_max_total_wal_size(96 * 1024 * 1024);
|
||||||
|
opts.set_level_zero_file_num_compaction_trigger(2);
|
||||||
|
set_level_defaults(&mut opts, config);
|
||||||
|
opts.set_ttl(14 * 24 * 60 * 60);
|
||||||
|
|
||||||
|
// Compression
|
||||||
|
set_compression_defaults(&mut opts, config);
|
||||||
|
|
||||||
|
// Misc
|
||||||
|
opts.create_if_missing(true);
|
||||||
|
|
||||||
|
// Default: https://github.com/facebook/rocksdb/wiki/WAL-Recovery-Modes#ktoleratecorruptedtailrecords
|
||||||
|
//
|
||||||
|
// Unclean shutdowns of a Matrix homeserver are likely to be fine when
|
||||||
|
// recovered in this manner as it's likely any lost information will be
|
||||||
|
// restored via federation.
|
||||||
|
opts.set_wal_recovery_mode(match config.rocksdb_recovery_mode {
|
||||||
|
0 => DBRecoveryMode::AbsoluteConsistency,
|
||||||
|
1 => DBRecoveryMode::TolerateCorruptedTailRecords,
|
||||||
|
2 => DBRecoveryMode::PointInTime,
|
||||||
|
3 => DBRecoveryMode::SkipAnyCorruptedRecord,
|
||||||
|
4_u8..=u8::MAX => unimplemented!(),
|
||||||
|
});
|
||||||
|
|
||||||
|
opts.set_block_based_table_factory(&table_opts);
|
||||||
|
opts.set_env(env);
|
||||||
|
opts
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Adjust options for the specific column by name. Provide the result of
|
||||||
|
/// db_options() as the argument to this function and use the return value in
|
||||||
|
/// the arguments to open the specific column.
|
||||||
|
pub(crate) fn cf_options(name: &str, mut opts: Options, config: &Config) -> Options {
|
||||||
|
match name {
|
||||||
|
"backupid_algorithm"
|
||||||
|
| "backupid_etag"
|
||||||
|
| "backupkeyid_backup"
|
||||||
|
| "roomid_shortroomid"
|
||||||
|
| "shorteventid_shortstatehash"
|
||||||
|
| "shorteventid_eventid"
|
||||||
|
| "shortstatekey_statekey"
|
||||||
|
| "shortstatehash_statediff"
|
||||||
|
| "userdevicetxnid_response"
|
||||||
|
| "userfilterid_filter" => set_for_sequential_small_uc(&mut opts, config),
|
||||||
|
&_ => {},
|
||||||
|
}
|
||||||
|
|
||||||
|
opts
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_logging_defaults(opts: &mut Options, config: &Config) {
|
||||||
|
let rocksdb_log_level = match config.rocksdb_log_level.as_ref() {
|
||||||
|
"debug" => LogLevel::Debug,
|
||||||
|
"info" => LogLevel::Info,
|
||||||
|
"warn" => LogLevel::Warn,
|
||||||
|
"fatal" => LogLevel::Fatal,
|
||||||
|
_ => LogLevel::Error,
|
||||||
|
};
|
||||||
|
|
||||||
|
opts.set_log_level(rocksdb_log_level);
|
||||||
|
opts.set_max_log_file_size(config.rocksdb_max_log_file_size);
|
||||||
|
opts.set_log_file_time_to_roll(config.rocksdb_log_time_to_roll);
|
||||||
|
opts.set_keep_log_file_num(config.rocksdb_max_log_files);
|
||||||
|
opts.set_stats_dump_period_sec(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_compression_defaults(opts: &mut Options, config: &Config) {
|
||||||
|
let rocksdb_compression_algo = match config.rocksdb_compression_algo.as_ref() {
|
||||||
|
"zlib" => DBCompressionType::Zlib,
|
||||||
|
"lz4" => DBCompressionType::Lz4,
|
||||||
|
"bz2" => DBCompressionType::Bz2,
|
||||||
|
_ => DBCompressionType::Zstd,
|
||||||
|
};
|
||||||
|
|
||||||
|
if config.rocksdb_bottommost_compression {
|
||||||
|
opts.set_bottommost_compression_type(rocksdb_compression_algo);
|
||||||
|
opts.set_bottommost_zstd_max_train_bytes(0, true);
|
||||||
|
|
||||||
|
// -14 w_bits is only read by zlib.
|
||||||
|
opts.set_bottommost_compression_options(-14, config.rocksdb_bottommost_compression_level, 0, 0, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
// -14 w_bits is only read by zlib.
|
||||||
|
opts.set_compression_options(-14, config.rocksdb_compression_level, 0, 0);
|
||||||
|
opts.set_compression_type(rocksdb_compression_algo);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_for_random_small_uc(opts: &mut Options, config: &Config) {
|
||||||
|
let uco = uc_options(config);
|
||||||
|
set_for_random_small(opts, config);
|
||||||
|
opts.set_universal_compaction_options(&uco);
|
||||||
|
opts.set_compaction_style(DBCompactionStyle::Universal);
|
||||||
|
opts.set_level_zero_file_num_compaction_trigger(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn set_for_sequential_small_uc(opts: &mut Options, config: &Config) {
|
||||||
|
let uco = uc_options(config);
|
||||||
|
set_for_sequential_small(opts, config);
|
||||||
|
opts.set_universal_compaction_options(&uco);
|
||||||
|
opts.set_compaction_style(DBCompactionStyle::Universal);
|
||||||
|
opts.set_level_zero_file_num_compaction_trigger(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Apply the random-access profile, then shrink the sizes for small
/// columns: 1 MiB write buffer, 64 KiB target file size and 128 KiB level
/// base.
fn set_for_random_small(opts: &mut Options, config: &Config) {
    set_for_random(opts, config);

    opts.set_write_buffer_size(1 << 20);
    opts.set_target_file_size_base(64 * 1024);
    opts.set_max_bytes_for_level_base(128 * 1024);
}
|
||||||
|
|
||||||
|
fn set_for_sequential_small(opts: &mut Options, config: &Config) {
|
||||||
|
set_for_random(opts, config);
|
||||||
|
|
||||||
|
opts.set_write_buffer_size(1024 * 1024);
|
||||||
|
opts.set_target_file_size_base(65536);
|
||||||
|
opts.set_max_bytes_for_level_base(131072);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Profile for randomly accessed columns: level defaults, the
/// `kOldestSmallestSeqFirst` compaction priority, an 8 MiB level base and a
/// flat level multiplier with per-level additional multipliers.
///
/// # Panics
///
/// Panics if the compaction priority option string is rejected.
fn set_for_random(opts: &mut Options, config: &Config) {
    set_level_defaults(opts, config);

    opts.set_options_from_string("compaction_pri=kOldestSmallestSeqFirst")
        .expect("set compaction priority string");

    let additional = [0, 1, 1, 3, 7, 15, 31];
    opts.set_max_bytes_for_level_base(8 << 20);
    opts.set_max_bytes_for_level_multiplier(1.0);
    opts.set_max_bytes_for_level_multiplier_additional(&additional);
}
|
||||||
|
|
||||||
|
/// Profile for sequentially accessed columns: level defaults, the
/// `kOldestLargestSeqFirst` compaction priority, 2 MiB target files that
/// double per level, a 32 MiB level base and a flat level multiplier with
/// per-level additional multipliers.
///
/// # Panics
///
/// Panics if the compaction priority option string is rejected.
fn set_for_sequential(opts: &mut Options, config: &Config) {
    set_level_defaults(opts, config);

    opts.set_options_from_string("compaction_pri=kOldestLargestSeqFirst")
        .expect("set compaction priority string");

    opts.set_target_file_size_base(2 << 20);
    opts.set_target_file_size_multiplier(2);

    let additional = [0, 1, 1, 3, 7, 15, 31];
    opts.set_max_bytes_for_level_base(32 << 20);
    opts.set_max_bytes_for_level_multiplier(1.0);
    opts.set_max_bytes_for_level_multiplier_additional(&additional);
}
|
||||||
|
|
||||||
|
/// Baseline level sizing shared by the profiles: 1 MiB target files that
/// double per level, dynamic level bytes disabled, and an 8 MiB level base
/// that doubles per level. `_config` is currently unused.
fn set_level_defaults(opts: &mut Options, _config: &Config) {
    // Per-file sizing
    opts.set_target_file_size_base(1 << 20);
    opts.set_target_file_size_multiplier(2);

    // Per-level sizing
    opts.set_level_compaction_dynamic_level_bytes(false);
    opts.set_max_bytes_for_level_base(8 << 20);
    opts.set_max_bytes_for_level_multiplier(2.0);
}
|
||||||
|
|
||||||
|
fn uc_options(_config: &Config) -> UniversalCompactOptions {
|
||||||
|
let mut opts = UniversalCompactOptions::default();
|
||||||
|
|
||||||
|
opts.set_stop_style(UniversalCompactionStopStyle::Total);
|
||||||
|
opts.set_max_size_amplification_percent(10000);
|
||||||
|
opts.set_compression_size_percent(-1);
|
||||||
|
opts.set_size_ratio(1);
|
||||||
|
|
||||||
|
opts.set_min_merge_width(2);
|
||||||
|
opts.set_max_merge_width(16);
|
||||||
|
|
||||||
|
opts
|
||||||
|
}
|
||||||
|
|
||||||
|
fn table_options(_config: &Config) -> BlockBasedOptions {
|
||||||
|
let mut opts = BlockBasedOptions::default();
|
||||||
|
|
||||||
|
opts.set_block_size(4 * 1024);
|
||||||
|
opts.set_metadata_block_size(4 * 1024);
|
||||||
|
|
||||||
|
opts.set_bloom_filter(9.6, true);
|
||||||
|
opts.set_optimize_filters_for_memory(true);
|
||||||
|
opts.set_cache_index_and_filter_blocks(true);
|
||||||
|
opts.set_pin_top_level_index_and_filter(true);
|
||||||
|
|
||||||
|
opts
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue