Jonathan de Jong 2021-07-14 07:07:08 +00:00 committed by Timo Kösters
parent bd4bd58612
commit 9d4fa9a220
49 changed files with 1525 additions and 681 deletions


@@ -0,0 +1,176 @@
use super::super::Config;
use super::{DatabaseEngine, Tree};
use crate::{utils, Result};
use std::{
    collections::BTreeMap,
    future::Future,
    pin::Pin,
    sync::{Arc, RwLock},
};
pub struct Engine(rocksdb::DBWithThreadMode<rocksdb::MultiThreaded>);
pub struct RocksDbEngineTree<'a> {
db: Arc<Engine>,
name: &'a str,
watchers: RwLock<BTreeMap<Vec<u8>, Vec<tokio::sync::oneshot::Sender<()>>>>,
}
impl DatabaseEngine for Engine {
fn open(config: &Config) -> Result<Arc<Self>> {
        let mut db_opts = rocksdb::Options::default();
        db_opts.create_if_missing(true);
        db_opts.set_max_open_files(16);
        db_opts.set_compaction_style(rocksdb::DBCompactionStyle::Level);
        db_opts.set_compression_type(rocksdb::DBCompressionType::Snappy);
        db_opts.set_target_file_size_base(256 << 20); // 256 MiB
        db_opts.set_write_buffer_size(256 << 20); // 256 MiB

        let mut block_based_options = rocksdb::BlockBasedOptions::default();
        block_based_options.set_block_size(512 << 10); // 512 KiB
        db_opts.set_block_based_table_factory(&block_based_options);
        // List the column families that already exist; a brand-new database
        // simply yields an empty list.
        let cfs = rocksdb::DBWithThreadMode::<rocksdb::MultiThreaded>::list_cf(
            &db_opts,
            &config.database_path,
        )
        .unwrap_or_default();

        // Reopen every existing column family with the "increment" merge
        // operator attached, matching open_tree below.
        let mut options = rocksdb::Options::default();
        options.set_merge_operator_associative("increment", utils::increment_rocksdb);
let db = rocksdb::DBWithThreadMode::<rocksdb::MultiThreaded>::open_cf_descriptors(
&db_opts,
&config.database_path,
cfs.iter()
.map(|name| rocksdb::ColumnFamilyDescriptor::new(name, options.clone())),
)?;
Ok(Arc::new(Engine(db)))
}
fn open_tree(self: &Arc<Self>, name: &'static str) -> Result<Arc<dyn Tree>> {
let mut options = rocksdb::Options::default();
options.set_merge_operator_associative("increment", utils::increment_rocksdb);
        // Create the column family if it doesn't exist yet; the error from
        // creating an already-existing one is safe to ignore.
        let _ = self.0.create_cf(name, &options);
Ok(Arc::new(RocksDbEngineTree {
name,
db: Arc::clone(self),
watchers: RwLock::new(BTreeMap::new()),
}))
}
    fn flush(self: &Arc<Self>) -> Result<()> {
        Ok(()) // noop: RocksDB flushes memtables in the background
    }
}
impl RocksDbEngineTree<'_> {
    fn cf(&self) -> rocksdb::BoundColumnFamily<'_> {
        // The column family was created in open_tree, so the handle exists.
        self.db.0.cf_handle(self.name).unwrap()
    }
}
impl Tree for RocksDbEngineTree<'_> {
fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
Ok(self.db.0.get_cf(self.cf(), key)?)
}
fn insert(&self, key: &[u8], value: &[u8]) -> Result<()> {
        let watchers = self.watchers.read().unwrap();
        let mut triggered = Vec::new();

        // Collect every watched prefix of this key (including the empty
        // prefix and the full key) under the cheap read lock first.
        for length in 0..=key.len() {
            if watchers.contains_key(&key[..length]) {
                triggered.push(&key[..length]);
            }
        }

        drop(watchers);
if !triggered.is_empty() {
let mut watchers = self.watchers.write().unwrap();
for prefix in triggered {
if let Some(txs) = watchers.remove(prefix) {
for tx in txs {
let _ = tx.send(());
}
}
}
}
Ok(self.db.0.put_cf(self.cf(), key, value)?)
}
fn remove(&self, key: &[u8]) -> Result<()> {
Ok(self.db.0.delete_cf(self.cf(), key)?)
}
    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + Send + Sync + 'a> {
        Box::new(
            self.db
                .0
                .iterator_cf(self.cf(), rocksdb::IteratorMode::Start)
                // Convert the boxed key/value slices into Vec<u8>.
                .map(|(k, v)| (Vec::from(k), Vec::from(v))),
        )
    }
fn iter_from<'a>(
&'a self,
from: &[u8],
backwards: bool,
) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a> {
        Box::new(
            self.db
                .0
                .iterator_cf(
                    self.cf(),
                    rocksdb::IteratorMode::From(
                        from,
                        if backwards {
                            rocksdb::Direction::Reverse
                        } else {
                            rocksdb::Direction::Forward
                        },
                    ),
                )
                // Convert the boxed key/value slices into Vec<u8>.
                .map(|(k, v)| (Vec::from(k), Vec::from(v))),
        )
}
    fn increment(&self, key: &[u8]) -> Result<Vec<u8>> {
        // Debug output: RocksDB memory usage statistics.
        let stats = rocksdb::perf::get_memory_usage_stats(Some(&[&self.db.0]), None).unwrap();
        dbg!(stats.mem_table_total);
        dbg!(stats.mem_table_unflushed);
        dbg!(stats.mem_table_readers_total);
        dbg!(stats.cache_total);

        // TODO: atomic? This is a non-atomic read-modify-write; the
        // "increment" merge operator registered in open could do it in one
        // step instead.
        let old = self.get(key)?;
        let new = utils::increment(old.as_deref()).unwrap();
        self.insert(key, &new)?;
        Ok(new)
    }
fn scan_prefix<'a>(
&'a self,
prefix: Vec<u8>,
) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + Send + 'a> {
        Box::new(
            self.db
                .0
                .iterator_cf(
                    self.cf(),
                    rocksdb::IteratorMode::From(&prefix, rocksdb::Direction::Forward),
                )
                .map(|(k, v)| (Vec::from(k), Vec::from(v)))
                // Stop at the first key that no longer shares the prefix.
                .take_while(move |(k, _)| k.starts_with(&prefix)),
        )
}
fn watch_prefix<'a>(&'a self, prefix: &[u8]) -> Pin<Box<dyn Future<Output = ()> + Send + 'a>> {
let (tx, rx) = tokio::sync::oneshot::channel();
self.watchers
.write()
.unwrap()
.entry(prefix.to_vec())
.or_default()
.push(tx);
Box::pin(async move {
// Tx is never destroyed
rx.await.unwrap();
})
}
}
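Both the "increment" merge operator and the read-modify-write fallback above delegate to utils::increment, which is not part of this diff. A minimal sketch of what such a helper could look like, assuming values are 8-byte big-endian counters (the encoding is an assumption, not taken from the commit):

use std::convert::TryFrom;

// Hypothetical sketch of utils::increment; not part of this commit.
// Treats the stored value as a big-endian u64 and restarts at 1 when the
// key is missing or malformed.
pub fn increment(old: Option<&[u8]>) -> Option<Vec<u8>> {
    let number = match old.map(<[u8; 8]>::try_from) {
        Some(Ok(bytes)) => u64::from_be_bytes(bytes) + 1,
        _ => 1,
    };
    Some(number.to_be_bytes().to_vec())
}

The utils::increment_rocksdb callback registered with set_merge_operator_associative would wrap the same logic in the signature rust-rocksdb expects.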


@@ -0,0 +1,119 @@
use super::super::Config;
use super::{DatabaseEngine, Tree};
use crate::{utils, Result};
use log::warn;
use std::{future::Future, pin::Pin, sync::Arc};
pub struct Engine(sled::Db);
pub struct SledEngineTree(sled::Tree);
impl DatabaseEngine for Engine {
fn open(config: &Config) -> Result<Arc<Self>> {
Ok(Arc::new(Engine(
sled::Config::default()
.path(&config.database_path)
.cache_capacity((config.db_cache_capacity_mb * 1024 * 1024) as u64)
.use_compression(true)
.open()?,
)))
}
fn open_tree(self: &Arc<Self>, name: &'static str) -> Result<Arc<dyn Tree>> {
Ok(Arc::new(SledEngineTree(self.0.open_tree(name)?)))
}
fn flush(self: &Arc<Self>) -> Result<()> {
Ok(()) // noop
}
}
impl Tree for SledEngineTree {
fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
Ok(self.0.get(key)?.map(|v| v.to_vec()))
}
fn insert(&self, key: &[u8], value: &[u8]) -> Result<()> {
self.0.insert(key, value)?;
Ok(())
}
fn remove(&self, key: &[u8]) -> Result<()> {
self.0.remove(key)?;
Ok(())
}
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + Send + 'a> {
Box::new(
self.0
.iter()
.filter_map(|r| {
if let Err(e) = &r {
warn!("Error: {}", e);
}
r.ok()
})
                .map(|(k, v)| (k.to_vec(), v.to_vec())),
)
}
fn iter_from(
&self,
from: &[u8],
backwards: bool,
) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + Send> {
        let iter = if backwards {
            // Inclusive bound so `from` itself is yielded first, matching
            // the `key <= ?` semantics of the other backends.
            self.0.range(..=from)
        } else {
            self.0.range(from..)
        };
let iter = iter
.filter_map(|r| {
if let Err(e) = &r {
warn!("Error: {}", e);
}
r.ok()
})
            .map(|(k, v)| (k.to_vec(), v.to_vec()));
if backwards {
Box::new(iter.rev())
} else {
Box::new(iter)
}
}
    fn increment(&self, key: &[u8]) -> Result<Vec<u8>> {
        // sled's update_and_fetch applies utils::increment atomically per key.
        Ok(self
            .0
            .update_and_fetch(key, utils::increment)
            .map(|o| o.expect("increment always sets a value").to_vec())?)
}
fn scan_prefix<'a>(
&'a self,
prefix: Vec<u8>,
) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + Send + 'a> {
let iter = self
.0
.scan_prefix(prefix)
.filter_map(|r| {
if let Err(e) = &r {
warn!("Error: {}", e);
}
r.ok()
})
            .map(|(k, v)| (k.to_vec(), v.to_vec()));
Box::new(iter)
}
fn watch_prefix<'a>(&'a self, prefix: &[u8]) -> Pin<Box<dyn Future<Output = ()> + Send + 'a>> {
let prefix = prefix.to_vec();
Box::pin(async move {
self.0.watch_prefix(prefix).await;
})
}
}
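All three backends hand back the same kind of watch_prefix future. A hedged caller-side sketch (the function and the 30-second timeout below are illustrative, not from this commit):

// Illustrative only: long-poll for a change under `prefix`, giving up
// after 30 seconds.
async fn wait_for_change(tree: &dyn Tree, prefix: &[u8]) -> bool {
    let watcher = tree.watch_prefix(prefix);
    tokio::time::timeout(std::time::Duration::from_secs(30), watcher)
        .await
        .is_ok() // true = something under the prefix was written
}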


@@ -0,0 +1,444 @@
use std::{
collections::BTreeMap,
future::Future,
ops::Deref,
path::{Path, PathBuf},
pin::Pin,
sync::Arc,
thread,
time::{Duration, Instant},
};
use crate::{database::Config, Result};
use super::{DatabaseEngine, Tree};
use log::debug;
use crossbeam::channel::{bounded, Sender as ChannelSender};
use parking_lot::{Mutex, MutexGuard, RwLock};
use rusqlite::{params, Connection, DatabaseName::Main, OptionalExtension};
use tokio::sync::oneshot::Sender;
// const SQL_CREATE_TABLE: &str =
// "CREATE TABLE IF NOT EXISTS {} {{ \"key\" BLOB PRIMARY KEY, \"value\" BLOB NOT NULL }}";
// const SQL_SELECT: &str = "SELECT value FROM {} WHERE key = ?";
// const SQL_INSERT: &str = "INSERT OR REPLACE INTO {} (key, value) VALUES (?, ?)";
// const SQL_DELETE: &str = "DELETE FROM {} WHERE key = ?";
// const SQL_SELECT_ITER: &str = "SELECT key, value FROM {}";
// const SQL_SELECT_PREFIX: &str = "SELECT key, value FROM {} WHERE key LIKE ?||'%' ORDER BY key ASC";
// const SQL_SELECT_ITER_FROM_FORWARDS: &str = "SELECT key, value FROM {} WHERE key >= ? ORDER BY key ASC";
// const SQL_SELECT_ITER_FROM_BACKWARDS: &str =
//     "SELECT key, value FROM {} WHERE key <= ? ORDER BY key DESC";
struct Pool {
writer: Mutex<Connection>,
readers: Vec<Mutex<Connection>>,
spill_tracker: Arc<()>,
path: PathBuf,
}
// Writes and queries slower than this are reported through debug!.
pub const MILLI: Duration = Duration::from_millis(1);
enum HoldingConn<'a> {
FromGuard(MutexGuard<'a, Connection>),
FromOwned(Connection, Arc<()>),
}
impl<'a> Deref for HoldingConn<'a> {
type Target = Connection;
fn deref(&self) -> &Self::Target {
match self {
HoldingConn::FromGuard(guard) => guard.deref(),
HoldingConn::FromOwned(conn, _) => conn,
}
}
}
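Deref is what lets call sites use either variant as a plain &Connection. An illustrative example (the helper below is hypothetical, not part of this commit):

// Hypothetical: deref coercion turns &HoldingConn<'_> into &Connection,
// whether it wraps a pooled guard or a spillover connection.
fn row_count(conn: &Connection, table: &str) -> rusqlite::Result<i64> {
    conn.query_row(&format!("SELECT COUNT(*) FROM {}", table), [], |row| row.get(0))
}
// let conn = pool.read_lock();        // HoldingConn<'_>
// let n = row_count(&conn, "_noop")?; // &HoldingConn -> &Connection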
impl Pool {
    fn new<P: AsRef<Path>>(path: P, num_readers: usize, total_cache_size_mb: f64) -> Result<Self> {
        // calculates cache-size per permanent connection
        // 1. convert MB to KiB
        // 2. divide by the number of permanent connections (readers + 1 writer)
        // 3. round down to nearest integer
        // e.g. 64 MB with 4 readers: 64 * 1024 / 5 = 13107 KiB per connection
        let cache_size: u32 = ((total_cache_size_mb * 1024.0) / (num_readers + 1) as f64) as u32;
let writer = Mutex::new(Self::prepare_conn(&path, Some(cache_size))?);
let mut readers = Vec::new();
for _ in 0..num_readers {
readers.push(Mutex::new(Self::prepare_conn(&path, Some(cache_size))?))
}
Ok(Self {
writer,
readers,
spill_tracker: Arc::new(()),
path: path.as_ref().to_path_buf(),
})
}
fn prepare_conn<P: AsRef<Path>>(path: P, cache_size: Option<u32>) -> Result<Connection> {
let conn = Connection::open(path)?;
conn.pragma_update(Some(Main), "journal_mode", &"WAL".to_owned())?;
// conn.pragma_update(Some(Main), "wal_autocheckpoint", &250)?;
// conn.pragma_update(Some(Main), "wal_checkpoint", &"FULL".to_owned())?;
conn.pragma_update(Some(Main), "synchronous", &"OFF".to_owned())?;
        if let Some(cache_kib) = cache_size {
            // A negative cache_size tells SQLite to interpret the value as
            // KiB rather than pages.
            conn.pragma_update(Some(Main), "cache_size", &(-Into::<i64>::into(cache_kib)))?;
        }
Ok(conn)
}
fn write_lock(&self) -> MutexGuard<'_, Connection> {
self.writer.lock()
}
fn read_lock(&self) -> HoldingConn<'_> {
        // Hand out any permanent reader that is currently idle.
        for r in &self.readers {
            if let Some(reader) = r.try_lock() {
                return HoldingConn::FromGuard(reader);
            }
        }

        // All readers are busy: spill over into a fresh connection. The
        // strong count of spill_tracker doubles as a live count of the
        // spillover connections.
        let spill_arc = self.spill_tracker.clone();
        let now_count = Arc::strong_count(&spill_arc) - 1 /* because one is held by the pool */;

        log::warn!("read_lock: all readers locked, creating spillover reader...");
        if now_count > 1 {
            log::warn!("read_lock: now {} spillover readers exist", now_count);
        }

        let spilled = Self::prepare_conn(&self.path, None).unwrap();
        HoldingConn::FromOwned(spilled, spill_arc)
}
}
pub struct Engine {
pool: Pool,
}
impl DatabaseEngine for Engine {
fn open(config: &Config) -> Result<Arc<Self>> {
let pool = Pool::new(
Path::new(&config.database_path).join("conduit.db"),
config.sqlite_read_pool_size,
config.db_cache_capacity_mb,
)?;
pool.write_lock()
.execute("CREATE TABLE IF NOT EXISTS _noop (\"key\" INT)", params![])?;
let arc = Arc::new(Engine { pool });
Ok(arc)
}
fn open_tree(self: &Arc<Self>, name: &str) -> Result<Arc<dyn Tree>> {
        self.pool.write_lock().execute(
            format!(
                "CREATE TABLE IF NOT EXISTS {} ( \"key\" BLOB PRIMARY KEY, \"value\" BLOB NOT NULL )",
                name
            )
            .as_str(),
            [],
        )?;
Ok(Arc::new(SqliteTable {
engine: Arc::clone(self),
name: name.to_owned(),
watchers: RwLock::new(BTreeMap::new()),
}))
}
    fn flush(self: &Arc<Self>) -> Result<()> {
        // With synchronous=FULL, committing this throwaway write to _noop
        // forces an fsync of the WAL, making all earlier writes durable.
        self.pool
            .write_lock()
            .execute_batch(
                "
                PRAGMA synchronous=FULL;
                BEGIN;
                DELETE FROM _noop;
                INSERT INTO _noop VALUES (1);
                COMMIT;
                PRAGMA synchronous=OFF;
                ",
            )
            .map_err(Into::into)
    }
}
impl Engine {
    pub fn flush_wal(self: &Arc<Self>) -> Result<()> {
        // Same durability trick as flush(), but additionally truncates the
        // WAL file after checkpointing it, then drops back to synchronous=OFF.
        self.pool
            .write_lock()
            .execute_batch(
                "
                PRAGMA synchronous=FULL; PRAGMA wal_checkpoint=TRUNCATE;
                BEGIN;
                DELETE FROM _noop;
                INSERT INTO _noop VALUES (1);
                COMMIT;
                PRAGMA wal_checkpoint=PASSIVE; PRAGMA synchronous=OFF;
                ",
            )
            .map_err(Into::into)
    }
}
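Nothing in this diff shows when flush_wal runs; a hedged sketch of periodic checkpointing from a background task (the 60-second interval is an assumption, not from this commit):

// Illustrative only: checkpoint the WAL on a timer.
fn spawn_wal_flusher(engine: Arc<Engine>) {
    tokio::spawn(async move {
        let mut interval = tokio::time::interval(Duration::from_secs(60));
        loop {
            interval.tick().await;
            if let Err(e) = engine.flush_wal() {
                log::warn!("wal flush failed: {}", e);
            }
        }
    });
}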
pub struct SqliteTable {
engine: Arc<Engine>,
name: String,
watchers: RwLock<BTreeMap<Vec<u8>, Vec<Sender<()>>>>,
}
type TupleOfBytes = (Vec<u8>, Vec<u8>);
impl SqliteTable {
fn get_with_guard(&self, guard: &Connection, key: &[u8]) -> Result<Option<Vec<u8>>> {
Ok(guard
.prepare(format!("SELECT value FROM {} WHERE key = ?", self.name).as_str())?
.query_row([key], |row| row.get(0))
.optional()?)
}
fn insert_with_guard(&self, guard: &Connection, key: &[u8], value: &[u8]) -> Result<()> {
guard.execute(
format!(
"INSERT INTO {} (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value",
self.name
)
.as_str(),
[key, value],
)?;
Ok(())
}
    fn _iter_from_thread<F>(&self, f: F) -> Box<dyn Iterator<Item = TupleOfBytes> + Send>
    where
        F: (for<'a> FnOnce(&'a Connection, ChannelSender<TupleOfBytes>)) + Send + 'static,
    {
        // Run the query on a worker thread and stream rows through a small
        // bounded channel; the capacity of 5 provides backpressure.
        let (s, r) = bounded::<TupleOfBytes>(5);

        let engine = self.engine.clone();

        thread::spawn(move || {
            let _ = f(&engine.pool.read_lock(), s);
        });

        Box::new(r.into_iter())
    }
}
macro_rules! iter_from_thread {
    ($self:expr, $sql:expr, $param:expr) => {
        $self._iter_from_thread(move |guard, s| {
            let _ = guard
                .prepare($sql)
                .unwrap()
                .query_map($param, |row| Ok((row.get_unwrap(0), row.get_unwrap(1))))
                .unwrap()
                .map(|r| r.unwrap())
                // A failed send means the consumer dropped the iterator, so
                // stop the query early.
                .try_for_each(|pair| s.send(pair));
        })
    };
}
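The worker-thread/bounded-channel pattern the macro builds on, shown as a self-contained sketch outside this codebase (names are illustrative):

// Illustrative only: stream values from a worker thread; dropping the
// iterator disconnects the channel, the next send fails, try_for_each
// bails out, and the thread exits.
use crossbeam::channel::bounded;
use std::thread;

fn streaming_iter() -> Box<dyn Iterator<Item = u64> + Send> {
    let (s, r) = bounded::<u64>(5);
    thread::spawn(move || {
        let _ = (0..1_000_000u64).try_for_each(|n| s.send(n));
    });
    Box::new(r.into_iter())
}

fn main() {
    let mut iter = streaming_iter();
    assert_eq!(iter.next(), Some(0));
    drop(iter); // worker unblocks on its next send and shuts down
}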
impl Tree for SqliteTable {
fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
let guard = self.engine.pool.read_lock();
// let start = Instant::now();
let val = self.get_with_guard(&guard, key);
// debug!("get: took {:?}", start.elapsed());
// debug!("get key: {:?}", &key)
val
}
fn insert(&self, key: &[u8], value: &[u8]) -> Result<()> {
let guard = self.engine.pool.write_lock();
let start = Instant::now();
self.insert_with_guard(&guard, key, value)?;
let elapsed = start.elapsed();
if elapsed > MILLI {
debug!("insert: took {:012?} : {}", elapsed, &self.name);
}
drop(guard);
        let watchers = self.watchers.read();
        let mut triggered = Vec::new();

        // Wake watchers on every prefix of this key, same scheme as the
        // RocksDB backend.
        for length in 0..=key.len() {
            if watchers.contains_key(&key[..length]) {
                triggered.push(&key[..length]);
            }
        }

        drop(watchers);

        if !triggered.is_empty() {
            let mut watchers = self.watchers.write();
            for prefix in triggered {
                if let Some(txs) = watchers.remove(prefix) {
                    for tx in txs {
                        let _ = tx.send(());
                    }
                }
            }
        }
Ok(())
}
fn remove(&self, key: &[u8]) -> Result<()> {
let guard = self.engine.pool.write_lock();
let start = Instant::now();
guard.execute(
format!("DELETE FROM {} WHERE key = ?", self.name).as_str(),
[key],
)?;
let elapsed = start.elapsed();
if elapsed > MILLI {
debug!("remove: took {:012?} : {}", elapsed, &self.name);
}
// debug!("remove key: {:?}", &key);
Ok(())
}
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = TupleOfBytes> + Send + 'a> {
let name = self.name.clone();
iter_from_thread!(
self,
format!("SELECT key, value FROM {}", name).as_str(),
params![]
)
}
fn iter_from<'a>(
&'a self,
from: &[u8],
backwards: bool,
) -> Box<dyn Iterator<Item = TupleOfBytes> + Send + 'a> {
let name = self.name.clone();
let from = from.to_vec(); // TODO change interface?
if backwards {
iter_from_thread!(
self,
format!(
"SELECT key, value FROM {} WHERE key <= ? ORDER BY key DESC",
name
)
.as_str(),
[from]
)
} else {
iter_from_thread!(
self,
format!(
"SELECT key, value FROM {} WHERE key >= ? ORDER BY key ASC",
name
)
.as_str(),
[from]
)
}
}
    fn increment(&self, key: &[u8]) -> Result<Vec<u8>> {
        let guard = self.engine.pool.write_lock();

        let start = Instant::now();

        // The read-modify-write is safe here: the single writer mutex
        // serializes all writes to the database.
        let old = self.get_with_guard(&guard, key)?;

        let new =
            crate::utils::increment(old.as_deref()).expect("utils::increment always returns Some");

        self.insert_with_guard(&guard, key, &new)?;
let elapsed = start.elapsed();
if elapsed > MILLI {
debug!("increment: took {:012?} : {}", elapsed, &self.name);
}
// debug!("increment key: {:?}", &key);
Ok(new)
}
fn scan_prefix<'a>(
&'a self,
prefix: Vec<u8>,
) -> Box<dyn Iterator<Item = TupleOfBytes> + Send + 'a> {
// let name = self.name.clone();
// iter_from_thread!(
// self,
// format!(
// "SELECT key, value FROM {} WHERE key BETWEEN ?1 AND ?1 || X'FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF' ORDER BY key ASC",
// name
// )
// .as_str(),
// [prefix]
// )
Box::new(
self.iter_from(&prefix, false)
.take_while(move |(key, _)| key.starts_with(&prefix)),
)
}
fn watch_prefix<'a>(&'a self, prefix: &[u8]) -> Pin<Box<dyn Future<Output = ()> + Send + 'a>> {
let (tx, rx) = tokio::sync::oneshot::channel();
self.watchers
.write()
.entry(prefix.to_vec())
.or_default()
.push(tx);
Box::pin(async move {
// Tx is never destroyed
rx.await.unwrap();
})
}
fn clear(&self) -> Result<()> {
debug!("clear: running");
self.engine
.pool
.write_lock()
.execute(format!("DELETE FROM {}", self.name).as_str(), [])?;
debug!("clear: ran");
Ok(())
}
}
// TODO
// struct Pool<const NUM_READERS: usize> {
// writer: Mutex<Connection>,
// readers: [Mutex<Connection>; NUM_READERS],
// }
// // then, to pick a reader:
// for r in &pool.readers {
// if let Ok(reader) = r.try_lock() {
// // use reader
// }
// }
// // none unlocked, pick the next reader
// pool.readers[pool.counter.fetch_add(1, Relaxed) % NUM_READERS].lock()
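A compilable sketch of that TODO, assuming parking_lot mutexes and a relaxed atomic round-robin counter (illustrative only; the commit keeps the Vec-based Pool above):

use parking_lot::{Mutex, MutexGuard};
use rusqlite::Connection;
use std::sync::atomic::{AtomicUsize, Ordering::Relaxed};

struct ArrayPool<const NUM_READERS: usize> {
    writer: Mutex<Connection>,
    readers: [Mutex<Connection>; NUM_READERS],
    counter: AtomicUsize,
}

impl<const NUM_READERS: usize> ArrayPool<NUM_READERS> {
    fn read_lock(&self) -> MutexGuard<'_, Connection> {
        // First pass: take any reader that is idle right now.
        for r in &self.readers {
            if let Some(reader) = r.try_lock() {
                return reader;
            }
        }
        // None idle: block on the next reader, round-robin.
        self.readers[self.counter.fetch_add(1, Relaxed) % NUM_READERS].lock()
    }
}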