improvement: make better use of sqlite connections

Timo Kösters 2021-08-02 10:13:34 +02:00
parent 2c4f966d60
commit bd63797213
31 changed files with 422 additions and 568 deletions


@@ -28,20 +28,20 @@ pub trait Tree: Send + Sync {
fn remove(&self, key: &[u8]) -> Result<()>;
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + Send + 'a>;
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a>;
fn iter_from<'a>(
&'a self,
from: &[u8],
backwards: bool,
) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + Send + 'a>;
) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a>;
fn increment(&self, key: &[u8]) -> Result<Vec<u8>>;
fn scan_prefix<'a>(
&'a self,
prefix: Vec<u8>,
) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + Send + 'a>;
) -> Box<dyn Iterator<Item = (Vec<u8>, Vec<u8>)> + 'a>;
fn watch_prefix<'a>(&'a self, prefix: &[u8]) -> Pin<Box<dyn Future<Output = ()> + Send + 'a>>;
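Note: dropping `Send` from these boxed iterators is the API-level consequence of the new sqlite backend below, which returns iterators that borrow a prepared statement on the calling thread instead of streaming rows over a channel from a worker thread. A minimal sketch of what callers can still do (the helper is hypothetical, not from this diff):

    // Without `Send`, the iterator must be drained on the thread that created
    // it; collecting into an owned Vec restores `Send` for the result.
    fn collect_all(tree: &dyn Tree) -> Vec<(Vec<u8>, Vec<u8>)> {
        tree.iter().collect()
    }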


@@ -81,7 +81,7 @@ impl EngineTree {
let (s, r) = bounded::<TupleOfBytes>(100);
let engine = Arc::clone(&self.engine);
let lock = self.engine.iter_pool.lock().unwrap();
let lock = self.engine.iter_pool.lock().await;
if lock.active_count() < lock.max_count() {
lock.execute(move || {
iter_from_thread_work(tree, &engine.env.read_txn().unwrap(), from, backwards, &s);
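Note: `.lock().await` suggests the heed engine's `iter_pool` mutex became an async (tokio-style) mutex, so a runtime thread yields instead of blocking while the pool is busy; the import change is not visible in this hunk, so this reading is an inference.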


@@ -1,133 +1,61 @@
use super::{DatabaseEngine, Tree};
use crate::{database::Config, Result};
use crossbeam::channel::{
bounded, unbounded, Receiver as ChannelReceiver, Sender as ChannelSender, TryRecvError,
};
use parking_lot::{Mutex, MutexGuard, RwLock};
use rusqlite::{Connection, DatabaseName::Main, OptionalExtension, Params};
use rusqlite::{Connection, DatabaseName::Main, OptionalExtension};
use std::{
cell::RefCell,
collections::HashMap,
future::Future,
ops::Deref,
path::{Path, PathBuf},
pin::Pin,
sync::Arc,
time::{Duration, Instant},
};
use threadpool::ThreadPool;
use tokio::sync::oneshot::Sender;
use tracing::{debug, warn};
struct Pool {
writer: Mutex<Connection>,
readers: Vec<Mutex<Connection>>,
spills: ConnectionRecycler,
spill_tracker: Arc<()>,
path: PathBuf,
}
pub const MILLI: Duration = Duration::from_millis(1);
enum HoldingConn<'a> {
FromGuard(MutexGuard<'a, Connection>),
FromRecycled(RecycledConn, Arc<()>),
thread_local! {
static READ_CONNECTION: RefCell<Option<&'static Connection>> = RefCell::new(None);
}
impl<'a> Deref for HoldingConn<'a> {
type Target = Connection;
struct PreparedStatementIterator<'a> {
pub iterator: Box<dyn Iterator<Item = TupleOfBytes> + 'a>,
pub statement_ref: NonAliasingBox<rusqlite::Statement<'a>>,
}
fn deref(&self) -> &Self::Target {
match self {
HoldingConn::FromGuard(guard) => guard.deref(),
HoldingConn::FromRecycled(conn, _) => conn.deref(),
}
impl Iterator for PreparedStatementIterator<'_> {
type Item = TupleOfBytes;
fn next(&mut self) -> Option<Self::Item> {
self.iterator.next()
}
}
struct ConnectionRecycler(ChannelSender<Connection>, ChannelReceiver<Connection>);
impl ConnectionRecycler {
fn new() -> Self {
let (s, r) = unbounded();
Self(s, r)
}
fn recycle(&self, conn: Connection) -> RecycledConn {
let sender = self.0.clone();
RecycledConn(Some(conn), sender)
}
fn try_take(&self) -> Option<Connection> {
match self.1.try_recv() {
Ok(conn) => Some(conn),
Err(TryRecvError::Empty) => None,
// as this is pretty impossible, a panic is warranted if it ever occurs
Err(TryRecvError::Disconnected) => panic!("Receiving channel was disconnected. As a sender is owned by the current struct, this should never happen(!!!)")
}
}
}
struct RecycledConn(
Option<Connection>, // To allow moving out of the struct when `Drop` is called.
ChannelSender<Connection>,
);
impl Deref for RecycledConn {
type Target = Connection;
fn deref(&self) -> &Self::Target {
self.0
.as_ref()
.expect("RecycledConn does not have a connection in Option<>")
}
}
impl Drop for RecycledConn {
struct NonAliasingBox<T>(*mut T);
impl<T> Drop for NonAliasingBox<T> {
fn drop(&mut self) {
if let Some(conn) = self.0.take() {
debug!("Recycled connection");
if let Err(e) = self.1.send(conn) {
warn!("Recycling a connection led to the following error: {:?}", e)
}
}
unsafe { Box::from_raw(self.0) };
}
}
impl Pool {
fn new<P: AsRef<Path>>(path: P, num_readers: usize, total_cache_size_mb: f64) -> Result<Self> {
// calculates cache-size per permanent connection
// 1. convert MB to KiB
// 2. divide by permanent connections
// 3. round down to nearest integer
let cache_size: u32 = ((total_cache_size_mb * 1024.0) / (num_readers + 1) as f64) as u32;
pub struct Engine {
writer: Mutex<Connection>,
let writer = Mutex::new(Self::prepare_conn(&path, Some(cache_size))?);
path: PathBuf,
cache_size_per_thread: u32,
}
let mut readers = Vec::new();
for _ in 0..num_readers {
readers.push(Mutex::new(Self::prepare_conn(&path, Some(cache_size))?))
}
Ok(Self {
writer,
readers,
spills: ConnectionRecycler::new(),
spill_tracker: Arc::new(()),
path: path.as_ref().to_path_buf(),
})
}
fn prepare_conn<P: AsRef<Path>>(path: P, cache_size: Option<u32>) -> Result<Connection> {
let conn = Connection::open(path)?;
impl Engine {
fn prepare_conn(path: &Path, cache_size_kb: u32) -> Result<Connection> {
let conn = Connection::open(&path)?;
conn.pragma_update(Some(Main), "page_size", &32768)?;
conn.pragma_update(Some(Main), "journal_mode", &"WAL")?;
conn.pragma_update(Some(Main), "synchronous", &"NORMAL")?;
if let Some(cache_kib) = cache_size {
conn.pragma_update(Some(Main), "cache_size", &(-i64::from(cache_kib)))?;
}
conn.pragma_update(Some(Main), "cache_size", &(-i64::from(cache_size_kb)))?;
conn.pragma_update(Some(Main), "wal_autocheckpoint", &0)?;
Ok(conn)
}
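For context on the pragmas in `prepare_conn`: `journal_mode=WAL` lets readers run concurrently with the single writer, `synchronous=NORMAL` is the usual durability/speed trade-off under WAL, a negative `cache_size` tells SQLite to interpret the value in KiB rather than pages, and `wal_autocheckpoint=0` disables automatic checkpoints so the WAL only shrinks when `flush_wal` below truncates it explicitly. A hedged, standalone sketch of reading one setting back through rusqlite:

    // Verify the journal mode that prepare_conn configured; `conn` is assumed
    // to come from Engine::prepare_conn.
    let mode: String = conn.query_row("PRAGMA journal_mode", [], |row| row.get(0))?;
    assert_eq!(mode, "wal");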
@@ -136,68 +64,52 @@ impl Pool {
self.writer.lock()
}
fn read_lock(&self) -> HoldingConn<'_> {
// First try to get a connection from the permanent pool
for r in &self.readers {
if let Some(reader) = r.try_lock() {
return HoldingConn::FromGuard(reader);
fn read_lock(&self) -> &'static Connection {
READ_CONNECTION.with(|cell| {
let connection = &mut cell.borrow_mut();
if (*connection).is_none() {
let c = Box::leak(Box::new(
Self::prepare_conn(&self.path, self.cache_size_per_thread).unwrap(),
));
**connection = Some(c);
}
}
debug!("read_lock: All permanent readers locked, obtaining spillover reader...");
// We didn't get a connection from the permanent pool, so we'll dumpster-dive for recycled connections.
// Either we have a connection or we don't; if we don't, we make a new one.
let conn = match self.spills.try_take() {
Some(conn) => conn,
None => {
debug!("read_lock: No recycled connections left, creating new one...");
Self::prepare_conn(&self.path, None).unwrap()
}
};
// Clone the spill Arc to mark how many spilled connections actually exist.
let spill_arc = Arc::clone(&self.spill_tracker);
// Get a sense of how many connections exist now.
let now_count = Arc::strong_count(&spill_arc) - 1 /* because one is held by the pool */;
// If the spillover readers are more than the number of total readers, there might be a problem.
if now_count > self.readers.len() {
warn!(
"Database is under high load. Consider increasing sqlite_read_pool_size ({} spillover readers exist)",
now_count
);
}
// Return the recyclable connection.
HoldingConn::FromRecycled(self.spills.recycle(conn), spill_arc)
connection.unwrap()
})
}
}
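The new `read_lock` replaces the whole reader pool: each OS thread lazily opens its own connection, leaks it via `Box::leak` to obtain a `&'static Connection`, and caches it in a `thread_local!`. Leaked connections are never freed, which is acceptable here because the runtime's worker threads live as long as the process. The same shape with a plain type, as a standalone sketch (names are illustrative):

    use std::cell::RefCell;

    thread_local! {
        // One leaked instance per OS thread, created on first access.
        static LOCAL: RefCell<Option<&'static String>> = RefCell::new(None);
    }

    fn local_instance() -> &'static String {
        LOCAL.with(|cell| {
            let mut slot = cell.borrow_mut();
            if slot.is_none() {
                *slot = Some(Box::leak(Box::new(String::from("per-thread"))));
            }
            slot.unwrap() // Option<&T> is Copy, so this copies rather than moves
        })
    }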
pub struct Engine {
pool: Pool,
iter_pool: Mutex<ThreadPool>,
pub fn flush_wal(self: &Arc<Self>) -> Result<()> {
self.write_lock()
.pragma_update(Some(Main), "wal_checkpoint", &"TRUNCATE")?;
Ok(())
}
}
impl DatabaseEngine for Engine {
fn open(config: &Config) -> Result<Arc<Self>> {
let pool = Pool::new(
Path::new(&config.database_path).join("conduit.db"),
config.sqlite_read_pool_size,
config.db_cache_capacity_mb,
)?;
let path = Path::new(&config.database_path).join("conduit.db");
// calculates cache-size per permanent connection
// 1. convert MB to KiB
// 2. divide by permanent connections
// 3. round down to nearest integer
let cache_size_per_thread: u32 =
((config.db_cache_capacity_mb * 1024.0) / (num_cpus::get().max(1) + 1) as f64) as u32;
let writer = Mutex::new(Self::prepare_conn(&path, cache_size_per_thread)?);
let arc = Arc::new(Engine {
pool,
iter_pool: Mutex::new(ThreadPool::new(10)),
writer,
path,
cache_size_per_thread,
});
Ok(arc)
}
fn open_tree(self: &Arc<Self>, name: &str) -> Result<Arc<dyn Tree>> {
self.pool.write_lock().execute(&format!("CREATE TABLE IF NOT EXISTS {} ( \"key\" BLOB PRIMARY KEY, \"value\" BLOB NOT NULL )", name), [])?;
self.write_lock().execute(&format!("CREATE TABLE IF NOT EXISTS {} ( \"key\" BLOB PRIMARY KEY, \"value\" BLOB NOT NULL )", name), [])?;
Ok(Arc::new(SqliteTable {
engine: Arc::clone(self),
@@ -212,31 +124,6 @@ impl DatabaseEngine for Engine {
}
}
impl Engine {
pub fn flush_wal(self: &Arc<Self>) -> Result<()> {
self.pool.write_lock().pragma_update(Some(Main), "wal_checkpoint", &"RESTART")?;
Ok(())
}
// Reaps (at most) (.len() * `fraction`) (rounded down, min 1) connections.
pub fn reap_spillover_by_fraction(&self, fraction: f64) {
let mut reaped = 0;
let spill_amount = self.pool.spills.1.len() as f64;
let fraction = fraction.clamp(0.01, 1.0);
let amount = (spill_amount * fraction).max(1.0) as u32;
for _ in 0..amount {
if self.pool.spills.try_take().is_some() {
reaped += 1;
}
}
debug!("Reaped {} connections", reaped);
}
}
pub struct SqliteTable {
engine: Arc<Engine>,
name: String,
@@ -258,7 +145,7 @@ impl SqliteTable {
fn insert_with_guard(&self, guard: &Connection, key: &[u8], value: &[u8]) -> Result<()> {
guard.execute(
format!(
"INSERT INTO {} (key, value) VALUES (?, ?) ON CONFLICT(key) DO UPDATE SET value = excluded.value",
"INSERT OR REPLACE INTO {} (key, value) VALUES (?, ?)",
self.name
)
.as_str(),
@@ -266,70 +153,17 @@ impl SqliteTable {
)?;
Ok(())
}
#[tracing::instrument(skip(self, sql, param))]
fn iter_from_thread(
&self,
sql: String,
param: Option<Vec<u8>>,
) -> Box<dyn Iterator<Item = TupleOfBytes> + Send + Sync> {
let (s, r) = bounded::<TupleOfBytes>(5);
let engine = Arc::clone(&self.engine);
let lock = self.engine.iter_pool.lock();
if lock.active_count() < lock.max_count() {
lock.execute(move || {
if let Some(param) = param {
iter_from_thread_work(&engine.pool.read_lock(), &s, &sql, [param]);
} else {
iter_from_thread_work(&engine.pool.read_lock(), &s, &sql, []);
}
});
} else {
std::thread::spawn(move || {
if let Some(param) = param {
iter_from_thread_work(&engine.pool.read_lock(), &s, &sql, [param]);
} else {
iter_from_thread_work(&engine.pool.read_lock(), &s, &sql, []);
}
});
}
Box::new(r.into_iter())
}
}
fn iter_from_thread_work<P>(
guard: &HoldingConn<'_>,
s: &ChannelSender<(Vec<u8>, Vec<u8>)>,
sql: &str,
params: P,
) where
P: Params,
{
for bob in guard
.prepare(sql)
.unwrap()
.query_map(params, |row| Ok((row.get_unwrap(0), row.get_unwrap(1))))
.unwrap()
.map(|r| r.unwrap())
{
if s.send(bob).is_err() {
return;
}
}
}
impl Tree for SqliteTable {
#[tracing::instrument(skip(self, key))]
fn get(&self, key: &[u8]) -> Result<Option<Vec<u8>>> {
self.get_with_guard(&self.engine.pool.read_lock(), key)
self.get_with_guard(&self.engine.read_lock(), key)
}
#[tracing::instrument(skip(self, key, value))]
fn insert(&self, key: &[u8], value: &[u8]) -> Result<()> {
let guard = self.engine.pool.write_lock();
let guard = self.engine.write_lock();
let start = Instant::now();
@@ -337,7 +171,7 @@ impl Tree for SqliteTable {
let elapsed = start.elapsed();
if elapsed > MILLI {
debug!("insert: took {:012?} : {}", elapsed, &self.name);
warn!("insert took {:?} : {}", elapsed, &self.name);
}
drop(guard);
@@ -369,7 +203,7 @@ impl Tree for SqliteTable {
#[tracing::instrument(skip(self, key))]
fn remove(&self, key: &[u8]) -> Result<()> {
let guard = self.engine.pool.write_lock();
let guard = self.engine.write_lock();
let start = Instant::now();
@@ -389,9 +223,28 @@ impl Tree for SqliteTable {
}
#[tracing::instrument(skip(self))]
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = TupleOfBytes> + Send + 'a> {
let name = self.name.clone();
self.iter_from_thread(format!("SELECT key, value FROM {}", name), None)
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = TupleOfBytes> + 'a> {
let guard = self.engine.read_lock();
let statement = Box::leak(Box::new(
guard
.prepare(&format!("SELECT key, value FROM {}", &self.name))
.unwrap(),
));
let statement_ref = NonAliasingBox(statement);
let iterator = Box::new(
statement
.query_map([], |row| Ok((row.get_unwrap(0), row.get_unwrap(1))))
.unwrap()
.map(|r| r.unwrap()),
);
Box::new(PreparedStatementIterator {
iterator,
statement_ref,
})
}
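`PreparedStatementIterator` works around a self-borrow: the row iterator borrows the `rusqlite::Statement` it was created from, and both must be returned together. Leaking the statement yields a `'static` borrow for the iterator, while `NonAliasingBox` keeps the raw pointer so the allocation is still freed. Field declaration order does the cleanup safely: `iterator` is declared before `statement_ref`, and Rust drops struct fields in declaration order, so the borrow ends before the statement's memory is reclaimed.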
#[tracing::instrument(skip(self, from, backwards))]
@@ -399,31 +252,61 @@ impl Tree for SqliteTable {
&'a self,
from: &[u8],
backwards: bool,
) -> Box<dyn Iterator<Item = TupleOfBytes> + Send + 'a> {
let name = self.name.clone();
) -> Box<dyn Iterator<Item = TupleOfBytes> + 'a> {
let guard = self.engine.read_lock();
let from = from.to_vec(); // TODO change interface?
if backwards {
self.iter_from_thread(
format!(
"SELECT key, value FROM {} WHERE key <= ? ORDER BY key DESC",
name
),
Some(from),
)
let statement = Box::leak(Box::new(
guard
.prepare(&format!(
"SELECT key, value FROM {} WHERE key <= ? ORDER BY key DESC",
&self.name
))
.unwrap(),
));
let statement_ref = NonAliasingBox(statement);
let iterator = Box::new(
statement
.query_map([from], |row| Ok((row.get_unwrap(0), row.get_unwrap(1))))
.unwrap()
.map(|r| r.unwrap()),
);
Box::new(PreparedStatementIterator {
iterator,
statement_ref,
})
} else {
self.iter_from_thread(
format!(
"SELECT key, value FROM {} WHERE key >= ? ORDER BY key ASC",
name
),
Some(from),
)
let statement = Box::leak(Box::new(
guard
.prepare(&format!(
"SELECT key, value FROM {} WHERE key >= ? ORDER BY key ASC",
&self.name
))
.unwrap(),
));
let statement_ref = NonAliasingBox(statement);
let iterator = Box::new(
statement
.query_map([from], |row| Ok((row.get_unwrap(0), row.get_unwrap(1))))
.unwrap()
.map(|r| r.unwrap()),
);
Box::new(PreparedStatementIterator {
iterator,
statement_ref,
})
}
}
#[tracing::instrument(skip(self, key))]
fn increment(&self, key: &[u8]) -> Result<Vec<u8>> {
let guard = self.engine.pool.write_lock();
let guard = self.engine.write_lock();
let start = Instant::now();
@@ -445,10 +328,7 @@ impl Tree for SqliteTable {
}
#[tracing::instrument(skip(self, prefix))]
fn scan_prefix<'a>(
&'a self,
prefix: Vec<u8>,
) -> Box<dyn Iterator<Item = TupleOfBytes> + Send + 'a> {
fn scan_prefix<'a>(&'a self, prefix: Vec<u8>) -> Box<dyn Iterator<Item = TupleOfBytes> + 'a> {
// let name = self.name.clone();
// self.iter_from_thread(
// format!(
@@ -483,25 +363,9 @@ impl Tree for SqliteTable {
fn clear(&self) -> Result<()> {
debug!("clear: running");
self.engine
.pool
.write_lock()
.execute(format!("DELETE FROM {}", self.name).as_str(), [])?;
debug!("clear: ran");
Ok(())
}
}
// TODO
// struct Pool<const NUM_READERS: usize> {
// writer: Mutex<Connection>,
// readers: [Mutex<Connection>; NUM_READERS],
// }
// // then, to pick a reader:
// for r in &pool.readers {
// if let Ok(reader) = r.try_lock() {
// // use reader
// }
// }
// // none unlocked, pick the next reader
// pool.readers[pool.counter.fetch_add(1, Relaxed) % NUM_READERS].lock()
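Taken together, this file swaps the reader pool, spillover recycler, and iterator thread pool for a much smaller design: one mutex-guarded writer plus one lazily created reader per thread, with iterators running on the calling thread against a leaked prepared statement. `reap_spillover_by_fraction` disappears along with the recycler it maintained, and the `Pool<const NUM_READERS>` idea sketched in the trailing comment is obsolete for the same reason.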


@@ -49,22 +49,23 @@ impl Appservice {
)
}
pub fn iter_ids(&self) -> Result<impl Iterator<Item = Result<String>> + Send + '_> {
pub fn iter_ids(&self) -> Result<impl Iterator<Item = Result<String>> + '_> {
Ok(self.id_appserviceregistrations.iter().map(|(id, _)| {
utils::string_from_bytes(&id)
.map_err(|_| Error::bad_database("Invalid id bytes in id_appserviceregistrations."))
}))
}
pub fn iter_all(
&self,
) -> Result<impl Iterator<Item = Result<(String, serde_yaml::Value)>> + '_ + Send> {
Ok(self.iter_ids()?.filter_map(|id| id.ok()).map(move |id| {
Ok((
id.clone(),
self.get_registration(&id)?
.expect("iter_ids only returns appservices that exist"),
))
}))
pub fn all(&self) -> Result<Vec<(String, serde_yaml::Value)>> {
self.iter_ids()?
.filter_map(|id| id.ok())
.map(move |id| {
Ok((
id.clone(),
self.get_registration(&id)?
.expect("iter_ids only returns appservices that exist"),
))
})
.collect()
}
}
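`iter_all` becomes `all`, returning an owned `Vec` instead of a lazy iterator: tree iterators are no longer `Send`, so collecting eagerly is what keeps the result usable across `.await` points. A hedged usage sketch (the `db` handle is assumed, mirroring the rooms change at the end of this commit):

    // The Vec is owned and Send, so it can safely outlive the read connection
    // and be held across .await points.
    for (id, registration) in db.appservice.all()? {
        println!("appservice {}: {:?}", id, registration);
    }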


@@ -15,7 +15,7 @@ use std::{
sync::{Arc, RwLock},
time::{Duration, Instant},
};
use tokio::sync::{broadcast, watch::Receiver, Mutex, Semaphore};
use tokio::sync::{broadcast, watch::Receiver, Mutex as TokioMutex, Semaphore};
use tracing::{error, info};
use trust_dns_resolver::TokioAsyncResolver;
@@ -45,8 +45,8 @@ pub struct Globals {
pub bad_signature_ratelimiter: Arc<RwLock<HashMap<Vec<String>, RateLimitState>>>,
pub servername_ratelimiter: Arc<RwLock<HashMap<Box<ServerName>, Arc<Semaphore>>>>,
pub sync_receivers: RwLock<HashMap<(UserId, Box<DeviceId>), SyncHandle>>,
pub roomid_mutex: RwLock<HashMap<RoomId, Arc<Mutex<()>>>>,
pub roomid_mutex_federation: RwLock<HashMap<RoomId, Arc<Mutex<()>>>>, // this lock will be held longer
pub roomid_mutex: RwLock<HashMap<RoomId, Arc<TokioMutex<()>>>>,
pub roomid_mutex_federation: RwLock<HashMap<RoomId, Arc<TokioMutex<()>>>>, // this lock will be held longer
pub rotate: RotationHandler,
}


@@ -101,8 +101,8 @@ impl Media {
prefix.extend_from_slice(&0_u32.to_be_bytes()); // Height = 0 if it's not a thumbnail
prefix.push(0xff);
let mut iter = self.mediaid_file.scan_prefix(prefix);
if let Some((key, _)) = iter.next() {
let first = self.mediaid_file.scan_prefix(prefix).next();
if let Some((key, _)) = first {
let path = globals.get_media_file(&key);
let mut file = Vec::new();
File::open(path).await?.read_to_end(&mut file).await?;
@@ -190,7 +190,9 @@ impl Media {
original_prefix.extend_from_slice(&0_u32.to_be_bytes()); // Height = 0 if it's not a thumbnail
original_prefix.push(0xff);
if let Some((key, _)) = self.mediaid_file.scan_prefix(thumbnail_prefix).next() {
let first_thumbnailprefix = self.mediaid_file.scan_prefix(thumbnail_prefix).next();
let first_originalprefix = self.mediaid_file.scan_prefix(original_prefix).next();
if let Some((key, _)) = first_thumbnailprefix {
// Using saved thumbnail
let path = globals.get_media_file(&key);
let mut file = Vec::new();
@@ -225,7 +227,7 @@ impl Media {
content_type,
file: file.to_vec(),
}))
} else if let Some((key, _)) = self.mediaid_file.scan_prefix(original_prefix).next() {
} else if let Some((key, _)) = first_originalprefix {
// Generate a thumbnail
let path = globals.get_media_file(&key);
let mut file = Vec::new();
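Both `scan_prefix` lookups are now advanced before the first `.await`: the returned iterators are no longer `Send`, so they must be created, stepped, and dropped synchronously for the surrounding async fn's future to remain `Send`. The shape of the fix, as a hedged sketch:

    // Drain the non-Send iterator before awaiting anything.
    let first = self.mediaid_file.scan_prefix(prefix).next();
    if let Some((key, _)) = first {
        let mut file = Vec::new();
        File::open(globals.get_media_file(&key))
            .await?
            .read_to_end(&mut file)
            .await?;
    }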


@@ -2,7 +2,6 @@ mod edus;
pub use edus::RoomEdus;
use member::MembershipState;
use tokio::sync::MutexGuard;
use crate::{pdu::PduBuilder, utils, Database, Error, PduEvent, Result};
use lru_cache::LruCache;
@@ -28,6 +27,7 @@ use std::{
mem,
sync::{Arc, Mutex},
};
use tokio::sync::MutexGuard;
use tracing::{debug, error, warn};
use super::{abstraction::Tree, admin::AdminCommand, pusher};
@@ -1496,7 +1496,7 @@ impl Rooms {
db.sending.send_pdu(&server, &pdu_id)?;
}
for appservice in db.appservice.iter_all()?.filter_map(|r| r.ok()) {
for appservice in db.appservice.all()? {
if let Some(namespaces) = appservice.1.get("namespaces") {
let users = namespaces
.get("users")