factor search tokenization out into a function

This ensures that the tokenization algorithm stays in sync across
querying, indexing, and deindexing. The existing code behaved slightly
differently when querying, because it did not discard words longer than
50 bytes. This was inconsequential, because tokens longer than 50 bytes
are never present in the index.

Signed-off-by: strawberry <strawberry@puppygock.gay>
This commit is contained in:
Benjamin Lee 2024-06-12 00:22:49 -04:00 committed by June 🍓🦴
parent 81cd677b4e
commit 20a54aacd6

View file

@ -10,14 +10,20 @@ pub trait Data: Send + Sync {
fn search_pdus<'a>(&'a self, room_id: &RoomId, search_string: &str) -> SearchPdusResult<'a>; fn search_pdus<'a>(&'a self, room_id: &RoomId, search_string: &str) -> SearchPdusResult<'a>;
} }
impl Data for KeyValueDatabase { /// Splits a string into tokens used as keys in the search inverted index
fn index_pdu(&self, shortroomid: u64, pdu_id: &[u8], message_body: &str) -> Result<()> { ///
let mut batch = message_body /// This may be used to tokenize either message bodies (for indexing) or search
.split_terminator(|c: char| !c.is_alphanumeric()) /// queries (for querying).
fn tokenize(body: &str) -> impl Iterator<Item = String> + '_ {
body.split_terminator(|c: char| !c.is_alphanumeric())
.filter(|s| !s.is_empty()) .filter(|s| !s.is_empty())
.filter(|word| word.len() <= 50) .filter(|word| word.len() <= 50)
.map(str::to_lowercase) .map(str::to_lowercase)
.map(|word| { }
impl Data for KeyValueDatabase {
fn index_pdu(&self, shortroomid: u64, pdu_id: &[u8], message_body: &str) -> Result<()> {
let mut batch = tokenize(message_body).map(|word| {
let mut key = shortroomid.to_be_bytes().to_vec(); let mut key = shortroomid.to_be_bytes().to_vec();
key.extend_from_slice(word.as_bytes()); key.extend_from_slice(word.as_bytes());
key.push(0xFF); key.push(0xFF);
@ -37,11 +43,7 @@ impl Data for KeyValueDatabase {
.to_be_bytes() .to_be_bytes()
.to_vec(); .to_vec();
let words: Vec<_> = search_string let words: Vec<_> = tokenize(search_string).collect();
.split_terminator(|c: char| !c.is_alphanumeric())
.filter(|s| !s.is_empty())
.map(str::to_lowercase)
.collect();
let iterators = words.clone().into_iter().map(move |word| { let iterators = words.clone().into_iter().map(move |word| {
let mut prefix2 = prefix.clone(); let mut prefix2 = prefix.clone();