millelibri/crates/zlib-searcher-core/src/tokenizer.rs

19 lines
573 B
Rust

use std::sync::Arc;
use cang_jie::{CangJieTokenizer, TokenizerOption};
use jieba_rs::Jieba;
use tantivy::tokenizer::{AsciiFoldingFilter, LowerCaser, RemoveLongFilter, TextAnalyzer};
/// String identifier for the metadata tokenizer.
///
/// NOTE(review): presumably the name under which the analyzer built by
/// [`get_tokenizer`] is registered with the index's tokenizer manager and
/// referenced from the schema — confirm against the callers.
pub const META_DATA_TOKENIZER: &str = "meta_data_tokenizer";
/// Builds the [`TextAnalyzer`] used for tokenizing text.
///
/// The analyzer wraps a [`CangJieTokenizer`] backed by a freshly constructed
/// [`Jieba`] instance, configured with `TokenizerOption::ForSearch` and
/// `hmm: false`. Its output then passes through three filters, in this order:
///
/// 1. [`RemoveLongFilter`] with a limit of `20`,
/// 2. [`AsciiFoldingFilter`],
/// 3. [`LowerCaser`].
///
/// A new `Jieba` is created on every call, so callers that need the analyzer
/// in several places may prefer to build it once and clone it.
pub fn get_tokenizer() -> TextAnalyzer {
    // Segmenter backend, shared behind an `Arc` as `CangJieTokenizer` expects.
    let worker = Arc::new(Jieba::new());
    let option = TokenizerOption::ForSearch { hmm: false };
    let segmenter = CangJieTokenizer { worker, option };

    // Apply the filters one at a time; the order matters and mirrors the
    // pipeline described above.
    let analyzer = TextAnalyzer::from(segmenter);
    let analyzer = analyzer.filter(RemoveLongFilter::limit(20));
    let analyzer = analyzer.filter(AsciiFoldingFilter);
    analyzer.filter(LowerCaser)
}