inital commit (v0.3)

pull/2/head
lamacchinadesiderante 9 months ago
commit 26092fdc35

@ -0,0 +1,9 @@
index
.github
target
*.csv
.git
.vscode/*
.idea
.DS_Store

9
.gitignore vendored

@ -0,0 +1,9 @@
/target
/index
*.csv
/release
/zlib-searcher
.vscode
index_0.6.zip

4765
Cargo.lock generated

File diff suppressed because it is too large Load Diff

@ -0,0 +1,33 @@
[workspace]
members = [
"crates/zlib-searcher",
"crates/zlib-searcher-core",
"crates/zlib-searcher-desktop",
]
[workspace.package]
edition = "2021"
authors = ["zu1k <i@zu1k.com>"]
description = "search z-library index."
homepage = "https://github.com/zlib-searcher/zlib-searcher"
repository = "https://github.com/zlib-searcher/zlib-searcher"
license = "MIT"
exclude = [".github/", "index/", "frontend/"]
[profile.release]
strip = true
lto = true
opt-level = 3
codegen-units = 1
[workspace.dependencies]
anyhow = "1.0"
env_logger = "0.10"
log = "0.4"
serde = { version = "1.0", features = ["derive"] }
serde_with = "2.0"
zlib-searcher-core = { path = "crates/zlib-searcher-core" }

@ -0,0 +1,10 @@
[build.env]
passthrough = [
"RUSTFLAGS"
]
[target.mips-unknown-linux-musl]
image = "rustembedded/cross:mips-unknown-linux-musl-0.2.1"
[target.mipsel-unknown-linux-musl]
image = "rustembedded/cross:mipsel-unknown-linux-musl-0.2.1"

@ -0,0 +1,16 @@
FROM node:19-bullseye as frontend
COPY . /source
RUN cd /source/frontend && npm install && npm run build
FROM rust:1.65-buster as backend
COPY . /source
COPY --from=frontend /source/frontend/dist /source/frontend/dist
RUN cd /source && cargo build --release -p zlib-searcher
FROM ubuntu:22.04
COPY --from=backend /source/target/release/zlib-searcher /zlib-searcher
CMD ["/zlib-searcher", "run", "-b", "0.0.0.0:7070"]

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2022 zlib-searcher's authors
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -0,0 +1,27 @@
NAME=zlib-searcher
PREFIX ?= /usr/local/bin
TARGET ?= debug
# None of these targets produce a file with the target's name, so declare them
# phony; `releases` was missing and would be skipped if a `releases` path existed.
.PHONY: all frontend_preinstall frontend build clean releases
all: build
frontend_preinstall:
pnpm -C frontend install
frontend:
pnpm -C frontend run build
build: frontend
ifeq (${TARGET}, release)
cargo build -p zlib-searcher --release
else
cargo build -p zlib-searcher
endif
clean:
cargo clean
rm -rf release
releases:
cd scripts && ./build_release.sh -a a

@ -0,0 +1,131 @@
# Millelibri project
This is a fork of the zlib-searcher project. Future goals:
- improve search indexes (language)
- add books
- expand file types
# zlib(libgen) searcher
[![GitHub stars](https://img.shields.io/github/stars/zlib-searcher/zlib-searcher)](https://github.com/zlib-searcher/zlib-searcher/stargazers)
[![GitHub forks](https://img.shields.io/github/forks/zlib-searcher/zlib-searcher)](https://github.com/zlib-searcher/zlib-searcher/network)
[![Release](https://img.shields.io/github/release/zlib-searcher/zlib-searcher)](https://github.com/zlib-searcher/zlib-searcher/releases)
[![GitHub issues](https://img.shields.io/github/issues/zlib-searcher/zlib-searcher)](https://github.com/zlib-searcher/zlib-searcher/issues)
[![GitHub license](https://img.shields.io/github/license/zlib-searcher/zlib-searcher)](https://github.com/zlib-searcher/zlib-searcher/blob/master/LICENSE)
Search `zlib`/`libgen` index to get `ipfs_cid`.
We don't save and provide files, we provide search.
I hope everyone has a copy of the index locally, so that there is no need to rely on any centralized service.
## Deploy with Docker
```
git clone https://github.com/zlib-searcher/zlib-searcher.git && cd zlib-searcher
wget https://github.com/zlib-searcher/zlib-searcher/releases/download/0.6.0/index_0.6.zip && unzip index_0.6.zip
docker-compose up -d
```
Now `zlib-searcher` will listen on `0.0.0.0:7070`.
## Usage
### 1. Download the pre-compiled binary from [Release](https://github.com/zlib-searcher/zlib-searcher/releases).
Or you can compile by yourself. Refer to [Build from source](#build-from-source) for instructions.
### 2. Download the `index` file that has been created.
We will give the corresponding `index` download links for each version in the release page.
Or you can make your own via `zlib-searcher index`.
Extract the `index` folder to the same level as the program, it should look like the following:
```
zlib_searcher_dir
├── index
│   ├── some index files...
│   └── meta.json
└── zlib-searcher
```
### 3. Run `zlib-searcher run`, it will listen to `127.0.0.1:7070`.
Access http://127.0.0.1:7070/ to use webui, or you can use the original api.
#### original search api
You can search by the following fields:
- title
- author
- publisher
- extension
- language
- isbn
- zlib_id
Examples:
- `http://127.0.0.1:7070/search?limit=30&query=余华`
- `http://127.0.0.1:7070/search?limit=30&query=title:机器学习 extension:azw3 publisher:清华`
- `http://127.0.0.1:7070/search?limit=30&query=zlib_id:18557063`
- `http://127.0.0.1:7070/search?limit=30&query=isbn:9787302423287`
## Build from source
### 1. Build `zlib-searcher`
First build frontend
```bash
make frontend_preinstall frontend
```
Then build zlib-searcher
```bash
TARGET=release make
# move the compiled binary to the project root directory
mv target/release/zlib-searcher .
```
### 2. Build `index`
Download `zlib_index_books.csv.zip` and `libgen_index_books.csv.zip` and extract the `csv` files to the project root directory.
Then run `zlib-searcher index`. You may need to `rm index/*` first.
If you have other csv files, you can run `zlib-searcher index -f *.csv` to index them.
The final folder structure should look like this:
```
zlib_searcher_dir // in the example above, it is project root directory.
├── index
│   ├── some index files...
│   └── meta.json
└── zlib-searcher
```
## Raw data
We downloaded `libgen` sql and `zlib` sql and exported the necessary data from them.
```
id, title, author, publisher, extension, filesize, language, year, pages, isbn, ipfs_cid
```
This raw data is used to generate our `index`, you can download the raw data from here:
- [zlib_index_books.csv.zip](https://github.com/zlib-searcher/zlib-searcher/releases/download/0.4.0/zlib_index_books.csv.zip)
- [libgen_index_books.csv.zip](https://github.com/zlib-searcher/zlib-searcher/releases/download/0.4.0/libgen_index_books.csv.zip)
## License
**zlib-searcher** © [zlib-searcher's authors](https://github.com/zlib-searcher/zlib-searcher/graphs/contributors), Released under the [MIT](./LICENSE) License.

@ -0,0 +1,31 @@
[package]
name = "zlib-searcher-core"
version = "0.7.0"
edition.workspace = true
authors.workspace = true
description.workspace = true
homepage.workspace = true
repository.workspace = true
license.workspace = true
[dependencies]
anyhow = { workspace = true }
env_logger = { workspace = true }
log = { workspace = true }
serde = { workspace = true }
serde_with = { workspace = true }
tantivy = { version = "0.18", default-features = false, features = ["mmap"] }
cang-jie = "0.14"
jieba-rs = { version = "0.6", features = ["default-dict"] }
csv = "1.1"
indicatif = "0.17"
sysinfo = { version = "0.27", default-features = false }
[features]
default = ["best-size"]
best-size = ["tantivy/brotli-compression"]
best-speed = ["tantivy/lz4-compression"]

@ -0,0 +1,101 @@
use crate::{Book, Searcher};
use indicatif::{ProgressBar, ProgressIterator, ProgressStyle};
use log::info;
use std::{
fs::File,
io::{BufRead, BufReader},
path::Path,
};
use sysinfo::{System, SystemExt};
use tantivy::doc;
/// Picks the total memory budget, in bytes, handed to the index writer.
///
/// The budget starts from the memory `sysinfo` reports as available and keeps a
/// reserve for the rest of the system: 100 MiB when less than 2 GiB is
/// available, 1 GiB otherwise. The result is then capped at `4293967294` bytes
/// per indexing thread, with at most 8 threads.
fn get_memory_arena_num_bytes() -> usize {
    const MIB: usize = 1024 * 1024;
    const GIB: usize = 1024 * MIB;
    // Per-thread cap kept from the original code.
    // NOTE(review): presumably tantivy's per-thread heap limit — confirm.
    const MAX_BYTES_PER_THREAD: usize = 4293967294;

    let sys = System::new_all();
    let available_memory = sys.available_memory() as usize;
    let cpu_num = sys.cpus().len();
    info!("Your system has cpu {cpu_num} cores and {available_memory} Bytes available");

    // Reserve 100 MiB below 2 GiB, 1 GiB otherwise. The previous code multiplied
    // instead of subtracting in the large-memory case.
    let reserve = if available_memory / GIB < 2 {
        100 * MIB
    } else {
        GIB
    };
    // Saturate so a machine with less free memory than the reserve cannot underflow.
    let budget = available_memory.saturating_sub(reserve);

    let num_threads = std::cmp::min(cpu_num, 8);
    // Saturate: on 32-bit targets the product does not fit in `usize`.
    let s = std::cmp::min(budget, num_threads.saturating_mul(MAX_BYTES_PER_THREAD));
    info!("Using {num_threads} threads and {s} Bytes to do index");
    s
}
impl Searcher {
    /// Indexes every row of `csv_file` (read without a header row) that
    /// deserializes into a [`Book`], then commits and waits for merge threads.
    ///
    /// Rows that fail to deserialize, and documents that fail to be added, are
    /// printed to stdout and skipped.
    ///
    /// # Panics
    /// Panics if the index writer cannot be created, the file cannot be opened,
    /// the progress-bar template is rejected, or the commit / merge wait fails.
    pub fn index(&mut self, csv_file: impl AsRef<Path>) {
        let csv_path = csv_file.as_ref();
        let mut writer = self.index.writer(get_memory_arena_num_bytes()).unwrap();

        let file = File::open(csv_path).unwrap();
        let reader = BufReader::new(file);
        let mut rdr = csv::ReaderBuilder::new()
            .has_headers(false)
            .from_reader(reader);

        // Second pass over the file, only to size the progress bar.
        let line_count = BufReader::new(File::open(csv_path).unwrap())
            .lines()
            .count();
        let style = ProgressStyle::default_bar()
            .template("[{elapsed_precise}] {bar:40.cyan/blue} {pos:>7}/{len:7} {msg}")
            .unwrap();
        // `display()` instead of `to_str().unwrap()`: a non-UTF-8 path must not
        // abort indexing just because of a progress message.
        let bar = ProgressBar::new(line_count as u64)
            .with_message(format!("Indexing {}", csv_path.display()))
            .with_style(style);

        for result in rdr.deserialize::<Book>().progress_with(bar) {
            match result {
                Ok(item) => {
                    if let Err(err) = writer.add_document(doc!(
                        self.id => item.id,
                        self.title => item.title,
                        self.author => item.author,
                        self.publisher => item.publisher,
                        self.extension => item.extension,
                        self.filesize => item.filesize,
                        self.language => item.language,
                        self.year => item.year,
                        self.pages => item.pages,
                        self.isbn => item.isbn,
                        self.ipfs_cid => item.ipfs_cid,
                    )) {
                        println!("{err}");
                    }
                }
                Err(err) => {
                    println!("{err}");
                }
            }
        }

        writer.commit().unwrap();
        writer.wait_merging_threads().expect("merge complete");
    }
}
/// Reads `zlib_index_books.csv` record by record, prints the first CSV error
/// (if any) and then the reader position.
#[test]
fn test_csv_der() {
    let source = BufReader::new(File::open("zlib_index_books.csv").unwrap());
    let mut csv_reader = csv::ReaderBuilder::new()
        .has_headers(false)
        .from_reader(source);

    // Consume records until the first failure; stop there.
    if let Some(err) = csv_reader.records().find_map(Result::err) {
        println!("{err:?}");
    }

    println!("{:?}", csv_reader.position());
}

@ -0,0 +1,152 @@
use std::path::Path;
use serde::{Deserialize, Serialize};
use serde_with::{serde_as, DefaultOnError, DefaultOnNull};
use tantivy::{schema::*, store::Compressor, Index};
use tokenizer::{get_tokenizer, META_DATA_TOKENIZER};
pub mod index;
pub mod search;
mod tokenizer;
/// One book record: the shape of a row in the raw-data CSV files and of each
/// search result.
///
/// The CSV readers in this project are built with `has_headers(false)`, so do
/// not reorder the fields without checking how rows are mapped onto them.
#[serde_as]
#[derive(Debug, Default, Serialize, Deserialize)]
pub struct Book {
    pub id: u64,
    pub title: String,
    // `DefaultOnNull`: a null value deserializes to the type's default (empty string).
    #[serde_as(deserialize_as = "DefaultOnNull")]
    pub author: String,
    #[serde_as(deserialize_as = "DefaultOnNull")]
    pub publisher: String,
    #[serde_as(deserialize_as = "DefaultOnNull")]
    pub extension: String,
    // `DefaultOnError`: a value that fails to deserialize becomes the default (0).
    #[serde_as(deserialize_as = "DefaultOnError")]
    pub filesize: u64,
    #[serde_as(deserialize_as = "DefaultOnNull")]
    pub language: String,
    #[serde_as(deserialize_as = "DefaultOnError")]
    pub year: u64,
    #[serde_as(deserialize_as = "DefaultOnError")]
    pub pages: u64,
    #[serde_as(deserialize_as = "DefaultOnNull")]
    pub isbn: String,
    #[serde_as(deserialize_as = "DefaultOnNull")]
    pub ipfs_cid: String,
}
impl From<(&Schema, Document)> for Book {
    /// Rebuilds a [`Book`] from a stored document by looking up each field by
    /// name in `schema`.
    ///
    /// A value of an unexpected type falls back to the default (`""` / `0`).
    ///
    /// # Panics
    /// Panics if a field name is not in the schema or the document has no
    /// value for it.
    fn from((schema, doc): (&Schema, Document)) -> Self {
        // First stored value of the named field, read as text.
        let text_of = |name: &str| -> String {
            let field = schema.get_field(name).unwrap();
            let value = doc.get_first(field).unwrap();
            value.as_text().unwrap_or_default().to_owned()
        };
        // First stored value of the named field, read as u64.
        let u64_of = |name: &str| -> u64 {
            let field = schema.get_field(name).unwrap();
            let value = doc.get_first(field).unwrap();
            value.as_u64().unwrap_or_default()
        };

        Book {
            id: u64_of("id"),
            title: text_of("title"),
            author: text_of("author"),
            publisher: text_of("publisher"),
            extension: text_of("extension"),
            filesize: u64_of("filesize"),
            language: text_of("language"),
            year: u64_of("year"),
            pages: u64_of("pages"),
            isbn: text_of("isbn"),
            ipfs_cid: text_of("ipfs_cid"),
        }
    }
}
/// Handle to the on-disk index together with its schema and the field handles
/// used when indexing and searching.
pub struct Searcher {
    index: Index,
    schema: Schema,
    // Field handles, one per `Book` field, obtained from the schema builder in `new`.
    id: Field,
    title: Field,
    author: Field,
    publisher: Field,
    extension: Field,
    filesize: Field,
    language: Field,
    year: Field,
    pages: Field,
    isbn: Field,
    ipfs_cid: Field,
}
impl Searcher {
    /// Opens the index stored in `index_dir`, creating the directory and an
    /// empty index with this schema when opening fails.
    ///
    /// Also selects the doc-store compressor according to the enabled cargo
    /// feature and registers the metadata tokenizer.
    ///
    /// # Panics
    /// Panics if the directory or the index cannot be created.
    pub fn new(index_dir: impl AsRef<Path>) -> Self {
        let index_dir = index_dir.as_ref();

        // Stored text tokenized with the metadata tokenizer, keeping
        // frequencies and positions.
        let meta_text = TextOptions::default()
            .set_indexing_options(
                TextFieldIndexing::default()
                    .set_tokenizer(META_DATA_TOKENIZER)
                    .set_index_option(IndexRecordOption::WithFreqsAndPositions),
            )
            .set_stored();

        // Fields are added in the same order as before so field ids stay stable.
        let mut builder = Schema::builder();
        let id = builder.add_u64_field("id", INDEXED | STORED);
        let title = builder.add_text_field("title", meta_text.clone());
        let author = builder.add_text_field("author", meta_text.clone());
        let publisher = builder.add_text_field("publisher", meta_text);
        let extension = builder.add_text_field("extension", STRING | STORED);
        let filesize = builder.add_u64_field("filesize", STORED);
        let language = builder.add_text_field("language", TEXT | STORED);
        let year = builder.add_u64_field("year", STORED);
        let pages = builder.add_u64_field("pages", STORED);
        let isbn = builder.add_text_field("isbn", TEXT | STORED);
        let ipfs_cid = builder.add_text_field("ipfs_cid", STORED);
        let schema = builder.build();

        // Open an existing index, or fall back to creating a fresh one.
        let mut index = match Index::open_in_dir(index_dir) {
            Ok(existing) => existing,
            Err(_) => {
                std::fs::create_dir_all(index_dir).expect("create index directory");
                Index::create_in_dir(index_dir, schema.clone()).unwrap()
            }
        };

        #[cfg(feature = "best-size")]
        {
            index.settings_mut().docstore_compression = Compressor::Brotli; // size: 2.1G, size is best
        }
        #[cfg(feature = "best-speed")]
        {
            index.settings_mut().docstore_compression = Compressor::Lz4; // size: 3.1G, speed is best
        }

        index
            .tokenizers()
            .register(META_DATA_TOKENIZER, get_tokenizer());
        // A failure to set up the multithreaded executor is deliberately ignored.
        let _ = index.set_default_multithread_executor();

        Self {
            index,
            schema,
            id,
            title,
            author,
            publisher,
            extension,
            filesize,
            language,
            year,
            pages,
            isbn,
            ipfs_cid,
        }
    }
}

@ -0,0 +1,29 @@
use crate::{Book, Searcher};
use tantivy::{collector::TopDocs, query::QueryParser};
impl Searcher {
pub fn search(&self, query: &str, limit: usize) -> Vec<Book> {
let reader = self.index.reader().unwrap();
let searcher = reader.searcher();
let mut query_parser = QueryParser::for_index(
&self.index,
vec![self.title, self.author, self.publisher, self.isbn],
);
query_parser.set_conjunction_by_default();
let query = query_parser.parse_query(query).unwrap();
let top_docs = searcher
.search(&query, &TopDocs::with_limit(limit))
.unwrap();
top_docs
.iter()
.map(|d| {
let doc = searcher.doc(d.1).unwrap();
let item: Book = (&self.schema, doc).into();
item
})
.collect()
}
}

@ -0,0 +1,18 @@
use std::sync::Arc;
use cang_jie::{CangJieTokenizer, TokenizerOption};
use jieba_rs::Jieba;
use tantivy::tokenizer::{AsciiFoldingFilter, LowerCaser, RemoveLongFilter, TextAnalyzer};
pub const META_DATA_TOKENIZER: &str = "meta_data_tokenizer";
/// Builds the analyzer registered under [`META_DATA_TOKENIZER`]: jieba-based
/// tokenization (search mode, HMM off), then removal of tokens limited at 20,
/// ASCII folding and lower-casing.
pub fn get_tokenizer() -> TextAnalyzer {
    let jieba = Arc::new(Jieba::new());
    let search_mode = TokenizerOption::ForSearch { hmm: false };
    let base = TextAnalyzer::from(CangJieTokenizer {
        worker: jieba,
        option: search_mode,
    });

    // Filter order is part of the behavior; keep it as is.
    let without_long = base.filter(RemoveLongFilter::limit(20));
    let folded = without_long.filter(AsciiFoldingFilter);
    folded.filter(LowerCaser)
}

@ -0,0 +1,4 @@
# Generated by Cargo
# will have compiled files and executables
/target/
/index/

@ -0,0 +1,38 @@
[package]
name = "zlib-searcher-desktop"
version = "0.7.0"
edition = "2021"
authors = ["Wybxc <wybxc@qq.com>", "zu1k <i@zu1k.com>"]
description = "search z-library index."
homepage = "https://github.com/zlib-searcher/zlib-searcher"
repository = "https://github.com/zlib-searcher/zlib-searcher"
license = "MIT"
[build-dependencies]
tauri-build = { version = "1.2.1", features = [] }
[dependencies]
zlib-searcher-core = { workspace = true }
anyhow = { workspace = true }
serde = { workspace = true }
env_logger = { workspace = true }
log = { workspace = true }
serde_json = "1.0"
tauri = { version = "1.2.1", features = ["dialog-open", "shell-open"] }
tokio = { version = "1", features = ["sync", "parking_lot"] }
confy = "0.5"
dunce = "1.0"
[features]
# by default Tauri runs in production mode
# when `tauri dev` runs it is executed with `cargo run --no-default-features` if `devPath` is an URL
default = ["custom-protocol"]
# this feature is used for production builds where `devPath` points to the filesystem
# DO NOT remove this
custom-protocol = ["tauri/custom-protocol"]

@ -0,0 +1,3 @@
// Cargo build script for the desktop crate: delegates entirely to `tauri_build`.
fn main() {
    tauri_build::build()
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.2 KiB

@ -0,0 +1,121 @@
#![cfg_attr(
all(not(debug_assertions), target_os = "windows"),
windows_subsystem = "windows"
)]
use log::info;
use std::{error::Error, path::PathBuf};
use zlib_searcher_core::{Book, Searcher};
const VERSION: &str = env!("CARGO_PKG_VERSION");
use serde::{Deserialize, Serialize};
use tauri::State;
use tokio::sync::Mutex;
/// Desktop app settings, loaded and stored through `confy` and exchanged with
/// the frontend via the `get_config` / `set_config` commands.
#[derive(Clone, Debug, Serialize, Deserialize)]
struct AppConfig {
    /// Directory handed to `Searcher::new`.
    pub index_dir: PathBuf,
    /// Empty by default.
    // NOTE(review): presumably consumed by the frontend to build download links — confirm.
    pub ipfs_gateways: Vec<String>,
}
/// Returns the canonical path of the directory `name` next to the running
/// executable, creating it if needed; `None` if any step fails.
fn get_dir(name: &str) -> Option<PathBuf> {
    let exe_path = std::env::current_exe().ok()?;
    let target = exe_path.parent()?.join(name);
    std::fs::create_dir_all(&target).ok()?;
    dunce::canonicalize(target).ok()
}
impl Default for AppConfig {
    /// Defaults to an `index` directory beside the executable (falling back to
    /// the relative path `index`) and no IPFS gateways.
    fn default() -> Self {
        Self {
            index_dir: get_dir("index").unwrap_or_else(|| PathBuf::from("index")),
            ipfs_gateways: Vec::new(),
        }
    }
}
impl AppConfig {
    /// Application name under which `confy` stores the configuration.
    const APP_NAME: &'static str = "zlib-searcher-desktop";

    /// Loads the configuration through `confy`.
    pub fn load() -> Result<Self, Box<dyn std::error::Error>> {
        confy::load(Self::APP_NAME, None).map_err(Into::into)
    }

    /// Persists this configuration through `confy`.
    pub fn save(&self) -> Result<(), Box<dyn std::error::Error>> {
        confy::store(Self::APP_NAME, None, self).map_err(Into::into)
    }

    /// Path of the configuration file as reported by `confy`.
    pub fn configuration_file_path() -> Result<PathBuf, Box<dyn std::error::Error>> {
        confy::get_configuration_file_path(Self::APP_NAME, None).map_err(Into::into)
    }
}
/// Tauri command: returns a copy of the current configuration.
#[tauri::command]
async fn get_config(config: State<'_, Mutex<AppConfig>>) -> Result<AppConfig, String> {
    let current = config.lock().await;
    Ok(current.clone())
}
/// Tauri command: replaces the configuration with `new_config` and saves it.
///
/// When `index_dir` differs from the current one, the shared searcher is
/// rebuilt for the new directory first. A save failure is returned as its
/// string form.
#[tauri::command]
async fn set_config(
    new_config: AppConfig,
    config: State<'_, Mutex<AppConfig>>,
    searcher: tauri::State<'_, Mutex<Searcher>>,
) -> Result<(), String> {
    let mut current = config.lock().await;

    // reload searcher if index_dir changed
    let index_dir_changed = current.index_dir != new_config.index_dir;
    if index_dir_changed {
        info!("index_dir changed, reloading searcher");
        let mut live_searcher = searcher.lock().await;
        *live_searcher = Searcher::new(new_config.index_dir.clone());
    }

    *current = new_config;
    if let Err(e) = current.save() {
        return Err(e.to_string());
    }
    info!("Config saved: {:?}", current);
    Ok(())
}
/// Tauri command: logs the query and runs it on the shared searcher,
/// returning at most `limit` books.
#[tauri::command]
async fn search(
    searcher: tauri::State<'_, Mutex<Searcher>>,
    query: String,
    limit: usize,
) -> Result<Vec<Book>, ()> {
    info!("Search: {}", query);
    let guard = searcher.lock().await;
    let books = guard.search(&query, limit);
    Ok(books)
}
/// Tauri command: the crate version captured at compile time.
#[tauri::command]
fn version() -> String {
    String::from(VERSION)
}
/// Desktop entry point: loads the configuration, builds the searcher for its
/// `index_dir`, registers both as Tauri-managed state and runs the app.
fn main() -> Result<(), Box<dyn Error>> {
    env_logger::init();

    let loaded = AppConfig::load()?;
    let searcher_state = Mutex::new(Searcher::new(&loaded.index_dir));
    let config_state = Mutex::new(loaded);

    // The path lookup stays inside the macro so it is only evaluated when
    // info-level logging is enabled.
    info!(
        "load config from {:?}",
        AppConfig::configuration_file_path()?
    );

    let builder = tauri::Builder::default()
        .manage(config_state)
        .manage(searcher_state)
        .invoke_handler(tauri::generate_handler![
            version, search, get_config, set_config
        ]);
    builder
        .run(tauri::generate_context!())
        .expect("error while running tauri application");

    Ok(())
}

@ -0,0 +1,71 @@
{
"build": {
"beforeBuildCommand": "cd ../frontend && pnpm run build",
"beforeDevCommand": "cd ../frontend && pnpm run dev",
"devPath": "http://localhost:5173/",
"distDir": "../../frontend/dist"
},
"package": {
"productName": "zLib Searcher",
"version": "0.1.0"
},
"tauri": {
"allowlist": {
"all": false,
"shell": {
"open": true
},
"dialog": {
"open": true
}
},
"bundle": {
"active": true,
"category": "DeveloperTool",
"copyright": "",
"deb": {
"depends": []
},
"externalBin": [],
"icon": [
"icons/32x32.png",
"icons/128x128.png",
"icons/128x128@2x.png",
"icons/icon.icns",
"icons/icon.ico"
],
"identifier": "com.github.zlib-searcher",
"longDescription": "",
"macOS": {
"entitlements": null,
"exceptionDomain": "",
"frameworks": [],
"providerShortName": null,
"signingIdentity": null
},
"resources": [],
"shortDescription": "",
"targets": "all",
"windows": {
"certificateThumbprint": null,
"digestAlgorithm": "sha256",
"timestampUrl": ""
}
},
"security": {
"csp": null
},
"updater": {
"active": false
},
"windows": [
{
"fullscreen": false,
"height": 900,
"resizable": true,
"title": "zLib Searcher",
"width": 1500
}
]
}
}

@ -0,0 +1,34 @@
[package]
name = "zlib-searcher"
version = "0.7.0"
edition.workspace = true
authors.workspace = true
description.workspace = true
homepage.workspace = true
repository.workspace = true
license.workspace = true
default-run = "zlib-searcher"
[dependencies]
zlib-searcher-core = { workspace = true }
anyhow = { workspace = true }
env_logger = { workspace = true }
log = { workspace = true }
serde = { workspace = true }
serde_with = { workspace = true }
actix-web = "4"
actix-web-static-files = "4.0"
static-files = "0.2"
csv = "1.1"
clap = { version = "4", features = ["derive"] }
[build-dependencies]
static-files = "0.2"
[features]
default = ["best-size"]
best-size = ["zlib-searcher-core/best-size"]
best-speed = ["zlib-searcher-core/best-speed"]

@ -0,0 +1,6 @@
use static_files::resource_dir;
// Cargo build script: generates the embedded static-file resources from the
// built frontend in `../../frontend/dist` and asks cargo to rerun this script
// when that directory changes.
fn main() -> std::io::Result<()> {
    println!("cargo:rerun-if-changed=../../frontend/dist");
    resource_dir("../../frontend/dist").build()
}

@ -0,0 +1,61 @@
use std::{fs::File, io::BufReader};
use zlib_searcher_core::Book;
/// Copies every book whose title passes `is_chinese_title` from the two raw
/// CSV files into `zlib_libgen_chinese_books.csv`.
///
/// Rows that fail to deserialize or serialize are printed and skipped; the
/// reader position is printed after each input file.
fn main() {
    let mut out = csv::Writer::from_path("zlib_libgen_chinese_books.csv").unwrap();

    for path in ["zlib_index_books.csv", "libgen_index_books.csv"] {
        let input = BufReader::new(File::open(path).unwrap());
        let mut rdr = csv::ReaderBuilder::new()
            .has_headers(false)
            .from_reader(input);

        for row in rdr.deserialize::<Book>() {
            match row {
                Ok(book) if is_chinese_title(&book) => {
                    if let Err(err) = out.serialize(&book) {
                        println!("err: {err}");
                    }
                }
                // Parsed fine but not a Chinese title: nothing to write.
                Ok(_) => {}
                Err(err) => println!("{err}"),
            }
        }

        println!("{:?}", rdr.position());
    }
}
fn is_chinese_title(book: &Book) -> bool {
let chinese_char_count = book.title.matches(is_chinese_char).count();
chinese_char_count as f32 / book.title.len() as f32 > 0.3
}
/// Returns `true` when `c` falls in one of the code-point ranges this tool
/// treats as Chinese (ranges listed in ascending order).
#[inline(always)]
const fn is_chinese_char(c: char) -> bool {
    match c as u32 {
        0x2E80..=0x2EF3         // CJK radicals supplement
        | 0x2F00..=0x2FD5       // Kangxi radicals
        | 0x2FF0..=0x2FFB       // ideographic description characters
        | 0x3105..=0x3120       // Bopomofo
        | 0x31A0..=0x31BA       // Bopomofo extended
        | 0x31C0..=0x31E3       // CJK strokes
        | 0x3400..=0x4DB5       // CJK unified ideographs extension A
        | 0x4E00..=0x9FFF       // CJK unified ideographs
        | 0xE400..=0xE5E8       // private use area
        | 0xE600..=0xE6CF       // private use area
        | 0xE815..=0xE86F       // private use area
        | 0xF900..=0xFAD9       // CJK compatibility ideographs
        | 0x20000..=0x2A6D6     // extension B
        | 0x2A700..=0x2B734     // extension C
        | 0x2B740..=0x2B81D     // extension D
        | 0x2F800..=0x2FA1D => true, // compatibility ideographs supplement
        _ => false,
    }
}

@ -0,0 +1,141 @@
use actix_web::{
get, http::header, middleware::Logger, web, App, HttpResponse, HttpServer, Responder,
};
use actix_web_static_files::ResourceFiles;
use clap::Parser;
use log::{info, LevelFilter};
use serde::{Deserialize, Serialize};
use std::{path::PathBuf, sync::Arc};
use zlib_searcher_core::{Book, Searcher};
include!(concat!(env!("OUT_DIR"), "/generated.rs"));
/// State cloned into each actix `App` instance; clones share one `Searcher`
/// through the `Arc`.
#[derive(Clone)]
struct AppState {
    searcher: Arc<Searcher>,
}
impl AppState {
    /// Builds the state by creating a `Searcher` for `index_dir`.
    pub fn init(index_dir: &str) -> Self {
        info!("AppState init!");
        let searcher = Arc::new(Searcher::new(index_dir));
        Self { searcher }
    }
}
/// Result limit used when the `limit` query parameter is omitted.
fn default_limit() -> usize {
    const DEFAULT_LIMIT: usize = 30;
    DEFAULT_LIMIT
}
/// Query-string parameters accepted by `GET /search`.
#[derive(Deserialize)]
struct SearchQuery {
    /// Query text passed to `Searcher::search`.
    query: String,
    /// Maximum number of results; `default_limit()` when omitted.
    #[serde(default = "default_limit")]
    limit: usize,
}
/// JSON body returned by `GET /search`.
#[derive(Serialize)]
struct SearchResult {
    books: Vec<Book>,
}
#[get("/search")]
async fn search(query: web::Query<SearchQuery>, state: web::Data<AppState>) -> impl Responder {
let books = state.searcher.search(&query.query, query.limit);
let result = SearchResult { books };
return HttpResponse::Ok()
.insert_header(header::ContentType::json())
.insert_header((header::ACCESS_CONTROL_ALLOW_ORIGIN, "*"))
.json(result);
}
// Top-level command-line options: only a subcommand.
// (Plain `//` comments on purpose: `///` doc comments would become clap help text.)
#[derive(Parser)]
#[clap(author, version, about, long_about)]
struct AppOpts {
    #[clap(subcommand)]
    subcmd: SubCommand,
}
// Available subcommands; `main` dispatches on these.
// The `///` comments on the variants are user-visible help text.
#[derive(Parser)]
enum SubCommand {
    /// run search webserver
    Run(Run),
    /// index the raw data
    Index(Index),
}
// Options of the `run` subcommand.
#[derive(Parser)]
struct Run {
    // Address passed to `HttpServer::bind`; `-b` / `--bind`.
    #[clap(
        short,
        long,
        default_value = "127.0.0.1:7070",
        help = "webserver bind address"
    )]
    bind: String,
}
// Options of the `index` subcommand.
#[derive(Parser)]
struct Index {
    // CSV files to index (`-f` / `--file`, one or more values); when empty,
    // `index` falls back to the two default raw-data file names.
    #[clap(short, long, num_args=1.., help = "specify csv file to be indexed")]
    file: Vec<PathBuf>,
}
/// CLI entry point: enables info-level logging, parses the arguments and
/// dispatches to the chosen subcommand. A webserver error panics.
fn main() {
    env_logger::builder().filter_level(LevelFilter::Info).init();

    match AppOpts::parse().subcmd {
        SubCommand::Run(run_opts) => run(run_opts).unwrap(),
        SubCommand::Index(index_opts) => index(index_opts),
    }
}
/// Serves the search API and the embedded frontend files on `opts.bind`,
/// using the `index` directory next to the executable.
///
/// # Panics
/// Panics if the executable path cannot be determined or is not valid UTF-8.
#[actix_web::main]
async fn run(opts: Run) -> std::io::Result<()> {
    info!("zlib-searcher webserver started!");

    let exe_path = std::env::current_exe().unwrap();
    let index_path = exe_path.parent().unwrap().join("index");
    let index_dir = index_path.to_str().unwrap().to_string();
    let app_state = AppState::init(&index_dir);

    // The factory runs once per worker; each worker gets a clone of the shared state.
    let server = HttpServer::new(move || {
        App::new()
            .wrap(Logger::default())
            .app_data(web::Data::new(app_state.clone()))
            .service(search)
            .service(ResourceFiles::new("/", generate()))
    });

    server.bind(opts.bind)?.run().await
}
/// Indexes the CSV files given on the command line into the `index` directory
/// next to the executable. With no files given, it indexes
/// `zlib_index_books.csv` and `libgen_index_books.csv`.
///
/// # Panics
/// Panics if the executable path cannot be determined.
fn index(opts: Index) {
    // Keep the path as a `PathBuf`: `Searcher::new` accepts any `AsRef<Path>`,
    // so the old `to_str().unwrap()` only added a panic on non-UTF-8 paths.
    let index_dir = std::env::current_exe()
        .unwrap()
        .parent()
        .unwrap()
        .join("index");
    let mut searcher = Searcher::new(&index_dir);

    if opts.file.is_empty() {
        for file in ["zlib_index_books.csv", "libgen_index_books.csv"] {
            searcher.index(file);
        }
    } else {
        for file in &opts.file {
            searcher.index(file);
        }
    }
}

@ -0,0 +1,21 @@
version: '3'
services:
zlib:
image: lamacchinadesiderante/millelibri:latest
# image: millelibri:v0.2
# image: millelibri
# build:
# context: .
# dockerfile: ./Dockerfile
restart: always
ports:
- "7070:7070"
volumes:
- ./index:/index

@ -0,0 +1,54 @@
import time
import pandas as pd
import hanlp
import torch
import random
import os
import numpy as np
from tqdm import tqdm
def seed_everything(seed=2022):
    """Seed every random number generator used in this environment.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy, and PyTorch (CPU and CUDA), and puts cuDNN in deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

    # some cudnn methods can be random even after fixing the seed
    # unless you tell it to be deterministic
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
seed_everything()
"""
专业级本地模型
"""
def Pro_tokenize():
    """Tokenize every title in ``zlib_index_books.csv`` and save the result.

    Loads the HanLP ``UD_TOK_MMINILMV2L12`` tokenizer, reads the first two CSV
    columns as ``id`` and ``title``, stores each title's token list in a
    ``title_token`` column and writes the frame to ``title_token.csv``.
    A title that fails to tokenize is reported and left as ``None``.
    """
    mul_tokenizer = hanlp.load(hanlp.pretrained.tok.UD_TOK_MMINILMV2L12)

    df_zlib = pd.read_csv('zlib_index_books.csv', header=None)
    df_title = df_zlib.iloc[:, :2].astype(str)
    df_title.columns = ["id", "title"]
    df_title["title_token"] = None

    print("分词中……\n")
    # df_title = df_title.head()
    total = len(df_title)
    for i in tqdm(range(total)):
        try:
            title = df_title.at[i, "title"]
            # Write through `.at`: the previous chained form
            # `df["title_token"][i] = ...` assigns into an intermediate Series
            # and is not guaranteed to update the frame.
            df_title.at[i, "title_token"] = mul_tokenizer(title)
        except Exception as e:
            print(e)

    print(len(df_title))
    df_title.to_csv("title_token.csv")
    print("分词结果保存完成……")
if __name__ == '__main__':
Pro_tokenize()

@ -0,0 +1,4 @@
*.log
node_modules
dist
dist-ssr

@ -0,0 +1,2 @@
# .env.production
VITE_BACKEND_BASE_API = 'http://127.0.0.1:7070/'

@ -0,0 +1,2 @@
# .env.production
VITE_BACKEND_BASE_API = ''

@ -0,0 +1,26 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
pnpm-debug.log*
lerna-debug.log*
node_modules
dist
dist-ssr
*.local
# Editor directories and files
.vscode/*
!.vscode/extensions.json
.idea
.DS_Store
*.suo
*.ntvs*
*.njsproj
*.sln
*.sw?
components.d.ts

@ -0,0 +1,4 @@
singleQuote: true
semi: true
printWidth: 100
trailingComma: none

@ -0,0 +1,12 @@
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Millelibri / zlib searcher</title>
</head>
<body>
<div id="app"></div>
<script type="module" src="/src/main.tsx"></script>
</body>
</html>

File diff suppressed because it is too large Load Diff

@ -0,0 +1,49 @@
{
"name": "frontend",
"private": true,
"version": "0.2.0",
"type": "module",
"repository": "https://github.com/lamacchinadesiderante/millelibri",
"scripts": {
"dev": "vite",
"build": "vite build",
"preview": "vite preview"
},
"dependencies": {
"@chakra-ui/react": "^2.4.6",
"@chakra-ui/skip-nav": "^2.0.13",
"@chakra-ui/system": "^2.3.7",
"@emotion/react": "^11.10.5",
"@emotion/styled": "^11.10.5",
"@tanstack/react-table": "^8.7.4",
"@tanstack/table-core": "^8.7.4",
"@tauri-apps/api": "^1.2.0",
"ahooks": "^3.7.4",
"axios": "^1.2.2",
"filesize": "^10.0.6",
"framer-motion": "^7.10.3",
"i18next": "^22.4.6",
"i18next-browser-languagedetector": "^7.0.1",
"lodash": "^4.17.21",
"react": "^18.2.0",
"react-dom": "^18.2.0",
"react-hook-form": "^7.41.3",
"react-i18next": "^12.1.1",
"react-icons": "^4.7.1",
"react-intersection-observer": "^9.4.1",
"react-responsive": "^9.0.2"
},
"devDependencies": {
"@babel/core": "^7.20.7",
"@darkobits/vite-plugin-favicons": "^0.1.8",
"@types/lodash": "^4.14.191",
"@types/node": "^18.11.18",
"@types/react": "^18.0.26",
"@types/react-dom": "^18.0.10",
"@vitejs/plugin-react": "^2.2.0",
"prettier": "^2.8.1",
"typescript": "^4.9.4",
"vite": "^3.2.5",
"vite-plugin-top-level-await": "^1.2.2"
}
}

File diff suppressed because it is too large Load Diff

@ -0,0 +1,67 @@
import { Flex, HStack, Icon, IconButton, Spacer } from '@chakra-ui/react';
import React, { Suspense, useState } from 'react';
import { SkipNavContent, SkipNavLink } from '@chakra-ui/skip-nav';
import { Book } from './scripts/searcher';
import BooksView from './components/BooksView';
import ColorModeSwitch from './components/ColorModeSwitch';
import ExternalLink from './components/ExternalLink';
import { FaGithub } from 'react-icons/fa';
import Footer from './components/Footer';
import Header from './components/Header';
import LanguageSwitch from './components/LanguageSwitch';
import Search from './components/Search';
import { repository } from '../package.json';
import { useTranslation } from 'react-i18next';
/**
 * Main content area: owns the list of found books, lets `Search` replace it
 * and renders it with `BooksView`.
 */
const Main: React.FC = () => {
  const [foundBooks, updateFoundBooks] = useState<Book[]>([]);

  return (
    <>
      <SkipNavContent />
      <Search setBooks={updateFoundBooks} />
      <BooksView books={foundBooks} />
    </>
  );
};
// Lazily loaded settings component: the `Settings-tauri` variant when the
// `VITE_TAURI` env variable is '1', the plain `Settings` component otherwise.
const Settings =
  import.meta.env.VITE_TAURI === '1'
    ? React.lazy(() => import('./components/Settings-tauri'))
    : React.lazy(() => import('./components/Settings'));
const App: React.FC = () => {
const { t } = useTranslation();
return (
<Flex direction="column" minH="100vh">
<SkipNavLink>Skip to content</SkipNavLink>
<Header title="Millelibri">
<HStack spacing={{ base: 1, md: 2 }}>
<IconButton
as={ExternalLink}
aria-label={t('nav.repository')}
title={t('nav.repository') ?? ''}