| use pyo3::prelude::*; |
| use tantivy::collector::TopDocs; |
| use tantivy::query::QueryParser; |
| use tantivy::schema::*; |
| use tantivy::{doc, Index, ReloadPolicy}; |
| use walkdir::WalkDir; |
| use std::path::PathBuf; |
| use std::fs; |
|
|
| #[pyfunction] |
| fn index_directory(index_path: String, repo_path: String) -> PyResult<u64> { |
| let mut schema_builder = Schema::builder(); |
| let filepath = schema_builder.add_text_field("filepath", STRING | STORED); |
| let content = schema_builder.add_text_field("content", TEXT); |
| let schema = schema_builder.build(); |
| let index_path_buf = PathBuf::from(&index_path); |
|
|
| let index = if index_path_buf.exists() { |
| Index::open_in_dir(&index_path_buf) |
| .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))? |
| } else { |
| std::fs::create_dir_all(&index_path_buf)?; |
| Index::create_in_dir(&index_path_buf, schema.clone()) |
| .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))? |
| }; |
|
|
| let mut writer = index.writer(50_000_000) |
| .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))?; |
|
|
| let mut count: u64 = 0; |
| for entry in WalkDir::new(&repo_path).into_iter().filter_map(|e| e.ok()) { |
| let path = entry.path(); |
| if path.is_file() { |
| if let Some(ext) = path.extension() { |
| let ext = ext.to_string_lossy().to_lowercase(); |
| if matches!(ext.as_str(), "png"|"jpg"|"jpeg"|"gif"|"svg"|"ico"|"woff"|"woff2"|"ttf"|"eot"|"otf") { |
| continue; |
| } |
| } |
| if let Ok(text) = fs::read_to_string(path) { |
| let rel_path = path.strip_prefix(&repo_path) |
| .unwrap_or(path) |
| .to_string_lossy() |
| .to_string(); |
| writer.add_document(doc!( |
| filepath => rel_path, |
| content => text, |
| )).map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))?; |
| count += 1; |
| } |
| } |
| } |
|
|
| writer.commit() |
| .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))?; |
|
|
| Ok(count) |
| } |
|
|
| #[pyfunction] |
| fn search(index_path: String, query: String) -> PyResult<Vec<String>> { |
| let index = Index::open_in_dir(&index_path) |
| .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))?; |
| let reader = index.reader_builder() |
| .reload_policy(ReloadPolicy::OnCommitWithDelay) |
| .try_into() |
| .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))?; |
| let searcher = reader.searcher(); |
|
|
| let schema = index.schema(); |
| let content_field = schema.get_field("content") |
| .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("{}", e)))?; |
| let filepath_field = schema.get_field("filepath") |
| .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("{}", e)))?; |
|
|
| let query_parser = QueryParser::for_index(&index, vec![content_field]); |
| let query = query_parser.parse_query(&query) |
| .map_err(|e| PyErr::new::<pyo3::exceptions::PyValueError, _>(format!("{}", e)))?; |
|
|
| let top_docs = searcher.search(&query, &TopDocs::with_limit(20)) |
| .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))?; |
|
|
| let mut results = Vec::new(); |
| for (_score, doc_address) in top_docs { |
| let doc = searcher.doc::<TantivyDocument>(doc_address) |
| .map_err(|e| PyErr::new::<pyo3::exceptions::PyIOError, _>(format!("{}", e)))?; |
| if let Some(path) = doc.get_first(filepath_field) { |
| if let Some(text) = path.as_str() { |
| results.push(text.to_string()); |
| } |
| } |
| } |
| Ok(results) |
| } |
|
|
| #[pymodule] |
| fn uspoo_core(m: &Bound<'_, PyModule>) -> PyResult<()> { |
| m.add_function(wrap_pyfunction!(index_directory, m)?)?; |
| m.add_function(wrap_pyfunction!(search, m)?)?; |
| Ok(()) |
| } |