Spaces:
Sleeping
Sleeping
| # searcher/search_engine.py | |
| import yaml | |
| from searcher.query_understanding import QueryUnderstanding | |
| from searcher.dense_retriever import DenseRetriever | |
| from searcher.sparse_retriever import SparseRetriever | |
| from searcher.fusion_ranker import FusionRanker | |
| from searcher.reranker import Reranker | |
| from searcher.facet_filter import FacetFilter | |
| from searcher.highlighter import Highlighter | |
| class SearchEngine: | |
| """ | |
| Orchestrates the full search pipeline end-to-end: | |
| raw query | |
| β QueryUnderstanding (expand + rewrite) | |
| β DenseRetriever (semantic FAISS search) | |
| β SparseRetriever (BM25 lexical search) | |
| β FusionRanker (RRF merge) | |
| β Reranker (cross-encoder precision) | |
| β FacetFilter (type / date / size / directory) | |
| β Highlighter (preview + HTML highlights) | |
| β final results | |
| """ | |
| def __init__(self, config_path="config.yaml"): | |
| self.config_path = config_path | |
| with open(config_path) as f: | |
| self.config = yaml.safe_load(f) | |
| self.query_understanding = QueryUnderstanding(config_path) | |
| self.dense_retriever = DenseRetriever(config_path) | |
| self.sparse_retriever = SparseRetriever(config_path) | |
| self.fusion_ranker = FusionRanker(k=60) | |
| self.reranker = Reranker(config_path) | |
| self.facet_filter = FacetFilter() | |
| self.highlighter = Highlighter(preview_words=30) | |
| self.candidate_k = self.config.get("candidate_k", 20) | |
| self.final_k = self.config.get("top_k", 5) | |
| def search( | |
| self, | |
| query: str, | |
| top_k: int = None, | |
| file_type: list[str] = None, | |
| date_after=None, | |
| date_before=None, | |
| min_size: int = None, | |
| max_size: int = None, | |
| directory: str = None, | |
| ) -> dict: | |
| """ | |
| Run the full search pipeline. | |
| Args: | |
| query β natural language user query | |
| top_k β number of final results (overrides config) | |
| file_type β e.g. [".pdf", ".docx"] | |
| date_after β datetime; exclude older files | |
| date_before β datetime; exclude newer files | |
| min_size β min file size in bytes | |
| max_size β max file size in bytes | |
| directory β restrict to this directory | |
| Returns: | |
| dict: | |
| query_info β dict from QueryUnderstanding | |
| results β list of final result dicts, each with: | |
| filepath, chunk_text, chunk_index, | |
| preview, preview_html, | |
| dense_score (if present), | |
| sparse_score (if present), | |
| rrf_score, rerank_score | |
| """ | |
| k = top_k or self.final_k | |
| # Step 1 β query understanding | |
| query_info = self.query_understanding.process(query) | |
| query_info.setdefault("original", query) | |
| query_info.setdefault("expanded", query) | |
| query_info.setdefault("rewritten", query) | |
| # Step 2 β dense retrieval (uses expanded query for better semantic reach) | |
| dense_results = self.dense_retriever.retrieve( | |
| query_info["expanded"], top_k=self.candidate_k | |
| ) | |
| # Step 3 β sparse retrieval (uses rewritten query; expansion hurts BM25) | |
| sparse_results = self.sparse_retriever.retrieve( | |
| query_info["rewritten"], top_k=self.candidate_k | |
| ) | |
| # Step 4 β RRF fusion | |
| fused = self.fusion_ranker.fuse(dense_results, sparse_results, top_k=self.candidate_k) | |
| # Step 5 β cross-encoder reranking | |
| reranked = self.reranker.rerank(query_info["original"], fused, top_k=k * 2) | |
| # Step 6 β facet filtering | |
| filtered = self.facet_filter.filter( | |
| reranked, | |
| file_type=file_type, | |
| date_after=date_after, | |
| date_before=date_before, | |
| min_size=min_size, | |
| max_size=max_size, | |
| directory=directory, | |
| ) | |
| # Trim to top_k after filtering | |
| final = filtered[:k] | |
| # Step 7 β highlight previews | |
| final = self.highlighter.annotate(final, query_info["original"]) | |
| for r in final: | |
| if "preview" not in r or not r["preview"]: | |
| r["preview"] = r.get("chunk_text", "")[:200] | |
| return { | |
| "query_info": query_info, | |
| "results": final or [], | |
| } | |
| if __name__ == "__main__": | |
| engine = SearchEngine() | |
| while True: | |
| query = input("\nπ Enter your search query (or type 'exit'): ") | |
| if query.lower() == "exit": | |
| print("Exiting search engine...") | |
| break | |
| output = engine.search(query, top_k=3) | |
| print(f"\nQuery : {output['query_info']['original']}") | |
| print(f"Expanded : {output['query_info']['expanded']}") | |
| print(f"Results : {len(output['results'])}\n") | |
| for i, r in enumerate(output["results"], 1): | |
| print(f"--- Result {i} ---") | |
| print(f"File : {r['filepath']}") | |
| print(f"Preview : {r['preview']}") | |
| # Handle safe printing of scores | |
| rrf = r.get('rrf_score') | |
| rerank = r.get('rerank_score') | |
| if rrf is not None: | |
| print(f"RRF : {rrf:.5f}") | |
| else: | |
| print("RRF : n/a") | |
| if rerank is not None: | |
| print(f"Rerank : {rerank:.4f}") | |
| else: | |
| print("Rerank : n/a") | |
| print() |