# Semantic_File / searcher / search_engine.py
# JackSparrow89's picture
# Upload 65 files
# bb04c5f verified
# searcher/search_engine.py
import yaml
from searcher.query_understanding import QueryUnderstanding
from searcher.dense_retriever import DenseRetriever
from searcher.sparse_retriever import SparseRetriever
from searcher.fusion_ranker import FusionRanker
from searcher.reranker import Reranker
from searcher.facet_filter import FacetFilter
from searcher.highlighter import Highlighter
class SearchEngine:
    """
    Orchestrates the full search pipeline end-to-end:

        raw query
          → QueryUnderstanding  (expand + rewrite)
          → DenseRetriever      (semantic FAISS search)
          → SparseRetriever     (BM25 lexical search)
          → FusionRanker        (RRF merge)
          → Reranker            (cross-encoder precision)
          → FacetFilter         (type / date / size / directory)
          → Highlighter         (preview + HTML highlights)
          → final results
    """

    def __init__(self, config_path="config.yaml"):
        """
        Load the YAML config and build every pipeline stage.

        Args:
            config_path — path to the YAML config file; the same path is
                          handed to the stages that take one.
        """
        self.config_path = config_path

        # Read as UTF-8 explicitly instead of relying on the platform's
        # locale default encoding.
        with open(config_path, encoding="utf-8") as f:
            # safe_load returns None for an empty document; fall back to an
            # empty mapping so the .get() lookups below keep working.
            self.config = yaml.safe_load(f) or {}

        self.query_understanding = QueryUnderstanding(config_path)
        self.dense_retriever = DenseRetriever(config_path)
        self.sparse_retriever = SparseRetriever(config_path)
        self.fusion_ranker = FusionRanker(k=60)
        self.reranker = Reranker(config_path)
        self.facet_filter = FacetFilter()
        self.highlighter = Highlighter(preview_words=30)

        # Size of the candidate pool per retriever / after fusion, and the
        # default number of results handed back to the caller.
        self.candidate_k = self.config.get("candidate_k", 20)
        self.final_k = self.config.get("top_k", 5)

    def search(
        self,
        query: str,
        top_k: int = None,
        file_type: list[str] = None,
        date_after=None,
        date_before=None,
        min_size: int = None,
        max_size: int = None,
        directory: str = None,
    ) -> dict:
        """
        Run the full search pipeline.

        Args:
            query       — natural language user query
            top_k       — number of final results (overrides config);
                          None or 0 falls back to the configured default
            file_type   — e.g. [".pdf", ".docx"]
            date_after  — datetime; exclude older files
            date_before — datetime; exclude newer files
            min_size    — min file size in bytes
            max_size    — max file size in bytes
            directory   — restrict to this directory

        Returns:
            dict:
                query_info — dict from QueryUnderstanding, guaranteed to
                             carry "original", "expanded" and "rewritten"
                results    — list of final result dicts, each with:
                             filepath, chunk_text, chunk_index,
                             preview, preview_html,
                             dense_score (if present),
                             sparse_score (if present),
                             rrf_score, rerank_score
        """
        k = top_k or self.final_k

        # Step 1 — query understanding; fill in any variant the stage omitted
        # so the later lookups cannot fail.
        query_info = self.query_understanding.process(query)
        for variant in ("original", "expanded", "rewritten"):
            query_info.setdefault(variant, query)

        # Step 2 — dense retrieval (uses expanded query for better semantic reach)
        dense_results = self.dense_retriever.retrieve(
            query_info["expanded"], top_k=self.candidate_k
        )

        # Step 3 — sparse retrieval (uses rewritten query; expansion hurts BM25)
        sparse_results = self.sparse_retriever.retrieve(
            query_info["rewritten"], top_k=self.candidate_k
        )

        # Step 4 — RRF fusion
        fused = self.fusion_ranker.fuse(
            dense_results, sparse_results, top_k=self.candidate_k
        )

        # Step 5 — cross-encoder reranking.
        # Facet filtering happens *after* this step. If the reranker only
        # hands back 2*k items (presumably it truncates to top_k), matching
        # documents ranked below that cut never reach the filter and a
        # filtered search can come back short or empty. When any facet is
        # set, keep the whole fused pool (capped at candidate_k by Step 4).
        facets = {
            "file_type": file_type,
            "date_after": date_after,
            "date_before": date_before,
            "min_size": min_size,
            "max_size": max_size,
            "directory": directory,
        }
        has_facets = any(value is not None for value in facets.values())
        rerank_k = max(k * 2, self.candidate_k) if has_facets else k * 2
        reranked = self.reranker.rerank(
            query_info["original"], fused, top_k=rerank_k
        )

        # Step 6 — facet filtering, then trim to top_k
        filtered = self.facet_filter.filter(reranked, **facets)
        final = filtered[:k]

        # Step 7 — highlight previews
        final = self.highlighter.annotate(final, query_info["original"])
        for r in final:
            if not r.get("preview"):
                # Fall back to the raw chunk text; `or ""` also covers a
                # chunk_text that is present but None.
                r["preview"] = (r.get("chunk_text") or "")[:200]

        return {
            "query_info": query_info,
            "results": final or [],
        }
def _main():
    """
    Interactive console loop: read a query, run the search, print the hits.

    Ends when the user types 'exit' (any case, surrounding whitespace
    ignored), or when stdin is closed / Ctrl-C is pressed.
    """
    engine = SearchEngine()

    while True:
        try:
            query = input("\n🔍 Enter your search query (or type 'exit'): ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave cleanly instead of a traceback.
            print("\nExiting search engine...")
            break

        query = query.strip()
        if not query:
            # Nothing to search for; prompt again rather than running the
            # whole pipeline on an empty string.
            continue
        if query.lower() == "exit":
            print("Exiting search engine...")
            break

        output = engine.search(query, top_k=3)

        print(f"\nQuery : {output['query_info']['original']}")
        print(f"Expanded : {output['query_info']['expanded']}")
        print(f"Results : {len(output['results'])}\n")

        for i, r in enumerate(output["results"], 1):
            print(f"--- Result {i} ---")
            print(f"File : {r['filepath']}")
            print(f"Preview : {r['preview']}")

            # Scores may be missing from a result, so format them only when present.
            rrf = r.get("rrf_score")
            rerank = r.get("rerank_score")
            print(f"RRF : {rrf:.5f}" if rrf is not None else "RRF : n/a")
            print(f"Rerank : {rerank:.4f}" if rerank is not None else "Rerank : n/a")
            print()


if __name__ == "__main__":
    _main()