Semantic_File / evaluation /query_runner.py
JackSparrow89's picture
Upload 65 files
bb04c5f verified
# evaluation/query_runner.py
from searcher.search_engine import SearchEngine
class QueryRunner:
    """
    Run evaluation queries through the SearchEngine and collect ranked results.

    The results are formatted exactly as the Evaluator expects:
        {query_id: [(doc_id, score), ...]}   ranked best-first
    """

    def __init__(self, config_path: str = "config.yaml"):
        """Create the SearchEngine from the configuration at *config_path*."""
        self.engine = SearchEngine(config_path)

    def _extract_doc_id(self, filepath: str) -> str:
        """
        Strip the dataset prefix from a fake filepath so it matches qrels doc_ids.

        Examples:
            "scifact://12345"    → "12345"
            "nfcorpus://MED-10"  → "MED-10"
            "/real/file.pdf"     → "/real/file.pdf"   (real files unchanged)

        This is critical — without stripping, doc_ids like "nfcorpus://MED-10"
        will never match qrels keys like "MED-10" and all scores will be 0.0.
        """
        # Only the first "://" is treated as the prefix separator; anything
        # after it (including further "://" occurrences) is kept verbatim.
        _prefix, separator, remainder = filepath.partition("://")
        return remainder if separator else filepath

    def _rank(self, query_text: str, top_k: int, mode: str) -> list:
        """
        Run one query through the pipeline variant *mode*.

        Returns a list of (doc_id, score) pairs in which a larger score means
        a better match. The list may contain the same doc_id more than once
        (one entry per retrieved chunk) and is not necessarily sorted.
        """
        engine = self.engine

        if mode == "dense":
            hits = engine.dense_retriever.retrieve(query_text, top_k=top_k)
            # The sign is flipped so that "larger is better" holds like in the
            # other modes.
            # NOTE(review): presumably dense_score is a distance (smaller is
            # better) — confirm against the dense retriever.
            return [
                (self._extract_doc_id(hit["filepath"]), -hit["dense_score"])
                for hit in hits
            ]

        if mode == "sparse":
            hits = engine.sparse_retriever.retrieve(query_text, top_k=top_k)
            return [
                (self._extract_doc_id(hit["filepath"]), hit["sparse_score"])
                for hit in hits
            ]

        if mode == "hybrid":
            dense_hits = engine.dense_retriever.retrieve(query_text, top_k=top_k)
            sparse_hits = engine.sparse_retriever.retrieve(query_text, top_k=top_k)
            fused = engine.fusion_ranker.fuse(dense_hits, sparse_hits, top_k=top_k)
            return [
                (self._extract_doc_id(hit["filepath"]), hit["rrf_score"])
                for hit in fused
            ]

        # Any other value (normally "full") runs the complete pipeline.
        # Prefer the reranker score, fall back to the RRF score, then to 0.
        output = engine.search(query_text, top_k=top_k)
        return [
            (
                self._extract_doc_id(hit["filepath"]),
                hit.get("rerank_score", hit.get("rrf_score", 0)),
            )
            for hit in output["results"]
        ]

    @staticmethod
    def _best_per_doc(ranked) -> list:
        """
        Collapse duplicate doc_ids, keeping the highest score for each.

        Multiple chunks from the same document → keep only the best score.
        Returns (doc_id, score) pairs sorted by score, best first.
        """
        best = {}
        for doc_id, score in ranked:
            if doc_id not in best or score > best[doc_id]:
                best[doc_id] = score
        return sorted(best.items(), key=lambda item: item[1], reverse=True)

    def run(
        self,
        queries: dict,
        top_k: int = 100,
        mode: str = "full",
    ) -> dict:
        """
        Run all queries and return ranked results.

        Args:
            queries — {query_id: query_text}
            top_k   — number of results requested per query (use 100 for eval);
                      after de-duplication a query may yield fewer entries
            mode    — pipeline variant to test:
                        "dense"  → dense retrieval only
                        "sparse" → BM25 only
                        "hybrid" → dense + BM25 + RRF (no reranker)
                        "full"   → complete pipeline with reranker
                      Any other value is treated like "full".

        Returns:
            dict — {query_id: [(doc_id, rank_score), ...]}, best score first.
            A query that raises during retrieval is reported on stdout and
            mapped to an empty list so the remaining queries still run.
        """
        results = {}
        total = len(queries)

        for position, (query_id, query_text) in enumerate(queries.items(), 1):
            # Light progress output: one line every 50 queries.
            if position % 50 == 0:
                print(f" Running query {position}/{total}...")

            try:
                ranked = self._rank(query_text, top_k, mode)
                results[query_id] = self._best_per_doc(ranked)
            except Exception as e:
                # Deliberate best-effort: one failing query must not abort
                # the whole evaluation run.
                print(f" Error on query {query_id}: {e}")
                results[query_id] = []

        return results
if __name__ == "__main__":
    # Manual smoke test: run the full pipeline over the SciFact queries and
    # print the top five results of the first query.
    from evaluation.dataset_loader import DatasetLoader

    loader = DatasetLoader("data/scifact")
    queries = loader.load_queries()

    runner = QueryRunner()
    results = runner.run(queries, top_k=10, mode="full")

    if not results:
        # Guard against an empty query set instead of failing with IndexError.
        print("\nNo queries were run; nothing to show.")
    else:
        sample_qid = next(iter(results))
        print(f"\nQuery {sample_qid} top results:")
        for doc_id, score in results[sample_qid][:5]:
            print(f" doc {doc_id} score={score:.4f}")