# evaluation/query_runner.py

from searcher.search_engine import SearchEngine


class QueryRunner:
    """
    Runs all evaluation queries through your SearchEngine and collects
    the ranked result lists for scoring.

    The results are formatted exactly as the Evaluator expects:
        {query_id: [(doc_id, score), ...]}, ranked best-first.
    """

    def __init__(self, config_path: str = "config.yaml"):
        self.engine = SearchEngine(config_path)
    def _extract_doc_id(self, filepath: str) -> str:
        """
        Strip the dataset prefix from a fake filepath so it matches qrels doc_ids.

        Examples:
            "scifact://12345"   -> "12345"
            "nfcorpus://MED-10" -> "MED-10"
            "/real/file.pdf"    -> "/real/file.pdf"  (real files unchanged)

        This is critical: without stripping, doc_ids like "nfcorpus://MED-10"
        will never match qrels keys like "MED-10" and all scores will be 0.0.
        """
        if "://" in filepath:
            return filepath.split("://", 1)[1]
        return filepath
    def run(
        self,
        queries: dict,
        top_k: int = 100,
        mode: str = "full",
    ) -> dict:
        """
        Run all queries and return ranked results.

        Args:
            queries: {query_id: query_text}
            top_k:   number of results per query (use 100 for eval)
            mode:    pipeline variant to test:
                "dense"  -- dense retrieval only
                "sparse" -- BM25 only
                "hybrid" -- dense + BM25 + RRF (no reranker)
                "full"   -- complete pipeline with reranker

        Returns:
            dict: {query_id: [(doc_id, rank_score), ...]}
        """
        results = {}
        total = len(queries)

        for i, (query_id, query_text) in enumerate(queries.items(), 1):
            if i % 50 == 0:
                print(f"  Running query {i}/{total}...")

            try:
                if mode == "dense":
                    raw = self.engine.dense_retriever.retrieve(query_text, top_k=top_k)
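                    # NOTE: dense_score is assumed here to be a distance (smaller = better),
                    # so it is negated below to give every mode a higher-is-better score
                    # for the shared dedup/sort step.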
                    ranked = [
                        (self._extract_doc_id(r["filepath"]), -r["dense_score"])
                        for r in raw
                    ]
                elif mode == "sparse":
                    raw = self.engine.sparse_retriever.retrieve(query_text, top_k=top_k)
                    ranked = [
                        (self._extract_doc_id(r["filepath"]), r["sparse_score"])
                        for r in raw
                    ]
                elif mode == "hybrid":
                    dense_raw = self.engine.dense_retriever.retrieve(query_text, top_k=top_k)
                    sparse_raw = self.engine.sparse_retriever.retrieve(query_text, top_k=top_k)
                    fused = self.engine.fusion_ranker.fuse(dense_raw, sparse_raw, top_k=top_k)
                    ranked = [
                        (self._extract_doc_id(r["filepath"]), r["rrf_score"])
                        for r in fused
                    ]
                else:  # full pipeline
                    output = self.engine.search(query_text, top_k=top_k)
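                    # Prefer the reranker's score when present; fall back to the RRF
                    # score for results the reranker did not score, and finally to 0
                    # so unscored entries sort to the bottom.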
                    ranked = [
                        (
                            self._extract_doc_id(r["filepath"]),
                            r.get("rerank_score", r.get("rrf_score", 0)),
                        )
                        for r in output["results"]
                    ]

                # Deduplicate by doc_id:
                # multiple chunks from the same doc -> keep only the best score.
                seen = {}
                for doc_id, score in ranked:
                    if doc_id not in seen or score > seen[doc_id]:
                        seen[doc_id] = score

                results[query_id] = sorted(
                    seen.items(),
                    key=lambda x: x[1],
                    reverse=True,
                )
            except Exception as e:
                print(f"  Error on query {query_id}: {e}")
                results[query_id] = []

        return results


if __name__ == "__main__":
    from evaluation.dataset_loader import DatasetLoader

    loader = DatasetLoader("data/scifact")
    queries = loader.load_queries()

    runner = QueryRunner()
    results = runner.run(queries, top_k=10, mode="full")

    sample_qid = list(results.keys())[0]
    print(f"\nQuery {sample_qid} top results:")
    for doc_id, score in results[sample_qid][:5]:
        print(f"  doc {doc_id} score={score:.4f}")
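
    # Quick ablation sketch (an illustrative addition, not part of the original
    # script): rerun the same queries through every pipeline variant exposed by
    # QueryRunner.run() and report how many queries returned at least one result.
    for variant in ("dense", "sparse", "hybrid", "full"):
        variant_results = runner.run(queries, top_k=10, mode=variant)
        answered = sum(1 for ranked in variant_results.values() if ranked)
        print(f"{variant:>6}: {answered}/{len(queries)} queries returned results")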