Spaces:
Sleeping
Sleeping
| # evaluation/query_runner.py | |
| from searcher.search_engine import SearchEngine | |
class QueryRunner:
    """
    Runs all evaluation queries through your SearchEngine and collects
    the ranked result lists for scoring.

    The results are formatted exactly as the Evaluator expects:
        {query_id: [(doc_id, score), ...]}  ranked best-first
    """

    # Pipeline variants accepted by run(). An unknown mode used to fall
    # silently into the full pipeline; now it raises ValueError so a
    # typo'd ablation run cannot masquerade as "full" results.
    _MODES = ("dense", "sparse", "hybrid", "full")

    def __init__(self, config_path: str = "config.yaml"):
        # SearchEngine owns the retrievers, fusion ranker, and reranker
        # used by _rank_one below.
        self.engine = SearchEngine(config_path)

    def _extract_doc_id(self, filepath: str) -> str:
        """
        Strip dataset prefix from fake filepath so it matches qrels doc_ids.

        Examples:
            "scifact://12345"   -> "12345"
            "nfcorpus://MED-10" -> "MED-10"
            "/real/file.pdf"    -> "/real/file.pdf" (real files unchanged)

        This is critical -- without stripping, doc_ids like "nfcorpus://MED-10"
        will never match qrels keys like "MED-10" and all scores will be 0.0.
        """
        if "://" in filepath:
            return filepath.split("://", 1)[1]
        return filepath

    def _rank_one(self, query_text: str, top_k: int, mode: str) -> list:
        """
        Run a single query under the given pipeline variant.

        Returns a (possibly chunk-duplicated) list of (doc_id, score)
        tuples where higher score means better; run() dedups and sorts.
        """
        if mode == "dense":
            raw = self.engine.dense_retriever.retrieve(query_text, top_k=top_k)
            # Negated so "higher = better" holds uniformly for the final
            # sort; assumes dense_score is a distance (lower = better) --
            # TODO confirm against DenseRetriever.
            return [
                (self._extract_doc_id(r["filepath"]), -r["dense_score"])
                for r in raw
            ]
        if mode == "sparse":
            raw = self.engine.sparse_retriever.retrieve(query_text, top_k=top_k)
            return [
                (self._extract_doc_id(r["filepath"]), r["sparse_score"])
                for r in raw
            ]
        if mode == "hybrid":
            dense_raw = self.engine.dense_retriever.retrieve(query_text, top_k=top_k)
            sparse_raw = self.engine.sparse_retriever.retrieve(query_text, top_k=top_k)
            fused = self.engine.fusion_ranker.fuse(dense_raw, sparse_raw, top_k=top_k)
            return [
                (self._extract_doc_id(r["filepath"]), r["rrf_score"])
                for r in fused
            ]
        # mode == "full": complete pipeline with reranker. Fall back to
        # rrf_score (then 0) when a result carries no rerank_score.
        output = self.engine.search(query_text, top_k=top_k)
        return [
            (
                self._extract_doc_id(r["filepath"]),
                r.get("rerank_score", r.get("rrf_score", 0)),
            )
            for r in output["results"]
        ]

    def run(
        self,
        queries: dict,
        top_k: int = 100,
        mode: str = "full",
    ) -> dict:
        """
        Run all queries and return ranked results.

        Args:
            queries -- {query_id: query_text}
            top_k   -- number of results per query (use 100 for eval)
            mode    -- pipeline variant to test:
                       "dense"  -- dense retrieval only
                       "sparse" -- BM25 only
                       "hybrid" -- dense + BM25 + RRF (no reranker)
                       "full"   -- complete pipeline with reranker

        Returns:
            dict -- {query_id: [(doc_id, rank_score), ...]} ranked best-first

        Raises:
            ValueError -- if mode is not one of the supported variants.
        """
        if mode not in self._MODES:
            raise ValueError(f"unknown mode {mode!r}; expected one of {self._MODES}")
        results = {}
        total = len(queries)
        for i, (query_id, query_text) in enumerate(queries.items(), 1):
            if i % 50 == 0:
                print(f" Running query {i}/{total}...")
            try:
                ranked = self._rank_one(query_text, top_k, mode)
            except Exception as e:
                # Best-effort: one failing query should not abort the whole
                # evaluation; score it as an empty result list and move on.
                print(f" Error on query {query_id}: {e}")
                results[query_id] = []
                continue
            # Deduplicate by doc_id -- multiple chunks from the same
            # document may appear; keep only the best (highest) score.
            best = {}
            for doc_id, score in ranked:
                if doc_id not in best or score > best[doc_id]:
                    best[doc_id] = score
            results[query_id] = sorted(
                best.items(),
                key=lambda item: item[1],
                reverse=True,
            )
        return results
| if __name__ == "__main__": | |
| from evaluation.dataset_loader import DatasetLoader | |
| loader = DatasetLoader("data/scifact") | |
| queries = loader.load_queries() | |
| runner = QueryRunner() | |
| results = runner.run(queries, top_k=10, mode="full") | |
| sample_qid = list(results.keys())[0] | |
| print(f"\nQuery {sample_qid} top results:") | |
| for doc_id, score in results[sample_qid][:5]: | |
| print(f" doc {doc_id} score={score:.4f}") |