# evaluation/query_runner.py

from searcher.search_engine import SearchEngine


class QueryRunner:
    """

    Runs all evaluation queries through your SearchEngine and collects

    the ranked result lists for scoring.



    The results are formatted exactly as the Evaluator expects:

        {query_id: [(doc_id, score), ...]}   ranked best-first

    """

    def __init__(self, config_path: str = "config.yaml"):
        self.engine = SearchEngine(config_path)

    def _extract_doc_id(self, filepath: str) -> str:
        """

        Strip dataset prefix from fake filepath so it matches qrels doc_ids.



        Examples:

            "scifact://12345"    β†’  "12345"

            "nfcorpus://MED-10"  β†’  "MED-10"

            "/real/file.pdf"     β†’  "/real/file.pdf"  (real files unchanged)



        This is critical β€” without stripping, doc_ids like "nfcorpus://MED-10"

        will never match qrels keys like "MED-10" and all scores will be 0.0

        """
        if "://" in filepath:
            return filepath.split("://", 1)[1]
        return filepath

    def run(
        self,
        queries: dict,
        top_k: int = 100,
        mode: str = "full",
    ) -> dict:
        """

        Run all queries and return ranked results.



        Args:

            queries β€” {query_id: query_text}

            top_k   β€” number of results per query (use 100 for eval)

            mode    β€” pipeline variant to test:

                        "dense"   β†’ dense retrieval only

                        "sparse"  β†’ BM25 only

                        "hybrid"  β†’ dense + BM25 + RRF (no reranker)

                        "full"    β†’ complete pipeline with reranker



        Returns:

            dict β€” {query_id: [(doc_id, rank_score), ...]}

        """
        results = {}
        total   = len(queries)

        for i, (query_id, query_text) in enumerate(queries.items(), 1):
            if i % 50 == 0:
                print(f"  Running query {i}/{total}...")

            try:
                if mode == "dense":
                    raw    = self.engine.dense_retriever.retrieve(query_text, top_k=top_k)
                    # dense_score is treated as a distance here (lower is better),
                    # so negate it to get a higher-is-better ranking score
                    ranked = [
                        (self._extract_doc_id(r["filepath"]), -r["dense_score"])
                        for r in raw
                    ]

                elif mode == "sparse":
                    raw    = self.engine.sparse_retriever.retrieve(query_text, top_k=top_k)
                    ranked = [
                        (self._extract_doc_id(r["filepath"]), r["sparse_score"])
                        for r in raw
                    ]

                elif mode == "hybrid":
                    dense_raw  = self.engine.dense_retriever.retrieve(query_text, top_k=top_k)
                    sparse_raw = self.engine.sparse_retriever.retrieve(query_text, top_k=top_k)
                    fused      = self.engine.fusion_ranker.fuse(dense_raw, sparse_raw, top_k=top_k)
                    ranked     = [
                        (self._extract_doc_id(r["filepath"]), r["rrf_score"])
                        for r in fused
                    ]

                else:  # full pipeline
                    output = self.engine.search(query_text, top_k=top_k)
                    # prefer the reranker's score; fall back to the RRF score
                    # if a result was never reranked
                    ranked = [
                        (
                            self._extract_doc_id(r["filepath"]),
                            r.get("rerank_score", r.get("rrf_score", 0))
                        )
                        for r in output["results"]
                    ]

                # Deduplicate by doc_id:
                # multiple chunks from the same doc → keep only the best score
                seen = {}
                for doc_id, score in ranked:
                    if doc_id not in seen or score > seen[doc_id]:
                        seen[doc_id] = score

                results[query_id] = sorted(
                    seen.items(),
                    key=lambda x: x[1],
                    reverse=True
                )

            except Exception as e:
                print(f"  Error on query {query_id}: {e}")
                results[query_id] = []

        return results


if __name__ == "__main__":
    from evaluation.dataset_loader import DatasetLoader

    loader  = DatasetLoader("data/scifact")
    queries = loader.load_queries()

    runner  = QueryRunner()
    results = runner.run(queries, top_k=10, mode="full")

    sample_qid = list(results.keys())[0]
    print(f"\nQuery {sample_qid} top results:")
    for doc_id, score in results[sample_qid][:5]:
        print(f"  doc {doc_id}  score={score:.4f}")