# File size: 5,847 Bytes
# bb04c5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# searcher/search_engine.py

import yaml
from searcher.query_understanding import QueryUnderstanding
from searcher.dense_retriever import DenseRetriever
from searcher.sparse_retriever import SparseRetriever
from searcher.fusion_ranker import FusionRanker
from searcher.reranker import Reranker
from searcher.facet_filter import FacetFilter
from searcher.highlighter import Highlighter


class SearchEngine:
    """Orchestrate the full search pipeline end-to-end.

    Stages, in the order ``search`` runs them::

        raw query
            -> QueryUnderstanding  (expand + rewrite)
            -> DenseRetriever      (semantic FAISS search)
            -> SparseRetriever     (BM25 lexical search)
            -> FusionRanker        (RRF merge)
            -> Reranker            (cross-encoder precision)
            -> FacetFilter         (type / date / size / directory)
            -> Highlighter         (preview + HTML highlights)
            -> final results
    """

    def __init__(self, config_path="config.yaml"):
        """Load the YAML config and build every pipeline stage.

        Args:
            config_path: Path to the YAML configuration file. The same path
                is passed on to the stages that accept one.

        Raises:
            OSError: If the configuration file cannot be opened.
        """
        self.config_path = config_path
        with open(config_path, encoding="utf-8") as f:
            # safe_load returns None for an empty document; fall back to an
            # empty dict so the .get() lookups below keep working.
            self.config = yaml.safe_load(f) or {}

        self.query_understanding = QueryUnderstanding(config_path)
        self.dense_retriever = DenseRetriever(config_path)
        self.sparse_retriever = SparseRetriever(config_path)
        self.fusion_ranker = FusionRanker(k=60)
        self.reranker = Reranker(config_path)
        self.facet_filter = FacetFilter()
        self.highlighter = Highlighter(preview_words=30)

        # Candidates pulled from each retriever / kept after fusion.
        self.candidate_k = self.config.get("candidate_k", 20)
        # Default number of results returned by search().
        self.final_k = self.config.get("top_k", 5)

    def search(
        self,
        query: str,
        top_k: int = None,
        file_type: list[str] = None,
        date_after=None,
        date_before=None,
        min_size: int = None,
        max_size: int = None,
        directory: str = None,
    ) -> dict:
        """Run the full search pipeline for one query.

        Args:
            query: Natural language user query.
            top_k: Number of final results. ``None`` (or 0) falls back to
                the ``top_k`` value from the config.
            file_type: Extensions to keep, e.g. ``[".pdf", ".docx"]``.
            date_after: datetime; exclude older files.
            date_before: datetime; exclude newer files.
            min_size: Minimum file size in bytes.
            max_size: Maximum file size in bytes.
            directory: Restrict results to this directory.

        Returns:
            A dict with two keys:

            ``query_info``
                The dict produced by QueryUnderstanding, guaranteed to hold
                ``original``, ``expanded`` and ``rewritten`` (each defaults
                to the raw query when the stage did not supply it).
            ``results``
                List of at most ``top_k`` result dicts. Every result has a
                non-missing ``preview``; the remaining fields (filepath,
                chunk_text, chunk_index, preview_html, dense_score,
                sparse_score, rrf_score, rerank_score) come from the
                pipeline stages and may be absent.
        """
        k = top_k or self.final_k

        # Step 1 - query understanding. Tolerate a stage that returns
        # nothing and fill in any variant it did not provide.
        query_info = self.query_understanding.process(query) or {}
        for key in ("original", "expanded", "rewritten"):
            query_info.setdefault(key, query)

        # Step 2 - dense retrieval (expanded query for better semantic reach)
        dense_results = self.dense_retriever.retrieve(
            query_info["expanded"], top_k=self.candidate_k
        )

        # Step 3 - sparse retrieval (rewritten query; expansion hurts BM25)
        sparse_results = self.sparse_retriever.retrieve(
            query_info["rewritten"], top_k=self.candidate_k
        )

        # Step 4 - RRF fusion
        fused = self.fusion_ranker.fuse(
            dense_results, sparse_results, top_k=self.candidate_k
        )

        # Step 5 - cross-encoder reranking. Twice the requested count is
        # kept, presumably to leave headroom for the facet filter below.
        reranked = self.reranker.rerank(
            query_info["original"], fused, top_k=k * 2
        )

        # Step 6 - facet filtering, then trim to the requested size
        filtered = self.facet_filter.filter(
            reranked,
            file_type=file_type,
            date_after=date_after,
            date_before=date_before,
            min_size=min_size,
            max_size=max_size,
            directory=directory,
        )
        final = filtered[:k]

        # Step 7 - highlight previews. Normalise a None/empty return to []
        # *before* iterating so the fallback loop cannot fail on None.
        final = self.highlighter.annotate(final, query_info["original"]) or []
        for result in final:
            if not result.get("preview"):
                # chunk_text may be missing or None; both yield "".
                result["preview"] = (result.get("chunk_text") or "")[:200]

        return {
            "query_info": query_info,
            "results": final,
        }

def _format_score(value, spec):
    """Format a score with ``spec``, or return "n/a" when it is missing."""
    return "n/a" if value is None else format(value, spec)


def main():
    """Run an interactive prompt that searches until the user exits.

    Typing ``exit`` (any case, surrounding whitespace ignored), sending EOF
    or pressing Ctrl-C ends the loop. Blank input is ignored rather than
    being sent through the search pipeline.
    """
    engine = SearchEngine()

    while True:
        try:
            # The magnifier emoji is written as an escape so the prompt
            # survives the file being re-encoded.
            query = input(
                "\n\U0001F50D Enter your search query (or type 'exit'): "
            ).strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C: leave cleanly instead of a traceback.
            print("\nExiting search engine...")
            break

        if not query:
            continue

        if query.lower() == "exit":
            print("Exiting search engine...")
            break

        output = engine.search(query, top_k=3)

        print(f"\nQuery     : {output['query_info']['original']}")
        print(f"Expanded  : {output['query_info']['expanded']}")
        print(f"Results   : {len(output['results'])}\n")

        for i, r in enumerate(output["results"], 1):
            print(f"--- Result {i} ---")
            print(f"File     : {r.get('filepath', 'n/a')}")
            print(f"Preview  : {r['preview']}")

            # Scores are optional on a result; print "n/a" when absent.
            print(f"RRF      : {_format_score(r.get('rrf_score'), '.5f')}")
            print(f"Rerank   : {_format_score(r.get('rerank_score'), '.4f')}")

            print()


if __name__ == "__main__":
    main()