File size: 6,112 Bytes
f780124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
"""
BM25 sparse retrieval index for keyword-based search.

BM25 (Best Match 25) is the gold standard keyword search algorithm.
It powers Elasticsearch, Solr, and was the backbone of Google Search
before neural methods. It rewards:
    - Term frequency: how often the query word appears in the chunk
    - Inverse document frequency: rare words are more discriminative
    - Document length normalization: prevents long chunks from dominating

WHY WE NEED THIS ALONGSIDE VECTOR SEARCH:
    Query: "what is LoRA fine-tuning?"

    Vector search: finds chunks about "parameter-efficient training"
    (semantically related but may miss the exact acronym)

    BM25: finds chunks containing the EXACT token "LoRA"
    (exact match, regardless of semantic similarity)

    Hybrid: finds chunks that are BOTH semantically relevant
    AND contain the keyword - best of both worlds.
"""

from copyreg import pickle
import json
import pickle
import re
from pathlib import Path

import numpy as np
from rank_bm25 import BM25Okapi

from src.utils.logger import get_logger
from config.settings import CHUNKS_DIR, EMBEDDINGS_DIR

logger = get_logger(__name__)


# Where we persist the BM25 index
BM25_INDEX_PATH = EMBEDDINGS_DIR / "bm25_index.pkl"



def tokenize(text: str) -> list[str]:
    """
    Simple tokenizer for BM25.

    BM25 works on token lists, not raw strings.
    We lowercase and split on non-alphanumeric characters.

    WHY NOT USE NLTK/SPACY:
        For BM25 in a RAG pipeline, simple whitespace+punctuation
        tokenization is sufficient and avoids heavy dependencies.
        The quality difference is minimal for retrieval tasks.
    """
    text = text.lower()

    # Split on anything that't not a letter, number, or hyphen
    tokens = re.findall(r'[a-z0-9]+(?:-[a-z0-9]+)*', text)
    return tokens



class BM25Store:
    """
    Manages a BM25 index over all chunk texts.

    The index is built once and persisted to disk as a pickle file.
    Loading from pickle is near-instant vs rebuilding from scratch.
    """

    def __init__(self):
        self.bm25:          BM25Okapi = None
        self.chunk_ids:     list[str] = []
        self.texts:         list[str] = []


    def build_index(self) -> None:
        """
        Build BM25 index from all chunk files.

        Loads all chunk texts, tokenizes them, and creates the BM25 index.
        This takes ~30 seconds for 15,664 chunks.
        """
        logger.info("Building BM25 index from chunk files...")

        chunk_ids = []
        texts     = []

        for cf in CHUNKS_DIR.glob("*_semantic.json"):
            with open(cf, "r", encoding = 'utf-8') as f:
                chunks = json.load(f)
            
            for chunk in chunks:
                chunk_ids.append(chunk["chunk_id"])
                texts.append(chunk["text"])

        logger.info(f"Tokenizing {len(texts):,} chunks...")

        # Tokenize all texts
        # bm250kapi expects a list of token lists
        tokenized_corpus = [tokenize(text) for text in texts]

        # Build the BM25 index
        # BM250kapi is the standard 0kapi BM25 variant
        self.bm25       = BM25Okapi(tokenized_corpus)
        self.chunk_ids  = chunk_ids
        self.texts      = texts

        logger.info(f"BM25 index built: {len(chunk_ids):,} documents")

        # Persist to disk
        self._save()


    
    def _save(self) -> None:
        """Save index to disk using pickle."""
        data = {
            "bm25":      self.bm25,
            "chunk_ids": self.chunk_ids,
            "texts":     self.texts,
        }

        with open(BM25_INDEX_PATH, "wb") as f:
            pickle.dump(data, f)
        size_mb = BM25_INDEX_PATH.stat().st_size / 1024 / 1024
        logger.info(f"BM25 index saved: {BM25_INDEX_PATH} ({size_mb:.1f} MB)")


    
    def load(self) -> bool:
        """
        Look index from disk
        Return True if loaded, False if index doesn't exists
        """
        if not BM25_INDEX_PATH.exists():
            logger.info("No BM25 index found on disk")
            return False

        logger.info("Loading BM25 index from disk...")
        with open(BM25_INDEX_PATH, "rb") as f:
            data = pickle.load(f)

        self.bm25       = data["bm25"]
        self.chunk_ids  = data["chunk_ids"]
        self.texts      = data["texts"]

        logger.info(f"BM25 index loaded: {len(self.chunk_ids):,} documents")
        return True


    def search(self, query: str, top_k: int = 20) -> list[dict]:
        """
        Search BM25 index with a text query.

        Args:
            query: Raw query string (NOT embedded - BM25 uses tokens)
            top_k: Number of top results to return

        Returns:
            List of dicts with chunk_id, bm25_score, text

        HOW BM25 SCORING WORKS:
            Given query tokens ["lora", "fine-tuning"],
            BM25 scores each document based on how frequently
            these tokens appear, weighted by their rarity across
            all documents (IDF) and normalized by document length.
            Higher score = better keyword match.
        """
        if self.bm25 is None:
            raise RuntimeError("BM25 index not loaded. Call build_index() or load() first.")

        query_tokens = tokenize(query)

        if not query_tokens:
            return []


        # get_scores returns array of shape (n_documents,)
        # with BM25 score for each document
        scores = self.bm25.get_scores(query_tokens)


        # Get indices of top-k scores (argsort ascending, take last k, reverse)
        top_indices = np.argsort(scores)[-top_k:][::-1]


        results = []
        for idx in top_indices:
            score = float(scores[idx])
            if score <= 0:
                # Skip zero-score results - no keywords overlap at all
                continue
            results.append(
                {
                    "chunk_id":     self.chunk_ids[idx],
                    "bm25_score":   round(score, 4),
                    "text":         self.texts[idx],
                }
            )

        return results