""" BM25 sparse retrieval index for keyword-based search. BM25 (Best Match 25) is the gold standard keyword search algorithm. It powers Elasticsearch, Solr, and was the backbone of Google Search before neural methods. It rewards: - Term frequency: how often the query word appears in the chunk - Inverse document frequency: rare words are more discriminative - Document length normalization: prevents long chunks from dominating WHY WE NEED THIS ALONGSIDE VECTOR SEARCH: Query: "what is LoRA fine-tuning?" Vector search: finds chunks about "parameter-efficient training" (semantically related but may miss the exact acronym) BM25: finds chunks containing the EXACT token "LoRA" (exact match, regardless of semantic similarity) Hybrid: finds chunks that are BOTH semantically relevant AND contain the keyword - best of both worlds. """ from copyreg import pickle import json import pickle import re from pathlib import Path import numpy as np from rank_bm25 import BM25Okapi from src.utils.logger import get_logger from config.settings import CHUNKS_DIR, EMBEDDINGS_DIR logger = get_logger(__name__) # Where we persist the BM25 index BM25_INDEX_PATH = EMBEDDINGS_DIR / "bm25_index.pkl" def tokenize(text: str) -> list[str]: """ Simple tokenizer for BM25. BM25 works on token lists, not raw strings. We lowercase and split on non-alphanumeric characters. WHY NOT USE NLTK/SPACY: For BM25 in a RAG pipeline, simple whitespace+punctuation tokenization is sufficient and avoids heavy dependencies. The quality difference is minimal for retrieval tasks. """ text = text.lower() # Split on anything that't not a letter, number, or hyphen tokens = re.findall(r'[a-z0-9]+(?:-[a-z0-9]+)*', text) return tokens class BM25Store: """ Manages a BM25 index over all chunk texts. The index is built once and persisted to disk as a pickle file. Loading from pickle is near-instant vs rebuilding from scratch. """ def __init__(self): self.bm25: BM25Okapi = None self.chunk_ids: list[str] = [] self.texts: list[str] = [] def build_index(self) -> None: """ Build BM25 index from all chunk files. Loads all chunk texts, tokenizes them, and creates the BM25 index. This takes ~30 seconds for 15,664 chunks. """ logger.info("Building BM25 index from chunk files...") chunk_ids = [] texts = [] for cf in CHUNKS_DIR.glob("*_semantic.json"): with open(cf, "r", encoding = 'utf-8') as f: chunks = json.load(f) for chunk in chunks: chunk_ids.append(chunk["chunk_id"]) texts.append(chunk["text"]) logger.info(f"Tokenizing {len(texts):,} chunks...") # Tokenize all texts # bm250kapi expects a list of token lists tokenized_corpus = [tokenize(text) for text in texts] # Build the BM25 index # BM250kapi is the standard 0kapi BM25 variant self.bm25 = BM25Okapi(tokenized_corpus) self.chunk_ids = chunk_ids self.texts = texts logger.info(f"BM25 index built: {len(chunk_ids):,} documents") # Persist to disk self._save() def _save(self) -> None: """Save index to disk using pickle.""" data = { "bm25": self.bm25, "chunk_ids": self.chunk_ids, "texts": self.texts, } with open(BM25_INDEX_PATH, "wb") as f: pickle.dump(data, f) size_mb = BM25_INDEX_PATH.stat().st_size / 1024 / 1024 logger.info(f"BM25 index saved: {BM25_INDEX_PATH} ({size_mb:.1f} MB)") def load(self) -> bool: """ Look index from disk Return True if loaded, False if index doesn't exists """ if not BM25_INDEX_PATH.exists(): logger.info("No BM25 index found on disk") return False logger.info("Loading BM25 index from disk...") with open(BM25_INDEX_PATH, "rb") as f: data = pickle.load(f) self.bm25 = data["bm25"] self.chunk_ids = data["chunk_ids"] self.texts = data["texts"] logger.info(f"BM25 index loaded: {len(self.chunk_ids):,} documents") return True def search(self, query: str, top_k: int = 20) -> list[dict]: """ Search BM25 index with a text query. Args: query: Raw query string (NOT embedded - BM25 uses tokens) top_k: Number of top results to return Returns: List of dicts with chunk_id, bm25_score, text HOW BM25 SCORING WORKS: Given query tokens ["lora", "fine-tuning"], BM25 scores each document based on how frequently these tokens appear, weighted by their rarity across all documents (IDF) and normalized by document length. Higher score = better keyword match. """ if self.bm25 is None: raise RuntimeError("BM25 index not loaded. Call build_index() or load() first.") query_tokens = tokenize(query) if not query_tokens: return [] # get_scores returns array of shape (n_documents,) # with BM25 score for each document scores = self.bm25.get_scores(query_tokens) # Get indices of top-k scores (argsort ascending, take last k, reverse) top_indices = np.argsort(scores)[-top_k:][::-1] results = [] for idx in top_indices: score = float(scores[idx]) if score <= 0: # Skip zero-score results - no keywords overlap at all continue results.append( { "chunk_id": self.chunk_ids[idx], "bm25_score": round(score, 4), "text": self.texts[idx], } ) return results