Spaces:
Running
Running
File size: 6,112 Bytes
f780124 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 | """
BM25 sparse retrieval index for keyword-based search.
BM25 (Best Match 25) is the gold standard keyword search algorithm.
It powers Elasticsearch, Solr, and was the backbone of Google Search
before neural methods. It rewards:
- Term frequency: how often the query word appears in the chunk
- Inverse document frequency: rare words are more discriminative
- Document length normalization: prevents long chunks from dominating
WHY WE NEED THIS ALONGSIDE VECTOR SEARCH:
Query: "what is LoRA fine-tuning?"
Vector search: finds chunks about "parameter-efficient training"
(semantically related but may miss the exact acronym)
BM25: finds chunks containing the EXACT token "LoRA"
(exact match, regardless of semantic similarity)
Hybrid: finds chunks that are BOTH semantically relevant
AND contain the keyword - best of both worlds.
"""
from copyreg import pickle
import json
import pickle
import re
from pathlib import Path
import numpy as np
from rank_bm25 import BM25Okapi
from src.utils.logger import get_logger
from config.settings import CHUNKS_DIR, EMBEDDINGS_DIR
logger = get_logger(__name__)
# Where we persist the BM25 index
BM25_INDEX_PATH = EMBEDDINGS_DIR / "bm25_index.pkl"
def tokenize(text: str) -> list[str]:
"""
Simple tokenizer for BM25.
BM25 works on token lists, not raw strings.
We lowercase and split on non-alphanumeric characters.
WHY NOT USE NLTK/SPACY:
For BM25 in a RAG pipeline, simple whitespace+punctuation
tokenization is sufficient and avoids heavy dependencies.
The quality difference is minimal for retrieval tasks.
"""
text = text.lower()
# Split on anything that't not a letter, number, or hyphen
tokens = re.findall(r'[a-z0-9]+(?:-[a-z0-9]+)*', text)
return tokens
class BM25Store:
"""
Manages a BM25 index over all chunk texts.
The index is built once and persisted to disk as a pickle file.
Loading from pickle is near-instant vs rebuilding from scratch.
"""
def __init__(self):
self.bm25: BM25Okapi = None
self.chunk_ids: list[str] = []
self.texts: list[str] = []
def build_index(self) -> None:
"""
Build BM25 index from all chunk files.
Loads all chunk texts, tokenizes them, and creates the BM25 index.
This takes ~30 seconds for 15,664 chunks.
"""
logger.info("Building BM25 index from chunk files...")
chunk_ids = []
texts = []
for cf in CHUNKS_DIR.glob("*_semantic.json"):
with open(cf, "r", encoding = 'utf-8') as f:
chunks = json.load(f)
for chunk in chunks:
chunk_ids.append(chunk["chunk_id"])
texts.append(chunk["text"])
logger.info(f"Tokenizing {len(texts):,} chunks...")
# Tokenize all texts
# bm250kapi expects a list of token lists
tokenized_corpus = [tokenize(text) for text in texts]
# Build the BM25 index
# BM250kapi is the standard 0kapi BM25 variant
self.bm25 = BM25Okapi(tokenized_corpus)
self.chunk_ids = chunk_ids
self.texts = texts
logger.info(f"BM25 index built: {len(chunk_ids):,} documents")
# Persist to disk
self._save()
def _save(self) -> None:
"""Save index to disk using pickle."""
data = {
"bm25": self.bm25,
"chunk_ids": self.chunk_ids,
"texts": self.texts,
}
with open(BM25_INDEX_PATH, "wb") as f:
pickle.dump(data, f)
size_mb = BM25_INDEX_PATH.stat().st_size / 1024 / 1024
logger.info(f"BM25 index saved: {BM25_INDEX_PATH} ({size_mb:.1f} MB)")
def load(self) -> bool:
"""
Look index from disk
Return True if loaded, False if index doesn't exists
"""
if not BM25_INDEX_PATH.exists():
logger.info("No BM25 index found on disk")
return False
logger.info("Loading BM25 index from disk...")
with open(BM25_INDEX_PATH, "rb") as f:
data = pickle.load(f)
self.bm25 = data["bm25"]
self.chunk_ids = data["chunk_ids"]
self.texts = data["texts"]
logger.info(f"BM25 index loaded: {len(self.chunk_ids):,} documents")
return True
def search(self, query: str, top_k: int = 20) -> list[dict]:
"""
Search BM25 index with a text query.
Args:
query: Raw query string (NOT embedded - BM25 uses tokens)
top_k: Number of top results to return
Returns:
List of dicts with chunk_id, bm25_score, text
HOW BM25 SCORING WORKS:
Given query tokens ["lora", "fine-tuning"],
BM25 scores each document based on how frequently
these tokens appear, weighted by their rarity across
all documents (IDF) and normalized by document length.
Higher score = better keyword match.
"""
if self.bm25 is None:
raise RuntimeError("BM25 index not loaded. Call build_index() or load() first.")
query_tokens = tokenize(query)
if not query_tokens:
return []
# get_scores returns array of shape (n_documents,)
# with BM25 score for each document
scores = self.bm25.get_scores(query_tokens)
# Get indices of top-k scores (argsort ascending, take last k, reverse)
top_indices = np.argsort(scores)[-top_k:][::-1]
results = []
for idx in top_indices:
score = float(scores[idx])
if score <= 0:
# Skip zero-score results - no keywords overlap at all
continue
results.append(
{
"chunk_id": self.chunk_ids[idx],
"bm25_score": round(score, 4),
"text": self.texts[idx],
}
)
return results |