Spaces:

Subhadip007
/

researchpilot-api

Running

App Files Files Community

researchpilot-api / src /vectorstore /bm25_store.py

Subhadip007

feat: retrieval optimization pipeline complete

f780124 2 months ago

raw

history blame contribute delete

6.11 kB

	"""
	BM25 sparse retrieval index for keyword-based search.

	BM25 (Best Match 25) is the gold standard keyword search algorithm.
	It powers Elasticsearch, Solr, and was the backbone of Google Search
	before neural methods. It rewards:
	- Term frequency: how often the query word appears in the chunk
	- Inverse document frequency: rare words are more discriminative
	- Document length normalization: prevents long chunks from dominating

	WHY WE NEED THIS ALONGSIDE VECTOR SEARCH:
	Query: "what is LoRA fine-tuning?"

	Vector search: finds chunks about "parameter-efficient training"
	(semantically related but may miss the exact acronym)

	BM25: finds chunks containing the EXACT token "LoRA"
	(exact match, regardless of semantic similarity)

	Hybrid: finds chunks that are BOTH semantically relevant
	AND contain the keyword - best of both worlds.
	"""

	from copyreg import pickle
	import json
	import pickle
	import re
	from pathlib import Path

	import numpy as np
	from rank_bm25 import BM25Okapi

	from src.utils.logger import get_logger
	from config.settings import CHUNKS_DIR, EMBEDDINGS_DIR

	logger = get_logger(__name__)


	# Where we persist the BM25 index
	BM25_INDEX_PATH = EMBEDDINGS_DIR / "bm25_index.pkl"



	def tokenize(text: str) -> list[str]:
	"""
	Simple tokenizer for BM25.

	BM25 works on token lists, not raw strings.
	We lowercase and split on non-alphanumeric characters.

	WHY NOT USE NLTK/SPACY:
	For BM25 in a RAG pipeline, simple whitespace+punctuation
	tokenization is sufficient and avoids heavy dependencies.
	The quality difference is minimal for retrieval tasks.
	"""
	text = text.lower()

	# Split on anything that't not a letter, number, or hyphen
	tokens = re.findall(r'[a-z0-9]+(?:-[a-z0-9]+)*', text)
	return tokens



	class BM25Store:
	"""
	Manages a BM25 index over all chunk texts.

	The index is built once and persisted to disk as a pickle file.
	Loading from pickle is near-instant vs rebuilding from scratch.
	"""

	def __init__(self):
	self.bm25: BM25Okapi = None
	self.chunk_ids: list[str] = []
	self.texts: list[str] = []


	def build_index(self) -> None:
	"""
	Build BM25 index from all chunk files.

	Loads all chunk texts, tokenizes them, and creates the BM25 index.
	This takes ~30 seconds for 15,664 chunks.
	"""
	logger.info("Building BM25 index from chunk files...")

	chunk_ids = []
	texts = []

	for cf in CHUNKS_DIR.glob("*_semantic.json"):
	with open(cf, "r", encoding = 'utf-8') as f:
	chunks = json.load(f)

	for chunk in chunks:
	chunk_ids.append(chunk["chunk_id"])
	texts.append(chunk["text"])

	logger.info(f"Tokenizing {len(texts):,} chunks...")

	# Tokenize all texts
	# bm250kapi expects a list of token lists
	tokenized_corpus = [tokenize(text) for text in texts]

	# Build the BM25 index
	# BM250kapi is the standard 0kapi BM25 variant
	self.bm25 = BM25Okapi(tokenized_corpus)
	self.chunk_ids = chunk_ids
	self.texts = texts

	logger.info(f"BM25 index built: {len(chunk_ids):,} documents")

	# Persist to disk
	self._save()



	def _save(self) -> None:
	"""Save index to disk using pickle."""
	data = {
	"bm25": self.bm25,
	"chunk_ids": self.chunk_ids,
	"texts": self.texts,
	}

	with open(BM25_INDEX_PATH, "wb") as f:
	pickle.dump(data, f)
	size_mb = BM25_INDEX_PATH.stat().st_size / 1024 / 1024
	logger.info(f"BM25 index saved: {BM25_INDEX_PATH} ({size_mb:.1f} MB)")



	def load(self) -> bool:
	"""
	Look index from disk
	Return True if loaded, False if index doesn't exists
	"""
	if not BM25_INDEX_PATH.exists():
	logger.info("No BM25 index found on disk")
	return False

	logger.info("Loading BM25 index from disk...")
	with open(BM25_INDEX_PATH, "rb") as f:
	data = pickle.load(f)

	self.bm25 = data["bm25"]
	self.chunk_ids = data["chunk_ids"]
	self.texts = data["texts"]

	logger.info(f"BM25 index loaded: {len(self.chunk_ids):,} documents")
	return True


	def search(self, query: str, top_k: int = 20) -> list[dict]:
	"""
	Search BM25 index with a text query.

	Args:
	query: Raw query string (NOT embedded - BM25 uses tokens)
	top_k: Number of top results to return

	Returns:
	List of dicts with chunk_id, bm25_score, text

	HOW BM25 SCORING WORKS:
	Given query tokens ["lora", "fine-tuning"],
	BM25 scores each document based on how frequently
	these tokens appear, weighted by their rarity across
	all documents (IDF) and normalized by document length.
	Higher score = better keyword match.
	"""
	if self.bm25 is None:
	raise RuntimeError("BM25 index not loaded. Call build_index() or load() first.")

	query_tokens = tokenize(query)

	if not query_tokens:
	return []


	# get_scores returns array of shape (n_documents,)
	# with BM25 score for each document
	scores = self.bm25.get_scores(query_tokens)


	# Get indices of top-k scores (argsort ascending, take last k, reverse)
	top_indices = np.argsort(scores)[-top_k:][::-1]


	results = []
	for idx in top_indices:
	score = float(scores[idx])
	if score <= 0:
	# Skip zero-score results - no keywords overlap at all
	continue
	results.append(
	{
	"chunk_id": self.chunk_ids[idx],
	"bm25_score": round(score, 4),
	"text": self.texts[idx],
	}
	)

	return results