Spaces:

Subhadip007
/

researchpilot-api

Running

File size: 6,112 Bytes

f780124

"""
BM25 sparse retrieval index for keyword-based search.

BM25 (Best Match 25) is the gold standard keyword search algorithm.
It powers Elasticsearch, Solr, and was the backbone of Google Search
before neural methods. It rewards:
    - Term frequency: how often the query word appears in the chunk
    - Inverse document frequency: rare words are more discriminative
    - Document length normalization: prevents long chunks from dominating

WHY WE NEED THIS ALONGSIDE VECTOR SEARCH:
    Query: "what is LoRA fine-tuning?"

    Vector search: finds chunks about "parameter-efficient training"
    (semantically related but may miss the exact acronym)

    BM25: finds chunks containing the EXACT token "LoRA"
    (exact match, regardless of semantic similarity)

    Hybrid: finds chunks that are BOTH semantically relevant
    AND contain the keyword - best of both worlds.
"""

from copyreg import pickle
import json
import pickle
import re
from pathlib import Path

import numpy as np
from rank_bm25 import BM25Okapi

from src.utils.logger import get_logger
from config.settings import CHUNKS_DIR, EMBEDDINGS_DIR

logger = get_logger(__name__)


# Where we persist the BM25 index
BM25_INDEX_PATH = EMBEDDINGS_DIR / "bm25_index.pkl"



def tokenize(text: str) -> list[str]:
    """
    Simple tokenizer for BM25.

    BM25 works on token lists, not raw strings.
    We lowercase and split on non-alphanumeric characters.

    WHY NOT USE NLTK/SPACY:
        For BM25 in a RAG pipeline, simple whitespace+punctuation
        tokenization is sufficient and avoids heavy dependencies.
        The quality difference is minimal for retrieval tasks.
    """
    text = text.lower()

    # Split on anything that't not a letter, number, or hyphen
    tokens = re.findall(r'[a-z0-9]+(?:-[a-z0-9]+)*', text)
    return tokens



class BM25Store:
    """
    Manages a BM25 index over all chunk texts.

    The index is built once and persisted to disk as a pickle file.
    Loading from pickle is near-instant vs rebuilding from scratch.
    """

    def __init__(self):
        self.bm25:          BM25Okapi = None
        self.chunk_ids:     list[str] = []
        self.texts:         list[str] = []


    def build_index(self) -> None:
        """
        Build BM25 index from all chunk files.

        Loads all chunk texts, tokenizes them, and creates the BM25 index.
        This takes ~30 seconds for 15,664 chunks.
        """
        logger.info("Building BM25 index from chunk files...")

        chunk_ids = []
        texts     = []

        for cf in CHUNKS_DIR.glob("*_semantic.json"):
            with open(cf, "r", encoding = 'utf-8') as f:
                chunks = json.load(f)
            
            for chunk in chunks:
                chunk_ids.append(chunk["chunk_id"])
                texts.append(chunk["text"])

        logger.info(f"Tokenizing {len(texts):,} chunks...")

        # Tokenize all texts
        # bm250kapi expects a list of token lists
        tokenized_corpus = [tokenize(text) for text in texts]

        # Build the BM25 index
        # BM250kapi is the standard 0kapi BM25 variant
        self.bm25       = BM25Okapi(tokenized_corpus)
        self.chunk_ids  = chunk_ids
        self.texts      = texts

        logger.info(f"BM25 index built: {len(chunk_ids):,} documents")

        # Persist to disk
        self._save()


    
    def _save(self) -> None:
        """Save index to disk using pickle."""
        data = {
            "bm25":      self.bm25,
            "chunk_ids": self.chunk_ids,
            "texts":     self.texts,
        }

        with open(BM25_INDEX_PATH, "wb") as f:
            pickle.dump(data, f)
        size_mb = BM25_INDEX_PATH.stat().st_size / 1024 / 1024
        logger.info(f"BM25 index saved: {BM25_INDEX_PATH} ({size_mb:.1f} MB)")


    
    def load(self) -> bool:
        """
        Look index from disk
        Return True if loaded, False if index doesn't exists
        """
        if not BM25_INDEX_PATH.exists():
            logger.info("No BM25 index found on disk")
            return False

        logger.info("Loading BM25 index from disk...")
        with open(BM25_INDEX_PATH, "rb") as f:
            data = pickle.load(f)

        self.bm25       = data["bm25"]
        self.chunk_ids  = data["chunk_ids"]
        self.texts      = data["texts"]

        logger.info(f"BM25 index loaded: {len(self.chunk_ids):,} documents")
        return True


    def search(self, query: str, top_k: int = 20) -> list[dict]:
        """
        Search BM25 index with a text query.

        Args:
            query: Raw query string (NOT embedded - BM25 uses tokens)
            top_k: Number of top results to return

        Returns:
            List of dicts with chunk_id, bm25_score, text

        HOW BM25 SCORING WORKS:
            Given query tokens ["lora", "fine-tuning"],
            BM25 scores each document based on how frequently
            these tokens appear, weighted by their rarity across
            all documents (IDF) and normalized by document length.
            Higher score = better keyword match.
        """
        if self.bm25 is None:
            raise RuntimeError("BM25 index not loaded. Call build_index() or load() first.")

        query_tokens = tokenize(query)

        if not query_tokens:
            return []


        # get_scores returns array of shape (n_documents,)
        # with BM25 score for each document
        scores = self.bm25.get_scores(query_tokens)


        # Get indices of top-k scores (argsort ascending, take last k, reverse)
        top_indices = np.argsort(scores)[-top_k:][::-1]


        results = []
        for idx in top_indices:
            score = float(scores[idx])
            if score <= 0:
                # Skip zero-score results - no keywords overlap at all
                continue
            results.append(
                {
                    "chunk_id":     self.chunk_ids[idx],
                    "bm25_score":   round(score, 4),
                    "text":         self.texts[idx],
                }
            )

        return results