Spaces:

crackbit
/

ai-learning-path-generator

Sleeping

File size: 5,238 Bytes

7644eac

"""
BM25 Retriever for keyword-based document search.

BM25 (Best Matching 25) is a probabilistic ranking function used for keyword-based
document retrieval. It's particularly effective for exact keyword matches that
semantic search might miss.
"""
from typing import List, Dict, Any, Optional
import numpy as np
from rank_bm25 import BM25Okapi
from langchain.schema import Document


class BM25Retriever:
    """
    BM25-based keyword retriever for hybrid search.
    
    BM25 uses term frequency (TF) and inverse document frequency (IDF) to rank
    documents based on keyword relevance.
    """
    
    def __init__(self, k1: float = 1.5, b: float = 0.75):
        """
        Initialize BM25 retriever.
        
        Args:
            k1: Term frequency saturation parameter (default: 1.5)
                Higher values give more weight to term frequency
            b: Length normalization parameter (default: 0.75)
                0 = no normalization, 1 = full normalization
        """
        self.k1 = k1
        self.b = b
        self.bm25 = None
        self.documents = []
        self.tokenized_corpus = []
        
    def index_documents(self, documents: List[Document]) -> None:
        """
        Index documents for BM25 search.
        
        Args:
            documents: List of Document objects to index
        """
        if not documents:
            return
            
        self.documents = documents
        
        # Tokenize documents (simple whitespace tokenization)
        self.tokenized_corpus = [
            doc.page_content.lower().split() 
            for doc in documents
        ]
        
        # Create BM25 index
        self.bm25 = BM25Okapi(self.tokenized_corpus, k1=self.k1, b=self.b)
        
        print(f"✅ BM25 index created with {len(documents)} documents")
    
    def search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """
        Search documents using BM25 keyword matching.
        
        Args:
            query: Search query
            top_k: Number of top results to return
            
        Returns:
            List of dictionaries with 'document' and 'score' keys
        """
        if not self.bm25 or not self.documents:
            return []
        
        # Tokenize query
        tokenized_query = query.lower().split()
        
        # Get BM25 scores
        scores = self.bm25.get_scores(tokenized_query)
        
        # Get top-k indices
        top_indices = np.argsort(scores)[::-1][:top_k]
        
        # Build results
        results = []
        for idx in top_indices:
            if scores[idx] > 0:  # Only include documents with non-zero scores
                results.append({
                    'document': self.documents[idx],
                    'score': float(scores[idx]),
                    'rank': len(results) + 1
                })
        
        return results
    
    def get_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the indexed corpus.
        
        Returns:
            Dictionary with corpus statistics
        """
        if not self.bm25:
            return {"indexed": False}
        
        return {
            "indexed": True,
            "document_count": len(self.documents),
            "avg_doc_length": np.mean([len(doc) for doc in self.tokenized_corpus]),
            "k1": self.k1,
            "b": self.b
        }


def reciprocal_rank_fusion(
    results_list: List[List[Dict[str, Any]]], 
    k: int = 60
) -> List[Dict[str, Any]]:
    """
    Combine multiple ranked lists using Reciprocal Rank Fusion (RRF).
    
    RRF is a simple but effective method for combining rankings from different
    retrieval systems. It gives higher scores to documents that appear in
    multiple result lists and/or appear higher in those lists.
    
    Formula: RRF_score(d) = Σ(1 / (k + rank(d)))
    
    Args:
        results_list: List of result lists from different retrievers
        k: Constant to prevent division by zero (default: 60)
        
    Returns:
        Fused and re-ranked results
    """
    # Track scores for each unique document
    doc_scores = {}
    doc_objects = {}
    
    for results in results_list:
        for result in results:
            doc = result['document']
            rank = result.get('rank', result.get('score', 1))
            
            # Use document content as key (hash for uniqueness)
            doc_key = hash(doc.page_content)
            
            # Calculate RRF score
            rrf_score = 1.0 / (k + rank)
            
            # Accumulate scores
            if doc_key in doc_scores:
                doc_scores[doc_key] += rrf_score
            else:
                doc_scores[doc_key] = rrf_score
                doc_objects[doc_key] = doc
    
    # Sort by RRF score
    sorted_docs = sorted(
        doc_scores.items(), 
        key=lambda x: x[1], 
        reverse=True
    )
    
    # Build final results
    fused_results = []
    for doc_key, score in sorted_docs:
        fused_results.append({
            'document': doc_objects[doc_key],
            'score': score,
            'rank': len(fused_results) + 1
        })
    
    return fused_results