File size: 3,533 Bytes
068aa4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
"""
Hybrid Search: Combines Vector Search + BM25 Sparse Retrieval
"""
from rank_bm25 import BM25Okapi
import numpy as np

class HybridSearch:
    """BM25 keyword index plus score fusion with externally supplied vector hits.

    The class owns a ``rank_bm25.BM25Okapi`` index over whitespace-tokenised,
    lower-cased documents. Vector search itself is NOT performed here; callers
    pass its results into :meth:`hybrid_search`.

    Attributes:
        documents: list of the original document texts (position == doc index).
        tokenized_docs: per-document token lists fed to BM25.
        bm25: the ``BM25Okapi`` index, or ``None`` when there are no documents.
    """

    def __init__(self, documents):
        """Build the BM25 index.

        Args:
            documents: iterable of document texts (strings).
        """
        # NOTE(review): the leading glyphs in the print strings of this class
        # look like a mis-decoded emoji; left untouched because they are
        # runtime output.
        print("πŸ“š Building BM25 index...")

        # Materialise once so one-shot iterables work and positions stay
        # aligned with the BM25 index even if the caller mutates its list.
        self.documents = list(documents)

        # Tokenize documents (lower-case, whitespace split).
        self.tokenized_docs = [doc.lower().split() for doc in self.documents]

        # An index over zero documents is meaningless (BM25 needs an average
        # document length), so skip construction and let searches return [].
        self.bm25 = BM25Okapi(self.tokenized_docs) if self.tokenized_docs else None
        print(f"βœ… BM25 index created for {len(self.documents)} documents\n")

    def bm25_search(self, query, top_k=5):
        """Search using BM25 (keyword matching).

        Args:
            query: query text; tokenised the same way as the documents.
            top_k: maximum number of hits; values <= 0 yield an empty list.

        Returns:
            List of dicts with keys ``'index'`` (int position in
            ``self.documents``), ``'score'`` (float BM25 score) and
            ``'content'`` (document text), best score first. Documents with a
            score of 0 are still returned if they fall within ``top_k``.
        """
        if self.bm25 is None or top_k <= 0:
            return []

        tokenized_query = query.lower().split()
        scores = self.bm25.get_scores(tokenized_query)

        # Indices of the top_k highest scores, best first.
        top_indices = np.argsort(scores)[::-1][:top_k]

        # Convert NumPy scalars to built-ins so results are JSON-serialisable.
        return [
            {
                'index': int(idx),
                'score': float(scores[idx]),
                'content': self.documents[idx],
            }
            for idx in top_indices
        ]

    def hybrid_search(self, query, vector_results, top_k=5,
                      vector_weight=0.6, bm25_weight=0.4, bm25_scale=100):
        """Fuse vector-search hits with BM25 hits via a weighted score sum.

        The combined score of a document is::

            vector_weight * vector_score + bm25_weight * (bm25_score / bm25_scale)

        This is a plain weighted sum, not Reciprocal Rank Fusion. The BM25
        score is only divided by a constant; it is not min-max normalised, so
        choose ``bm25_scale`` to match the score range of your corpus.

        Args:
            query: query text used for the BM25 side.
            vector_results: iterable of dicts with ``'score'`` and
                ``'content'`` and, ideally, ``'index'`` (the document's
                position in ``self.documents``). When ``'index'`` is missing
                the position is looked up by exact content match; if that
                fails too, the hit is kept as its own entry with
                ``index=None`` instead of being merged into document 0.
            top_k: number of BM25 candidates to fetch and maximum number of
                fused results; values <= 0 yield an empty list.
            vector_weight: weight of the vector score (default 0.6).
            bm25_weight: weight of the scaled BM25 score (default 0.4).
            bm25_scale: positive divisor applied to raw BM25 scores
                (default 100).

        Returns:
            List of dicts with keys ``'index'``, ``'content'``,
            ``'vector_score'``, ``'bm25_score'`` (raw, unscaled) and
            ``'combined_score'``, best combined score first.

        Raises:
            ValueError: if ``bm25_scale`` is not positive.
        """
        if bm25_scale <= 0:
            raise ValueError("bm25_scale must be positive")

        print(f"πŸ”€ Performing hybrid search for: '{query}'\n")

        # Get BM25 results
        bm25_results = self.bm25_search(query, top_k)

        combined_scores = {}
        content_to_index = None  # built lazily, only if a hit lacks 'index'

        # Add vector scores
        for position, vec_result in enumerate(vector_results):
            doc_id = vec_result.get('index')
            if doc_id is None:
                if content_to_index is None:
                    content_to_index = {}
                    for i, doc in enumerate(self.documents):
                        # First occurrence wins for duplicated texts.
                        content_to_index.setdefault(doc, i)
                try:
                    doc_id = content_to_index.get(vec_result['content'])
                except TypeError:
                    # Unhashable content cannot be matched to a document.
                    doc_id = None

            # Unresolved hits get a private key so they never collide with a
            # real document index (the old default of 0 merged them all).
            key = doc_id if doc_id is not None else ('vector', position)
            combined_scores[key] = {
                'index': doc_id,
                'vector_score': vec_result['score'],
                'bm25_score': 0,
                'content': vec_result['content'],
            }

        # Add BM25 scores
        for bm25_result in bm25_results:
            doc_id = bm25_result['index']
            entry = combined_scores.setdefault(doc_id, {
                'index': doc_id,
                'vector_score': 0,
                'bm25_score': 0,
                'content': bm25_result['content'],
            })
            entry['bm25_score'] = bm25_result['score']

        # Calculate combined score (weighted sum)
        for entry in combined_scores.values():
            scaled_bm25 = entry['bm25_score'] / bm25_scale
            entry['combined_score'] = (
                vector_weight * entry['vector_score'] +
                bm25_weight * scaled_bm25
            )

        # Sort by combined score; sorted() is stable, so ties keep insertion
        # order (vector hits first, then BM25-only hits).
        ranked = sorted(
            combined_scores.values(),
            key=lambda entry: entry['combined_score'],
            reverse=True
        )[:max(top_k, 0)]

        results = [
            {
                'index': entry['index'],
                'content': entry['content'],
                'vector_score': entry['vector_score'],
                'bm25_score': entry['bm25_score'],
                'combined_score': entry['combined_score'],
            }
            for entry in ranked
        ]

        print(f"βœ… Hybrid search returned {len(results)} results\n")
        return results