Spaces:
Sleeping
Sleeping
| """ | |
| Hybrid Search: Combines Vector Search + BM25 Sparse Retrieval | |
| """ | |
| from rank_bm25 import BM25Okapi | |
| import numpy as np | |
| class HybridSearch: | |
| def __init__(self, documents): | |
| """ | |
| Initialize BM25 index | |
| documents: list of document texts | |
| """ | |
| print("π Building BM25 index...") | |
| # Tokenize documents | |
| self.tokenized_docs = [doc.lower().split() for doc in documents] | |
| self.documents = documents | |
| # Create BM25 index | |
| self.bm25 = BM25Okapi(self.tokenized_docs) | |
| print(f"β BM25 index created for {len(documents)} documents\n") | |
| def bm25_search(self, query, top_k=5): | |
| """Search using BM25 (keyword matching)""" | |
| tokenized_query = query.lower().split() | |
| scores = self.bm25.get_scores(tokenized_query) | |
| # Get top-k indices | |
| top_indices = np.argsort(scores)[::-1][:top_k] | |
| results = [] | |
| for idx in top_indices: | |
| results.append({ | |
| 'index': idx, | |
| 'score': scores[idx], | |
| 'content': self.documents[idx] | |
| }) | |
| return results | |
| def hybrid_search(self, query, vector_results, top_k=5): | |
| """ | |
| Combine vector search + BM25 results | |
| Uses Reciprocal Rank Fusion (RRF) | |
| """ | |
| print(f"π Performing hybrid search for: '{query}'\n") | |
| # Get BM25 results | |
| bm25_results = self.bm25_search(query, top_k) | |
| # Normalize and combine scores (simple average) | |
| combined_scores = {} | |
| # Add vector scores | |
| for vec_result in vector_results: | |
| doc_id = vec_result.get('index', 0) | |
| combined_scores[doc_id] = { | |
| 'vector_score': vec_result['score'], | |
| 'bm25_score': 0, | |
| 'content': vec_result['content'] | |
| } | |
| # Add BM25 scores | |
| for bm25_result in bm25_results: | |
| doc_id = bm25_result['index'] | |
| if doc_id not in combined_scores: | |
| combined_scores[doc_id] = { | |
| 'vector_score': 0, | |
| 'bm25_score': 0, | |
| 'content': bm25_result['content'] | |
| } | |
| combined_scores[doc_id]['bm25_score'] = bm25_result['score'] | |
| # Calculate combined score (weighted average) | |
| for doc_id in combined_scores: | |
| vector_score = combined_scores[doc_id]['vector_score'] | |
| bm25_score = combined_scores[doc_id]['bm25_score'] / 100 # Normalize | |
| # Weighted combination | |
| combined_scores[doc_id]['combined_score'] = ( | |
| 0.6 * vector_score + # 60% weight to vector | |
| 0.4 * bm25_score # 40% weight to BM25 | |
| ) | |
| # Sort by combined score | |
| sorted_results = sorted( | |
| combined_scores.items(), | |
| key=lambda x: x[1]['combined_score'], | |
| reverse=True | |
| )[:top_k] | |
| results = [] | |
| for doc_id, scores_info in sorted_results: | |
| results.append({ | |
| 'index': doc_id, | |
| 'content': scores_info['content'], | |
| 'vector_score': scores_info['vector_score'], | |
| 'bm25_score': scores_info['bm25_score'], | |
| 'combined_score': scores_info['combined_score'] | |
| }) | |
| print(f"β Hybrid search returned {len(results)} results\n") | |
| return results | |