# vn6295337's picture
# Initial commit: RAG Document Assistant with Zero-Storage Privacy
# f866820
"""
Hybrid retrieval combining semantic and keyword search.
Provides better recall by leveraging both:
- Semantic search: conceptual similarity
- Keyword search: exact term matching
"""
import logging
from dataclasses import dataclass
from typing import List, Dict, Any, Optional

from src.retrieval.keyword_search import keyword_search, hybrid_score_chunks
from src.retrieval.retriever import query_pinecone
@dataclass
class HybridSearchResult:
    """Result from hybrid search.

    Attributes:
        chunks: The final list of chunk dicts selected by the search.
        semantic_count: Number of chunks the semantic search contributed
            before combination.
        keyword_count: Number of chunks the keyword search contributed
            before combination.
        strategy: Which retrieval path produced ``chunks``. ``hybrid_search``
            sets this to "hybrid", "semantic_only", "keyword_only" or "none".
    """

    chunks: List[Dict[str, Any]]
    semantic_count: int
    keyword_count: int
    strategy: str
def hybrid_search(
    query: str,
    top_k: int = 10,
    semantic_weight: float = 0.7,
    keyword_weight: float = 0.3,
    fetch_k: Optional[int] = None,
    chunks_path: str = "data/chunks.jsonl",
) -> HybridSearchResult:
    """
    Perform hybrid search combining semantic and keyword retrieval.

    Both backends are queried independently and on a best-effort basis: if
    one of them raises, the failure is logged and that backend is treated as
    having returned no results, so the other backend's results can still be
    used.

    Args:
        query: Search query
        top_k: Final number of results to return. Values below 0 are
            treated as 0.
        semantic_weight: Weight for semantic search results (0-1)
        keyword_weight: Weight for keyword search results (0-1)
        fetch_k: Number to fetch from each source (default: 2x top_k)
        chunks_path: Path to chunks file for BM25

    Returns:
        HybridSearchResult with combined chunks and metadata. ``strategy`` is
        "hybrid" when both backends returned results, "semantic_only" or
        "keyword_only" when just one did, and "none" when neither did.
    """
    logger = logging.getLogger(__name__)

    # A negative top_k would make the ``[:top_k]`` slices below drop items
    # from the end of the list instead of returning nothing.
    limit = max(top_k, 0)
    if fetch_k is None:
        fetch_k = limit * 2

    semantic_chunks: List[Dict[str, Any]] = []
    keyword_chunks: List[Dict[str, Any]] = []

    # 1. Semantic search via Pinecone
    try:
        semantic_results = query_pinecone(query, top_k=fetch_k)
        # Ensure chunks have text field from metadata if not present
        for chunk in semantic_results:
            if "text" not in chunk and "metadata" in chunk:
                # ``or {}`` keeps one hit with ``metadata: None`` from
                # raising and discarding every semantic result.
                metadata = chunk["metadata"] or {}
                chunk["text"] = metadata.get("text", "")
        semantic_chunks = semantic_results
    except Exception:
        # Best-effort: fall back to keyword results, but leave a trace.
        logger.warning(
            "Semantic search failed; continuing without semantic results",
            exc_info=True,
        )
        semantic_chunks = []

    # 2. Keyword search via BM25
    try:
        keyword_result = keyword_search(
            query, top_k=fetch_k, chunks_path=chunks_path
        )
        keyword_chunks = keyword_result.chunks
    except Exception:
        # Best-effort: fall back to semantic results, but leave a trace.
        logger.warning(
            "Keyword search failed; continuing without keyword results",
            exc_info=True,
        )
        keyword_chunks = []

    # 3. Determine strategy based on what succeeded
    if semantic_chunks and keyword_chunks:
        strategy = "hybrid"
        combined = hybrid_score_chunks(
            semantic_chunks=semantic_chunks,
            keyword_chunks=keyword_chunks,
            semantic_weight=semantic_weight,
            keyword_weight=keyword_weight,
            top_k=limit,
        )
    elif semantic_chunks:
        strategy = "semantic_only"
        combined = semantic_chunks[:limit]
    elif keyword_chunks:
        strategy = "keyword_only"
        combined = keyword_chunks[:limit]
    else:
        strategy = "none"
        combined = []

    return HybridSearchResult(
        chunks=combined,
        semantic_count=len(semantic_chunks),
        keyword_count=len(keyword_chunks),
        strategy=strategy,
    )