# Custom-LLM-Chat / rag/retriever.py
# Author: Bhaskar Ram
# fix: sentence-aware chunking, score threshold, DOCX tables, streaming error handling, LLM_MODEL env var
# commit 2623b17
"""
retriever.py
Performs cosine-similarity search against the FAISS index.
"""
from __future__ import annotations
import numpy as np
import faiss
from rag.embedder import VectorIndex
DEFAULT_TOP_K = 5
# Chunks with a cosine similarity below this threshold are considered
# too dissimilar to the query and are dropped before reaching the LLM.
# This prevents low-quality context from polluting the answer.
# Range: 0.0 (no filtering) → 1.0 (exact match only). 0.30 is a safe default.
MIN_SCORE = 0.30
def retrieve(query: str, vector_index: VectorIndex, top_k: int = DEFAULT_TOP_K) -> list[dict]:
"""
Embed the query and return top_k most similar chunks above MIN_SCORE.
Each result: {"source": str, "text": str, "score": float}
Scores are cosine similarities (higher = more relevant).
"""
if vector_index is None or vector_index.index is None:
return []
query_embedding = vector_index.embedder.encode([query], show_progress_bar=False)
query_embedding = np.array(query_embedding, dtype="float32")
faiss.normalize_L2(query_embedding) # Must match IndexFlatIP cosine index
n_results = min(top_k, vector_index.index.ntotal)
scores, indices = vector_index.index.search(query_embedding, n_results)
results = []
for score, idx in zip(scores[0], indices[0]):
if idx == -1:
continue
if float(score) < MIN_SCORE:
continue # Drop chunks below relevance threshold
chunk = vector_index.chunks[idx]
results.append({
"source": chunk["source"],
"text": chunk["text"],
"score": float(score), # cosine similarity (0–1 range)
})
return results