cortexa-ai / rag /retriever.py
VedNav1's picture
Initial commit for AI~
4724d74
"""
Document retrieval component
"""
from typing import Dict, List, Optional

from vectordb.mongodb_store import get_mongodb_store
from config import TOP_K, SIMILARITY_THRESHOLD
class DocumentRetriever:
    """Query the vector store and turn the hits into prompt-ready context."""

    def __init__(self):
        # Store handle comes from the project's MongoDB store factory.
        self.vector_store = get_mongodb_store()

    def retrieve(
        self,
        query: str,
        top_k: int = TOP_K,
        filter_metadata: Optional[Dict] = None,
        min_similarity: float = SIMILARITY_THRESHOLD
    ) -> List[Dict]:
        """
        Retrieve relevant documents for a query.

        Args:
            query: Query forwarded to the vector store search.
            top_k: Number of hits requested from the store.
            filter_metadata: Optional filter forwarded unchanged to the store.
            min_similarity: Hits scoring below this similarity are dropped.

        Returns:
            A list of dicts with the keys 'text', 'metadata', 'similarity',
            'source' and 'chunk_index', sorted by descending similarity.
            The list is empty when no hit reaches ``min_similarity``.
        """
        # Search vector store
        documents, metadatas, distances = self.vector_store.search(
            query=query,
            top_k=top_k,
            filter_metadata=filter_metadata
        )

        # Format results
        results = []
        for doc, metadata, distance in zip(documents, metadatas, distances):
            # NOTE(review): assumes the store reports a distance for which
            # 1 - distance is a meaningful similarity -- confirm against
            # the store implementation.
            similarity = 1 - distance
            if similarity < min_similarity:
                continue

            # A hit without metadata would otherwise crash on `.get` below.
            if metadata is None:
                metadata = {}

            results.append({
                'text': doc,
                'metadata': metadata,
                'similarity': similarity,
                'source': metadata.get('source', 'Unknown'),
                'chunk_index': metadata.get('chunk_index', 0)
            })

        # Do not rely on the store's ordering: best match first.
        results.sort(key=lambda x: x['similarity'], reverse=True)
        return results

    def format_context(self, retrieved_docs: List[Dict]) -> str:
        """
        Format retrieved documents into context string.

        Args:
            retrieved_docs: Result dicts shaped like the output of
                ``retrieve`` ('text', 'similarity' and 'metadata' are read).

        Returns:
            One header line plus the text per document, with the sections
            joined by newlines, or a fixed fallback sentence when
            ``retrieved_docs`` is empty.
        """
        if not retrieved_docs:
            return "No relevant information found."

        context_parts = []
        for i, doc in enumerate(retrieved_docs, 1):
            # Tolerate entries whose metadata is missing or None.
            metadata = doc.get('metadata') or {}
            source = metadata.get('source', 'Unknown')
            chunk_idx = metadata.get('chunk_index', 0)
            similarity = doc['similarity']
            context_parts.append(
                f"[Source {i}: {source}, Chunk {chunk_idx}, Relevance: {similarity:.2f}]\n"
                f"{doc['text']}\n"
            )
        return "\n".join(context_parts)
# Module-level cache for the shared retriever; filled on first request.
_retriever = None


def get_retriever() -> DocumentRetriever:
    """Return the shared DocumentRetriever, constructing it on first call."""
    global _retriever
    if _retriever is not None:
        return _retriever
    _retriever = DocumentRetriever()
    return _retriever