"""Document retriever — handles PDF, DOCX, TXT chunks (source_type="document", non-tabular). TEAMMATE: implement retrieve() below. Strategy: MMR (amax_marginal_relevance_search) + score threshold to avoid returning near-identical chunks from the same PDF page. Filter: source_type="document" AND data->>'file_type' NOT IN ('csv', 'xlsx') """ from src.db.postgres.vector_store import get_vector_store from src.middlewares.logging import get_logger from src.rag.base import BaseRetriever, RetrievalResult logger = get_logger("document_retriever") _SCORE_THRESHOLD = 0.45 # discard chunks with cosine distance above this class DocumentRetriever(BaseRetriever): def __init__(self): self.vector_store = get_vector_store() async def retrieve( self, query: str, user_id: str, k: int = 5 ) -> list[RetrievalResult]: # TODO (teammate): implement MMR retrieval for prose documents # Filter: {"user_id": user_id, "source_type": "document"} # then post-filter to exclude file_type in ("csv", "xlsx") logger.info("document retriever not yet implemented — returning empty") return [] document_retriever = DocumentRetriever()