| """Document retriever — handles PDF, DOCX, TXT chunks (source_type="document", non-tabular). | |
| TEAMMATE: implement retrieve() below. | |
| Strategy: MMR (amax_marginal_relevance_search) + score threshold to avoid returning | |
| near-identical chunks from the same PDF page. | |
| Filter: source_type="document" AND data->>'file_type' NOT IN ('csv', 'xlsx') | |
| """ | |
| from src.db.postgres.vector_store import get_vector_store | |
| from src.middlewares.logging import get_logger | |
| from src.rag.base import BaseRetriever, RetrievalResult | |
# Module-level logger obtained from the project's logging helper.
logger = get_logger("document_retriever")

# Intended cutoff: discard chunks with cosine distance above this value.
# NOTE(review): nothing in the visible code reads this constant yet.
_SCORE_THRESHOLD = 0.45
class DocumentRetriever(BaseRetriever):
    """Retriever for prose document chunks; ``retrieve`` is currently a stub.

    The instance keeps the object returned by ``get_vector_store()`` on
    ``self.vector_store``, but the stubbed ``retrieve`` does not use it yet.
    """

    def __init__(self) -> None:
        """Fetch the vector store via ``get_vector_store()`` and keep it."""
        store = get_vector_store()
        self.vector_store = store

    async def retrieve(
        self,
        query: str,
        user_id: str,
        k: int = 5,
    ) -> list[RetrievalResult]:
        """Placeholder: log that retrieval is unimplemented, return nothing.

        Args:
            query: Search text; ignored by this stub.
            user_id: Planned filter value (see TODO); ignored by this stub.
            k: Requested number of results; ignored by this stub.

        Returns:
            Always a new empty list.
        """
        # TODO(teammate): implement MMR retrieval for prose documents.
        #   1. Query with filter {"user_id": user_id, "source_type": "document"}.
        #   2. Post-filter to exclude file_type in ("csv", "xlsx").
        logger.info("document retriever not yet implemented — returning empty")
        no_results: list[RetrievalResult] = []
        return no_results
# Shared module-level instance, created at import time; constructing it runs
# DocumentRetriever.__init__, which calls get_vector_store().
document_retriever = DocumentRetriever()