File size: 1,193 Bytes
"""Document retriever — handles PDF, DOCX, and TXT chunks (source_type="document", non-tabular).

TEAMMATE: implement retrieve() below.

Strategy: MMR (amax_marginal_relevance_search) plus a score threshold, to avoid
returning near-identical chunks from the same PDF page.
Filter: source_type="document" AND data->>'file_type' NOT IN ('csv', 'xlsx')
"""
from src.db.postgres.vector_store import get_vector_store
from src.middlewares.logging import get_logger
from src.rag.base import BaseRetriever, RetrievalResult
# Module-level logger obtained from the project's logging middleware.
logger = get_logger("document_retriever")
# Cut-off intended for filtering retrieved chunks by cosine distance.
# NOTE(review): nothing in the visible code reads this value yet, because
# retrieve() is still a stub.
_SCORE_THRESHOLD = 0.45  # discard chunks with cosine distance above this
class DocumentRetriever(BaseRetriever):
    """Retriever for prose document chunks (placeholder implementation).

    The instance stores whatever ``get_vector_store()`` returns; the
    ``retrieve`` coroutine is not implemented yet and always produces an
    empty list.
    """

    def __init__(self):
        # Acquire the vector store handle once, at construction time.
        self.vector_store = get_vector_store()

    async def retrieve(
        self,
        query: str,
        user_id: str,
        k: int = 5,
    ) -> list[RetrievalResult]:
        """Return retrieval results for ``query`` (currently always empty).

        Args:
            query: Search text. Not used by the placeholder.
            user_id: Not used by the placeholder; the TODO below plans to
                filter on it.
            k: Not used by the placeholder; presumably the maximum number
                of results to return (confirm against BaseRetriever).

        Returns:
            An empty list. An info-level log message records that the
            unimplemented retriever was called.
        """
        # TODO (teammate): implement MMR retrieval for prose documents
        #   Filter: {"user_id": user_id, "source_type": "document"}
        #   then post-filter to exclude file_type in ("csv", "xlsx")
        logger.info("document retriever not yet implemented — returning empty")
        no_results = []
        return no_results
# Shared module-level instance. It is created when this module is imported,
# so get_vector_store() is called at import time via DocumentRetriever.__init__.
document_retriever = DocumentRetriever()
|