# Rifqi Hafizuddin
# [KM-438-439] add retriever feature
# ba550a5
# raw
# history blame
# 1.19 kB
"""Document retriever — handles PDF, DOCX, TXT chunks (source_type="document", non-tabular).
TEAMMATE: implement retrieve() below.
Strategy: MMR (amax_marginal_relevance_search) + score threshold to avoid returning
near-identical chunks from the same PDF page.
Filter: source_type="document" AND data->>'file_type' NOT IN ('csv', 'xlsx')
"""
from src.db.postgres.vector_store import get_vector_store
from src.middlewares.logging import get_logger
from src.rag.base import BaseRetriever, RetrievalResult
# Module-level logger obtained from the project's logging helper under the
# name "document_retriever".
logger = get_logger("document_retriever")
# Cut-off meant for discarding low-relevance chunks.
# NOTE(review): this constant is not referenced anywhere in the visible code;
# presumably it is reserved for the pending retrieve() implementation — confirm.
_SCORE_THRESHOLD = 0.45 # discard chunks with cosine distance above this
class DocumentRetriever(BaseRetriever):
    """Retriever for prose documents; retrieval itself is still a placeholder.

    The instance holds the vector store returned by ``get_vector_store()``.
    ``retrieve()`` currently performs no search: it logs a notice and hands
    back an empty list.
    """

    def __init__(self):
        """Fetch the vector store and keep it on the instance."""
        store = get_vector_store()
        self.vector_store = store

    async def retrieve(
        self,
        query: str,
        user_id: str,
        k: int = 5,
    ) -> list[RetrievalResult]:
        """Placeholder retrieval: log a notice and return no results.

        Args:
            query: Search text. Currently unused.
            user_id: Owner of the documents. Currently unused.
            k: Presumably the maximum number of results to return (confirm
                against ``BaseRetriever``). Currently unused.

        Returns:
            Always an empty list until the real retrieval is implemented.
        """
        # TODO (teammate): implement MMR retrieval for prose documents
        #   Filter: {"user_id": user_id, "source_type": "document"}
        #   then post-filter to exclude file_type in ("csv", "xlsx")
        nothing_found: list[RetrievalResult] = []
        logger.info("document retriever not yet implemented — returning empty")
        return nothing_found
# Module-level instance, created when this module is imported. Constructing it
# runs DocumentRetriever.__init__, which calls get_vector_store().
document_retriever = DocumentRetriever()