Agentic-Service-Data-Eyond-Catalog

Running

ishaq101 committed 3 days ago

Commit

61c746f

1 Parent(s): 0e9263a

[KM-582][DED][AI] Fix Retrieval in Agentic Service

ticket: https://bukittechnology.atlassian.net/browse/KM-582
fix: replace LangChain ORM retrieval with raw SQL and fix pgvector collection name
- Rewrite DocumentRetriever.retrieve() using raw SQL cosine/manhattan
queries instead of LangChain PGVector ORM, bypassing asyncpg type-mapping
issues (id String vs UUID column, jsonb_path_match binding quirks)
- Fix _COLLECTION_NAME from "document_embeddings" to "documents" to match
the collection name set by the Golang ingestion service
- Fix collection_name in vector_store.py to use the same "documents" collection
- Fix Redis chat cache to store {response, sources} dict so cached replies
also populate message_sources table
- Add cache management endpoints: DELETE /chat/cache, /chat/cache/room/{id},
/retrieval/cache/{user_id}
- Invalidate retrieval cache automatically after document processing
- Update intent_router prompt: route topical/knowledge questions to
unstructured even without explicit document mention; prefer unstructured
when ambiguous; add Indonesian few-shot examples
- Fix logging level from WARNING to INFO so structured logs are visible
- Add page_label: null to non-PDF chunk metadata for consistency
- Add diagnostic logging in retrieve() to expose collection, user_id,
and raw row count per call

Files changed (8) hide show

src/agents/chat_handler.py +1 -0
src/api/v1/chat.py +51 -10
src/api/v1/document.py +3 -0
src/config/prompts/intent_router.md +22 -6
src/db/postgres/vector_store.py +1 -1
src/knowledge/processing_service.py +1 -0
src/middlewares/logging.py +1 -1
src/retrieval/document.py +40 -101

src/agents/chat_handler.py CHANGED Viewed

@@ -170,6 +170,7 @@ class ChatHandler:
         sources = _build_sources(
             decision.source_hint, user_id, query_result, raw_chunks
         )
         yield {"event": "sources", "data": json.dumps(sources)}
         # ---- 3. Stream answer ----------------------------------------

         sources = _build_sources(
             decision.source_hint, user_id, query_result, raw_chunks
         )
+        logger.info("built sources", source_hint=decision.source_hint, sources_count=len(sources), raw_chunks_count=len(raw_chunks) if raw_chunks else 0)
         yield {"event": "sources", "data": json.dumps(sources)}
         # ---- 3. Stream answer ----------------------------------------

src/api/v1/chat.py CHANGED Viewed

@@ -42,15 +42,19 @@ class ChatRequest(BaseModel):
     message: str
-async def get_cached_response(redis, cache_key: str) -> Optional[str]:
     cached = await redis.get(cache_key)
     if cached:
-        return json.loads(cached)
     return None
-async def cache_response(redis, cache_key: str, response: str):
-    await redis.setex(cache_key, 86400, json.dumps(response))
 async def load_history(db: AsyncSession, room_id: str, limit: int = 10) -> list:
@@ -91,6 +95,34 @@ async def save_messages(
     await db.commit()
 @router.post("/chat/stream")
 @log_execution(logger)
 async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
@@ -107,13 +139,17 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
     # Redis cache hit
     cached = await get_cached_response(redis, cache_key)
     if cached:
         logger.info("Returning cached response")
         async def stream_cached():
-            yield {"event": "sources", "data": json.dumps([])}
-            for i in range(0, len(cached), 50):
-                yield {"event": "chunk", "data": cached[i:i + 50]}
             yield {"event": "done", "data": ""}
         return EventSourceResponse(stream_cached())
@@ -122,7 +158,7 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
         # Fast intent: greetings/farewells bypass LLM entirely
         direct = _fast_intent(request.message)
         if direct:
-            await cache_response(redis, cache_key, direct)
             await save_messages(db, request.room_id, request.message, direct, sources=[])
             async def stream_direct():
@@ -136,6 +172,7 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
         handler = ChatHandler()
         async def stream_response():
             full_response = ""
             sources: List[Dict[str, Any]] = []
             async for event in handler.handle(request.message, request.user_id, history):
@@ -149,8 +186,12 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
                     full_response += event["data"]
                     yield event
                 elif event["event"] == "done":
-                    await cache_response(redis, cache_key, full_response)
-                    await save_messages(db, request.room_id, request.message, full_response, sources=sources)
                     yield event
                 elif event["event"] == "error":
                     yield event

     message: str
+async def get_cached_response(redis, cache_key: str) -> Optional[dict]:
     cached = await redis.get(cache_key)
     if cached:
+        data = json.loads(cached)
+        if isinstance(data, dict) and "response" in data:
+            return data
+        # legacy: plain string cached before this change
+        return {"response": data, "sources": []}
     return None
+async def cache_response(redis, cache_key: str, response: str, sources: list):
+    await redis.setex(cache_key, 86400, json.dumps({"response": response, "sources": sources}))
 async def load_history(db: AsyncSession, room_id: str, limit: int = 10) -> list:
     await db.commit()
+@router.delete("/chat/cache")
+async def clear_chat_cache(room_id: str, message: str):
+    """Delete the Redis cache entry for a specific room + message pair."""
+    redis = await get_redis()
+    cache_key = f"{settings.redis_prefix}chat:{room_id}:{message}"
+    deleted = await redis.delete(cache_key)
+    return {"deleted": deleted > 0, "cache_key": cache_key}
+@router.delete("/chat/cache/room/{room_id}")
+async def clear_room_cache(room_id: str):
+    """Delete all Redis cache entries for a room."""
+    redis = await get_redis()
+    pattern = f"{settings.redis_prefix}chat:{room_id}:*"
+    keys = await redis.keys(pattern)
+    if keys:
+        await redis.delete(*keys)
+    return {"deleted_count": len(keys), "room_id": room_id}
+@router.delete("/retrieval/cache/{user_id}")
+async def clear_retrieval_cache(user_id: str):
+    """Delete all cached retrieval results for a user. Call this after uploading/processing new documents."""
+    from src.retrieval.router import retrieval_router
+    deleted = await retrieval_router.invalidate_cache(user_id)
+    return {"deleted_count": deleted, "user_id": user_id}
 @router.post("/chat/stream")
 @log_execution(logger)
 async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
     # Redis cache hit
     cached = await get_cached_response(redis, cache_key)
+    logger.info("cache check", cache_key=cache_key, cache_hit=cached is not None)
     if cached:
         logger.info("Returning cached response")
+        cached_text = cached["response"]
+        cached_sources = cached["sources"]
+        await save_messages(db, request.room_id, request.message, cached_text, sources=cached_sources)
         async def stream_cached():
+            yield {"event": "sources", "data": json.dumps(cached_sources)}
+            for i in range(0, len(cached_text), 50):
+                yield {"event": "chunk", "data": cached_text[i:i + 50]}
             yield {"event": "done", "data": ""}
         return EventSourceResponse(stream_cached())
         # Fast intent: greetings/farewells bypass LLM entirely
         direct = _fast_intent(request.message)
         if direct:
+            await cache_response(redis, cache_key, direct, sources=[])
             await save_messages(db, request.room_id, request.message, direct, sources=[])
             async def stream_direct():
         handler = ChatHandler()
         async def stream_response():
+            logger.info("stream_response started", room_id=request.room_id, user_id=request.user_id)
             full_response = ""
             sources: List[Dict[str, Any]] = []
             async for event in handler.handle(request.message, request.user_id, history):
                     full_response += event["data"]
                     yield event
                 elif event["event"] == "done":
+                    await cache_response(redis, cache_key, full_response, sources=sources)
+                    logger.info("saving messages", sources_count=len(sources), sources=sources)
+                    try:
+                        await save_messages(db, request.room_id, request.message, full_response, sources=sources)
+                    except Exception as e:
+                        logger.error("save_messages failed", room_id=request.room_id, error=str(e))
                     yield event
                 elif event["event"] == "error":
                     yield event

src/api/v1/document.py CHANGED Viewed

@@ -114,5 +114,8 @@ async def process_document(
         except Exception as e:
             logger.error("catalog ingestion failed after process", document_id=document_id, error=str(e))
     return {"status": "success", "message": "Document processed successfully", "data": data}

         except Exception as e:
             logger.error("catalog ingestion failed after process", document_id=document_id, error=str(e))
+    from src.retrieval.router import retrieval_router
+    await retrieval_router.invalidate_cache(user_id)
     return {"status": "success", "message": "Document processed successfully", "data": data}

src/config/prompts/intent_router.md CHANGED Viewed

@@ -7,16 +7,16 @@ Return three fields:
 - **`needs_search`** — `true` if we must look at the user's data to answer; `false` for greetings, farewells, off-topic chitchat, or meta questions about the assistant itself.
 - **`source_hint`** — one of:
   - `chat` — no data lookup needed (greetings, farewells, generic small talk).
-  - `unstructured` — the user is asking about the **content** of an uploaded document (PDF / DOCX / TXT).
   - `structured` — the user is asking a **data question** answerable from a database or a tabular file (CSV / XLSX / Parquet). This includes counts, sums, top-N, filters, comparisons, trends, joins across registered structured sources.
 - **`rewritten_query`** — a **standalone** version of the user's question that incorporates necessary context from history. If the original message is already standalone, return it unchanged. If `needs_search` is `false`, leave this empty/null.
 ## Routing rules
-1. If the message is a pure greeting / farewell / thanks / "how are you" / "what can you do" → `chat` + `needs_search=false`.
-2. If the message references content that lives in a registered DB or uploaded tabular file (sales numbers, customer counts, order trends, sheet rows, table columns) → `structured` + `needs_search=true`.
-3. If the message asks about prose content (a section of a PDF, what a memo says, a quote from a document) → `unstructured` + `needs_search=true`.
-4. If the message is ambiguous between structured and unstructured, prefer `structured` — the planner can fall back if the catalog has nothing relevant.
 5. Cross-source comparison ("compare DB sales to the customers.csv file") → `structured`. The planner sees both source types in one prompt and can correlate.
 ## Rewriting follow-ups
@@ -53,6 +53,22 @@ User: "Top 5 customers by revenue this year"
 → needs_search=true, source_hint="structured",
   rewritten_query="Top 5 customers by revenue this year"
 History: assistant: "Pro Plan Annual led at $487,200 in April."
 User: "And in March?"
 → needs_search=true, source_hint="structured",
@@ -61,6 +77,6 @@ User: "And in March?"
 ## Constraints
-- Do not invent data. If you don't know whether a topic exists in the user's data, route to `structured` and let the planner decide.
 - Do not refuse — refusal happens later in guardrails. Just classify.
 - One JSON object as output; no prose, no markdown.

 - **`needs_search`** — `true` if we must look at the user's data to answer; `false` for greetings, farewells, off-topic chitchat, or meta questions about the assistant itself.
 - **`source_hint`** — one of:
   - `chat` — no data lookup needed (greetings, farewells, generic small talk).
+  - `unstructured` — the user is asking about a topic, concept, feature, or factual knowledge that may exist in uploaded documents (PDF / DOCX / TXT). The user does not need to explicitly mention a document.
   - `structured` — the user is asking a **data question** answerable from a database or a tabular file (CSV / XLSX / Parquet). This includes counts, sums, top-N, filters, comparisons, trends, joins across registered structured sources.
 - **`rewritten_query`** — a **standalone** version of the user's question that incorporates necessary context from history. If the original message is already standalone, return it unchanged. If `needs_search` is `false`, leave this empty/null.
 ## Routing rules
+1. If the message is ONLY a pure greeting / farewell / thanks / "how are you" / "what can you do" / compliment with no factual question → `chat` + `needs_search=false`.
+2. If the message asks a data question answerable from a database or tabular file (counts, sums, top-N, filters, comparisons, trends, sheet rows, table columns) → `structured` + `needs_search=true`.
+3. If the message asks about a topic, concept, feature, explanation, summary, or factual knowledge — even without explicitly mentioning a document — route to `unstructured` + `needs_search=true`. The user may have uploaded relevant documents covering that topic.
+4. If ambiguous between structured and unstructured → prefer `unstructured`. Only prefer `structured` if there are clear signals of tabular/numeric data questions.
 5. Cross-source comparison ("compare DB sales to the customers.csv file") → `structured`. The planner sees both source types in one prompt and can correlate.
 ## Rewriting follow-ups
 → needs_search=true, source_hint="structured",
   rewritten_query="Top 5 customers by revenue this year"
+User: "apa key feature dari iot connectivity?"
+→ needs_search=true, source_hint="unstructured",
+  rewritten_query="What are the key features of IoT connectivity?"
+User: "jelaskan tentang machine learning"
+→ needs_search=true, source_hint="unstructured",
+  rewritten_query="Explain machine learning"
+User: "bagaimana cara kerja neural network?"
+→ needs_search=true, source_hint="unstructured",
+  rewritten_query="How does a neural network work?"
+User: "what is the main purpose of this system?"
+→ needs_search=true, source_hint="unstructured",
+  rewritten_query="What is the main purpose of this system?"
 History: assistant: "Pro Plan Annual led at $487,200 in April."
 User: "And in March?"
 → needs_search=true, source_hint="structured",
 ## Constraints
+- Do not invent data. If the question is factual or knowledge-based (not clearly tabular), route to `unstructured` and let the retriever decide. Only route to `structured` if the question clearly involves counts, sums, filters, or trends from tabular sources.
 - Do not refuse — refusal happens later in guardrails. Just classify.
 - One JSON object as output; no prose, no markdown.

src/db/postgres/vector_store.py CHANGED Viewed

@@ -19,7 +19,7 @@ embeddings = AzureOpenAIEmbeddings(
 vector_store = PGVector(
     embeddings=embeddings,
     connection=_pgvector_engine,
-    collection_name="document_embeddings",
     use_jsonb=True,
     async_mode=True,
     create_extension=False,  # Extension pre-created in init_db.py (avoids multi-statement asyncpg bug)

 vector_store = PGVector(
     embeddings=embeddings,
     connection=_pgvector_engine,
+    collection_name="documents",
     use_jsonb=True,
     async_mode=True,
     create_extension=False,  # Extension pre-created in init_db.py (avoids multi-statement asyncpg bug)

src/knowledge/processing_service.py CHANGED Viewed

@@ -59,6 +59,7 @@ class KnowledgeProcessingService:
                                 "filename": db_doc.filename,
                                 "file_type": db_doc.file_type,
                                 "chunk_index": i,
                             },
                         }
                     )

                                 "filename": db_doc.filename,
                                 "file_type": db_doc.file_type,
                                 "chunk_index": i,
+                                "page_label": None,
                             },
                         }
                     )

src/middlewares/logging.py CHANGED Viewed

@@ -9,7 +9,7 @@ import time
 def configure_logging():
     """Configure structured logging."""
-    logging.basicConfig(level=logging.WARNING)
     logging.getLogger("tabular_executor").setLevel(logging.INFO)
     structlog.configure(
         processors=[

 def configure_logging():
     """Configure structured logging."""
+    logging.basicConfig(level=logging.INFO)
     logging.getLogger("tabular_executor").setLevel(logging.INFO)
     structlog.configure(
         processors=[

src/retrieval/document.py CHANGED Viewed

@@ -1,68 +1,44 @@
-"""DocumentRetriever — dense similarity over prose chunks (Cu).
-For unstructured sources only (PDF / DOCX / TXT). Backed by PGVector with
-collection `document_embeddings`. Methods: MMR, cosine, euclidean, etc.
 """
 import functools
 import math
-from langchain_postgres import PGVector
-from langchain_postgres.vectorstores import DistanceStrategy
 from langchain_openai import AzureOpenAIEmbeddings
 from sqlalchemy import text
 from src.config.settings import settings
 from src.db.postgres.connection import _pgvector_engine
-from src.db.postgres.vector_store import get_vector_store
 from src.middlewares.logging import get_logger
 from src.retrieval.base import BaseRetriever, RetrievalResult
 logger = get_logger("document_retriever")
 # Change this one line to switch retrieval method
-# Options: "mmr" | "cosine" | "euclidean" | "inner_product" | "manhattan"
-_RETRIEVAL_METHOD = "mmr"
 _TABULAR_TYPES = {"csv", "xlsx"}
-_FETCH_K = 20
-_LAMBDA_MULT = 0.5
-_COLLECTION_NAME = "document_embeddings"
-@functools.cache
-def _get_embeddings() -> AzureOpenAIEmbeddings:
-    return AzureOpenAIEmbeddings(
-        azure_deployment=settings.azureai_deployment_name_embedding,
-        openai_api_version=settings.azureai_api_version_embedding,
-        azure_endpoint=settings.azureai_endpoint_url_embedding,
-        api_key=settings.azureai_api_key_embedding,
-    )
-@functools.cache
-def _get_euclidean_store() -> PGVector:
-    return PGVector(
-        embeddings=_get_embeddings(),
-        connection=_pgvector_engine,
-        collection_name=_COLLECTION_NAME,
-        distance_strategy=DistanceStrategy.EUCLIDEAN,
-        use_jsonb=True,
-        async_mode=True,
-        create_extension=False,
-    )
-@functools.cache
-def _get_ip_store() -> PGVector:
-    return PGVector(
-        embeddings=_get_embeddings(),
-        connection=_pgvector_engine,
-        collection_name=_COLLECTION_NAME,
-        distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
-        use_jsonb=True,
-        async_mode=True,
-        create_extension=False,
-    )
 _MANHATTAN_SQL = text("""
     SELECT
@@ -79,71 +55,32 @@ _MANHATTAN_SQL = text("""
 """)
-class DocumentRetriever(BaseRetriever):
-    def __init__(self) -> None:
-        self.vector_store = get_vector_store()
     async def retrieve(
         self, query: str, user_id: str, k: int = 5
-    ) -> list[RetrievalResult]:
-        filter_ = {"user_id": user_id, "source_type": "document"}
-        fetch_k = k + len(_TABULAR_TYPES)
-        if _RETRIEVAL_METHOD == "manhattan":
-            return await self._retrieve_manhattan(query, user_id, k, fetch_k)
-        if _RETRIEVAL_METHOD == "mmr":
-            docs = await self.vector_store.amax_marginal_relevance_search(
-                query=query,
-                k=fetch_k,
-                fetch_k=_FETCH_K,
-                lambda_mult=_LAMBDA_MULT,
-                filter=filter_,
-            )
-            cosine = await self.vector_store.asimilarity_search_with_score(
-                query=query, k=fetch_k, filter=filter_,
-            )
-            score_map = {doc.page_content: score for doc, score in cosine}
-            docs_with_scores = [(doc, score_map.get(doc.page_content, 0.0)) for doc in docs]
-        elif _RETRIEVAL_METHOD == "euclidean":
-            docs_with_scores = await _get_euclidean_store().asimilarity_search_with_score(
-                query=query, k=fetch_k, filter=filter_,
-            )
-        elif _RETRIEVAL_METHOD == "inner_product":
-            docs_with_scores = await _get_ip_store().asimilarity_search_with_score(
-                query=query, k=fetch_k, filter=filter_,
-            )
-        else:  # cosine
-            docs_with_scores = await self.vector_store.asimilarity_search_with_score(
-                query=query, k=fetch_k, filter=filter_,
-            )
-        results = []
-        for doc, score in docs_with_scores:
-            file_type = doc.metadata.get("data", {}).get("file_type", "")
-            if file_type not in _TABULAR_TYPES:
-                results.append(RetrievalResult(
-                    content=doc.page_content,
-                    metadata=doc.metadata,
-                    score=score,
-                    source_type="document",
-                ))
-            if len(results) == k:
-                break
-        logger.info("retrieved chunks", method=_RETRIEVAL_METHOD, count=len(results))
-        return results
-    async def _retrieve_manhattan(
-        self, query: str, user_id: str, k: int, fetch_k: int
     ) -> list[RetrievalResult]:
         query_vector = await _get_embeddings().aembed_query(query)
         if not all(math.isfinite(v) for v in query_vector):
             raise ValueError("Embedding vector contains NaN or Infinity values.")
         vector_str = "[" + ",".join(str(v) for v in query_vector) + "]"
         async with _pgvector_engine.connect() as conn:
-            result = await conn.execute(_MANHATTAN_SQL, {
                 "embedding": vector_str,
                 "collection": _COLLECTION_NAME,
                 "user_id": user_id,
@@ -151,6 +88,8 @@ class DocumentRetriever(BaseRetriever):
             })
             rows = result.fetchall()
         results = []
         for row in rows:
             file_type = row.cmetadata.get("data", {}).get("file_type", "")
@@ -164,7 +103,7 @@ class DocumentRetriever(BaseRetriever):
             if len(results) == k:
                 break
-        logger.info("retrieved chunks", method="manhattan", count=len(results))
         return results

+"""DocumentRetriever — dense similarity over prose chunks.
+For unstructured sources only (PDF / DOCX / TXT). Backed by PGVector via
+raw SQL to avoid LangChain ORM / asyncpg type-mapping issues (id UUID vs
+String mismatch, jsonb_path_match asyncpg binding quirks).
+Collection `documents`. Methods: cosine | manhattan.
 """
 import functools
 import math
 from langchain_openai import AzureOpenAIEmbeddings
 from sqlalchemy import text
 from src.config.settings import settings
 from src.db.postgres.connection import _pgvector_engine
 from src.middlewares.logging import get_logger
 from src.retrieval.base import BaseRetriever, RetrievalResult
 logger = get_logger("document_retriever")
 # Change this one line to switch retrieval method
+# Options: "cosine" | "manhattan"
+_RETRIEVAL_METHOD = "cosine"
 _TABULAR_TYPES = {"csv", "xlsx"}
+_COLLECTION_NAME = "documents"
+_COSINE_SQL = text("""
+    SELECT
+        lpe.document,
+        lpe.cmetadata,
+        lpe.embedding <=> CAST(:embedding AS vector) AS distance
+    FROM langchain_pg_embedding lpe
+    JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+    WHERE lpc.name = :collection
+      AND lpe.cmetadata->>'user_id' = :user_id
+      AND lpe.cmetadata->>'source_type' = 'document'
+    ORDER BY distance ASC
+    LIMIT :k
+""")
 _MANHATTAN_SQL = text("""
     SELECT
 """)
+@functools.cache
+def _get_embeddings() -> AzureOpenAIEmbeddings:
+    return AzureOpenAIEmbeddings(
+        azure_deployment=settings.azureai_deployment_name_embedding,
+        openai_api_version=settings.azureai_api_version_embedding,
+        azure_endpoint=settings.azureai_endpoint_url_embedding,
+        api_key=settings.azureai_api_key_embedding,
+    )
+class DocumentRetriever(BaseRetriever):
     async def retrieve(
         self, query: str, user_id: str, k: int = 5
     ) -> list[RetrievalResult]:
         query_vector = await _get_embeddings().aembed_query(query)
         if not all(math.isfinite(v) for v in query_vector):
             raise ValueError("Embedding vector contains NaN or Infinity values.")
         vector_str = "[" + ",".join(str(v) for v in query_vector) + "]"
+        fetch_k = k + len(_TABULAR_TYPES)
+        sql = _COSINE_SQL if _RETRIEVAL_METHOD == "cosine" else _MANHATTAN_SQL
+        logger.info("retrieve called", user_id=user_id, collection=_COLLECTION_NAME, fetch_k=fetch_k)
         async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(sql, {
                 "embedding": vector_str,
                 "collection": _COLLECTION_NAME,
                 "user_id": user_id,
             })
             rows = result.fetchall()
+        logger.info("raw rows from db", row_count=len(rows))
         results = []
         for row in rows:
             file_type = row.cmetadata.get("data", {}).get("file_type", "")
             if len(results) == k:
                 break
+        logger.info("retrieved chunks", method=_RETRIEVAL_METHOD, count=len(results))
         return results