Agentic-Service-Data-Eyond-Catalog

Running

App Files Community

feat/Planner Agent

by rhbt6767 - opened 5 days ago

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+177

-135

This PR is in draft mode

Files changed (14) hide show

.gitignore +0 -3
.vscode/launch.json +25 -0
main.py +2 -6
pyproject.toml +3 -1
src/agents/chat_handler.py +0 -1
src/api/v1/chat.py +10 -51
src/api/v1/document.py +0 -3
src/config/prompts/intent_router.md +6 -22
src/config/settings.py +4 -4
src/db/postgres/vector_store.py +1 -1
src/knowledge/processing_service.py +0 -1
src/middlewares/logging.py +1 -1
src/retrieval/document.py +101 -40
uv.lock +24 -1

.gitignore CHANGED Viewed

@@ -37,8 +37,6 @@ playground_chat.py
 playground_flush_cache.py
 playground_create_user.py
 API_CONTRACT.md
-API_CONTRACT_AGENT.md
-API_CONTRACT_AGENT_ACTIVE.md
 context_engineering/
 sample_file/
 test_tesseract.py
@@ -48,4 +46,3 @@ software/
 tests/
 .claude/
-migratego/

 playground_flush_cache.py
 playground_create_user.py
 API_CONTRACT.md
 context_engineering/
 sample_file/
 test_tesseract.py
 tests/
 .claude/

.vscode/launch.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "DataEyond: FastAPI (debug)",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "uvicorn",
+            "args": [
+                "main:app",
+                "--host", "0.0.0.0",
+                "--port", "7860",
+                "--reload"
+            ],
+            "jinja": true,
+            "justMyCode": true,
+            "envFile": "${workspaceFolder}/.env",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}"
+        }
+    ]
+}

main.py CHANGED Viewed

@@ -14,7 +14,6 @@ from src.api.v1.users import router as users_router
 from src.api.v1.db_client import router as db_client_router
 from src.api.v1.data_catalog import router as data_catalog_router
 from src.db.postgres.init_db import init_db
-import os
 import uvicorn
 # Configure logging
@@ -25,11 +24,8 @@ logger = get_logger("main")
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     logger.info("Starting application...")
-    if os.getenv("SKIP_INIT_DB", "false").lower() != "true":
-        await init_db()
-        logger.info("Database initialized")
-    else:
-        logger.info("Skipping database initialization (SKIP_INIT_DB=true)")
     yield

 from src.api.v1.db_client import router as db_client_router
 from src.api.v1.data_catalog import router as data_catalog_router
 from src.db.postgres.init_db import init_db
 import uvicorn
 # Configure logging
 @asynccontextmanager
 async def lifespan(app: FastAPI):
     logger.info("Starting application...")
+    await init_db()
+    logger.info("Database initialized")
     yield

pyproject.toml CHANGED Viewed

@@ -77,6 +77,7 @@ dependencies = [
     "cachetools==5.5.0",
     "apscheduler==3.10.4",
     "jsonpatch>=1.33",
     "psycopg2>=2.9.11",
     # --- SQL parsing / guardrails ---
     "sqlglot>=25.0.0",
@@ -120,7 +121,8 @@ ignore = [
 ]
 [tool.ruff.lint.per-file-ignores]
-"tests/**" = ["S101", "S105", "S106"]
 [tool.mypy]
 python_version = "3.12"

     "cachetools==5.5.0",
     "apscheduler==3.10.4",
     "jsonpatch>=1.33",
+    "pymongo>=4.14.0",
     "psycopg2>=2.9.11",
     # --- SQL parsing / guardrails ---
     "sqlglot>=25.0.0",
 ]
 [tool.ruff.lint.per-file-ignores]
+# S608 in tests is a false positive — tests assert literal SQL strings as fixtures.
+"tests/**" = ["S101", "S105", "S106", "S608"]
 [tool.mypy]
 python_version = "3.12"

src/agents/chat_handler.py CHANGED Viewed

@@ -170,7 +170,6 @@ class ChatHandler:
         sources = _build_sources(
             decision.source_hint, user_id, query_result, raw_chunks
         )
-        logger.info("built sources", source_hint=decision.source_hint, sources_count=len(sources), raw_chunks_count=len(raw_chunks) if raw_chunks else 0)
         yield {"event": "sources", "data": json.dumps(sources)}
         # ---- 3. Stream answer ----------------------------------------

         sources = _build_sources(
             decision.source_hint, user_id, query_result, raw_chunks
         )
         yield {"event": "sources", "data": json.dumps(sources)}
         # ---- 3. Stream answer ----------------------------------------

src/api/v1/chat.py CHANGED Viewed

@@ -42,19 +42,15 @@ class ChatRequest(BaseModel):
     message: str
-async def get_cached_response(redis, cache_key: str) -> Optional[dict]:
     cached = await redis.get(cache_key)
     if cached:
-        data = json.loads(cached)
-        if isinstance(data, dict) and "response" in data:
-            return data
-        # legacy: plain string cached before this change
-        return {"response": data, "sources": []}
     return None
-async def cache_response(redis, cache_key: str, response: str, sources: list):
-    await redis.setex(cache_key, 86400, json.dumps({"response": response, "sources": sources}))
 async def load_history(db: AsyncSession, room_id: str, limit: int = 10) -> list:
@@ -95,34 +91,6 @@ async def save_messages(
     await db.commit()
-@router.delete("/chat/cache")
-async def clear_chat_cache(room_id: str, message: str):
-    """Delete the Redis cache entry for a specific room + message pair."""
-    redis = await get_redis()
-    cache_key = f"{settings.redis_prefix}chat:{room_id}:{message}"
-    deleted = await redis.delete(cache_key)
-    return {"deleted": deleted > 0, "cache_key": cache_key}
-@router.delete("/chat/cache/room/{room_id}")
-async def clear_room_cache(room_id: str):
-    """Delete all Redis cache entries for a room."""
-    redis = await get_redis()
-    pattern = f"{settings.redis_prefix}chat:{room_id}:*"
-    keys = await redis.keys(pattern)
-    if keys:
-        await redis.delete(*keys)
-    return {"deleted_count": len(keys), "room_id": room_id}
-@router.delete("/retrieval/cache/{user_id}")
-async def clear_retrieval_cache(user_id: str):
-    """Delete all cached retrieval results for a user. Call this after uploading/processing new documents."""
-    from src.retrieval.router import retrieval_router
-    deleted = await retrieval_router.invalidate_cache(user_id)
-    return {"deleted_count": deleted, "user_id": user_id}
 @router.post("/chat/stream")
 @log_execution(logger)
 async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
@@ -139,17 +107,13 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
     # Redis cache hit
     cached = await get_cached_response(redis, cache_key)
-    logger.info("cache check", cache_key=cache_key, cache_hit=cached is not None)
     if cached:
         logger.info("Returning cached response")
-        cached_text = cached["response"]
-        cached_sources = cached["sources"]
-        await save_messages(db, request.room_id, request.message, cached_text, sources=cached_sources)
         async def stream_cached():
-            yield {"event": "sources", "data": json.dumps(cached_sources)}
-            for i in range(0, len(cached_text), 50):
-                yield {"event": "chunk", "data": cached_text[i:i + 50]}
             yield {"event": "done", "data": ""}
         return EventSourceResponse(stream_cached())
@@ -158,7 +122,7 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
         # Fast intent: greetings/farewells bypass LLM entirely
         direct = _fast_intent(request.message)
         if direct:
-            await cache_response(redis, cache_key, direct, sources=[])
             await save_messages(db, request.room_id, request.message, direct, sources=[])
             async def stream_direct():
@@ -172,7 +136,6 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
         handler = ChatHandler()
         async def stream_response():
-            logger.info("stream_response started", room_id=request.room_id, user_id=request.user_id)
             full_response = ""
             sources: List[Dict[str, Any]] = []
             async for event in handler.handle(request.message, request.user_id, history):
@@ -186,12 +149,8 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
                     full_response += event["data"]
                     yield event
                 elif event["event"] == "done":
-                    await cache_response(redis, cache_key, full_response, sources=sources)
-                    logger.info("saving messages", sources_count=len(sources), sources=sources)
-                    try:
-                        await save_messages(db, request.room_id, request.message, full_response, sources=sources)
-                    except Exception as e:
-                        logger.error("save_messages failed", room_id=request.room_id, error=str(e))
                     yield event
                 elif event["event"] == "error":
                     yield event

     message: str
+async def get_cached_response(redis, cache_key: str) -> Optional[str]:
     cached = await redis.get(cache_key)
     if cached:
+        return json.loads(cached)
     return None
+async def cache_response(redis, cache_key: str, response: str):
+    await redis.setex(cache_key, 86400, json.dumps(response))
 async def load_history(db: AsyncSession, room_id: str, limit: int = 10) -> list:
     await db.commit()
 @router.post("/chat/stream")
 @log_execution(logger)
 async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
     # Redis cache hit
     cached = await get_cached_response(redis, cache_key)
     if cached:
         logger.info("Returning cached response")
         async def stream_cached():
+            yield {"event": "sources", "data": json.dumps([])}
+            for i in range(0, len(cached), 50):
+                yield {"event": "chunk", "data": cached[i:i + 50]}
             yield {"event": "done", "data": ""}
         return EventSourceResponse(stream_cached())
         # Fast intent: greetings/farewells bypass LLM entirely
         direct = _fast_intent(request.message)
         if direct:
+            await cache_response(redis, cache_key, direct)
             await save_messages(db, request.room_id, request.message, direct, sources=[])
             async def stream_direct():
         handler = ChatHandler()
         async def stream_response():
             full_response = ""
             sources: List[Dict[str, Any]] = []
             async for event in handler.handle(request.message, request.user_id, history):
                     full_response += event["data"]
                     yield event
                 elif event["event"] == "done":
+                    await cache_response(redis, cache_key, full_response)
+                    await save_messages(db, request.room_id, request.message, full_response, sources=sources)
                     yield event
                 elif event["event"] == "error":
                     yield event

src/api/v1/document.py CHANGED Viewed

@@ -114,8 +114,5 @@ async def process_document(
         except Exception as e:
             logger.error("catalog ingestion failed after process", document_id=document_id, error=str(e))
-    from src.retrieval.router import retrieval_router
-    await retrieval_router.invalidate_cache(user_id)
     return {"status": "success", "message": "Document processed successfully", "data": data}

         except Exception as e:
             logger.error("catalog ingestion failed after process", document_id=document_id, error=str(e))
     return {"status": "success", "message": "Document processed successfully", "data": data}

src/config/prompts/intent_router.md CHANGED Viewed

@@ -7,16 +7,16 @@ Return three fields:
 - **`needs_search`** — `true` if we must look at the user's data to answer; `false` for greetings, farewells, off-topic chitchat, or meta questions about the assistant itself.
 - **`source_hint`** — one of:
   - `chat` — no data lookup needed (greetings, farewells, generic small talk).
-  - `unstructured` — the user is asking about a topic, concept, feature, or factual knowledge that may exist in uploaded documents (PDF / DOCX / TXT). The user does not need to explicitly mention a document.
   - `structured` — the user is asking a **data question** answerable from a database or a tabular file (CSV / XLSX / Parquet). This includes counts, sums, top-N, filters, comparisons, trends, joins across registered structured sources.
 - **`rewritten_query`** — a **standalone** version of the user's question that incorporates necessary context from history. If the original message is already standalone, return it unchanged. If `needs_search` is `false`, leave this empty/null.
 ## Routing rules
-1. If the message is ONLY a pure greeting / farewell / thanks / "how are you" / "what can you do" / compliment with no factual question → `chat` + `needs_search=false`.
-2. If the message asks a data question answerable from a database or tabular file (counts, sums, top-N, filters, comparisons, trends, sheet rows, table columns) → `structured` + `needs_search=true`.
-3. If the message asks about a topic, concept, feature, explanation, summary, or factual knowledge — even without explicitly mentioning a document — route to `unstructured` + `needs_search=true`. The user may have uploaded relevant documents covering that topic.
-4. If ambiguous between structured and unstructured → prefer `unstructured`. Only prefer `structured` if there are clear signals of tabular/numeric data questions.
 5. Cross-source comparison ("compare DB sales to the customers.csv file") → `structured`. The planner sees both source types in one prompt and can correlate.
 ## Rewriting follow-ups
@@ -53,22 +53,6 @@ User: "Top 5 customers by revenue this year"
 → needs_search=true, source_hint="structured",
   rewritten_query="Top 5 customers by revenue this year"
-User: "apa key feature dari iot connectivity?"
-→ needs_search=true, source_hint="unstructured",
-  rewritten_query="What are the key features of IoT connectivity?"
-User: "jelaskan tentang machine learning"
-→ needs_search=true, source_hint="unstructured",
-  rewritten_query="Explain machine learning"
-User: "bagaimana cara kerja neural network?"
-→ needs_search=true, source_hint="unstructured",
-  rewritten_query="How does a neural network work?"
-User: "what is the main purpose of this system?"
-→ needs_search=true, source_hint="unstructured",
-  rewritten_query="What is the main purpose of this system?"
 History: assistant: "Pro Plan Annual led at $487,200 in April."
 User: "And in March?"
 → needs_search=true, source_hint="structured",
@@ -77,6 +61,6 @@ User: "And in March?"
 ## Constraints
-- Do not invent data. If the question is factual or knowledge-based (not clearly tabular), route to `unstructured` and let the retriever decide. Only route to `structured` if the question clearly involves counts, sums, filters, or trends from tabular sources.
 - Do not refuse — refusal happens later in guardrails. Just classify.
 - One JSON object as output; no prose, no markdown.

 - **`needs_search`** — `true` if we must look at the user's data to answer; `false` for greetings, farewells, off-topic chitchat, or meta questions about the assistant itself.
 - **`source_hint`** — one of:
   - `chat` — no data lookup needed (greetings, farewells, generic small talk).
+  - `unstructured` — the user is asking about the **content** of an uploaded document (PDF / DOCX / TXT).
   - `structured` — the user is asking a **data question** answerable from a database or a tabular file (CSV / XLSX / Parquet). This includes counts, sums, top-N, filters, comparisons, trends, joins across registered structured sources.
 - **`rewritten_query`** — a **standalone** version of the user's question that incorporates necessary context from history. If the original message is already standalone, return it unchanged. If `needs_search` is `false`, leave this empty/null.
 ## Routing rules
+1. If the message is a pure greeting / farewell / thanks / "how are you" / "what can you do" → `chat` + `needs_search=false`.
+2. If the message references content that lives in a registered DB or uploaded tabular file (sales numbers, customer counts, order trends, sheet rows, table columns) → `structured` + `needs_search=true`.
+3. If the message asks about prose content (a section of a PDF, what a memo says, a quote from a document) → `unstructured` + `needs_search=true`.
+4. If the message is ambiguous between structured and unstructured, prefer `structured` — the planner can fall back if the catalog has nothing relevant.
 5. Cross-source comparison ("compare DB sales to the customers.csv file") → `structured`. The planner sees both source types in one prompt and can correlate.
 ## Rewriting follow-ups
 → needs_search=true, source_hint="structured",
   rewritten_query="Top 5 customers by revenue this year"
 History: assistant: "Pro Plan Annual led at $487,200 in April."
 User: "And in March?"
 → needs_search=true, source_hint="structured",
 ## Constraints
+- Do not invent data. If you don't know whether a topic exists in the user's data, route to `structured` and let the planner decide.
 - Do not refuse — refusal happens later in guardrails. Just classify.
 - One JSON object as output; no prose, no markdown.

src/config/settings.py CHANGED Viewed

@@ -1,7 +1,7 @@
 """Centralized configuration management using pydantic-settings."""
-# import os
-# from typing import Optional
 from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
@@ -51,8 +51,8 @@ class Settings(BaseSettings):
     LANGFUSE_HOST: str
     # MongoDB (for users - existing)
-    # emarcal_mongo_endpoint_url: str = Field(alias="emarcal__mongo__endpoint__url", default="")
-    # emarcal_buma_mongo_dbname: str = Field(alias="emarcal__buma__mongo__dbname", default="")
     # JWT (for users - existing)
     emarcal_jwt_secret_key: str = Field(alias="emarcal__jwt__secret_key", default="")

 """Centralized configuration management using pydantic-settings."""
+import os
+from typing import Optional
 from pydantic import Field
 from pydantic_settings import BaseSettings, SettingsConfigDict
     LANGFUSE_HOST: str
     # MongoDB (for users - existing)
+    emarcal_mongo_endpoint_url: str = Field(alias="emarcal__mongo__endpoint__url", default="")
+    emarcal_buma_mongo_dbname: str = Field(alias="emarcal__buma__mongo__dbname", default="")
     # JWT (for users - existing)
     emarcal_jwt_secret_key: str = Field(alias="emarcal__jwt__secret_key", default="")

src/db/postgres/vector_store.py CHANGED Viewed

@@ -19,7 +19,7 @@ embeddings = AzureOpenAIEmbeddings(
 vector_store = PGVector(
     embeddings=embeddings,
     connection=_pgvector_engine,
-    collection_name="documents",
     use_jsonb=True,
     async_mode=True,
     create_extension=False,  # Extension pre-created in init_db.py (avoids multi-statement asyncpg bug)

 vector_store = PGVector(
     embeddings=embeddings,
     connection=_pgvector_engine,
+    collection_name="document_embeddings",
     use_jsonb=True,
     async_mode=True,
     create_extension=False,  # Extension pre-created in init_db.py (avoids multi-statement asyncpg bug)

src/knowledge/processing_service.py CHANGED Viewed

@@ -59,7 +59,6 @@ class KnowledgeProcessingService:
                                 "filename": db_doc.filename,
                                 "file_type": db_doc.file_type,
                                 "chunk_index": i,
-                                "page_label": None,
                             },
                         }
                     )

                                 "filename": db_doc.filename,
                                 "file_type": db_doc.file_type,
                                 "chunk_index": i,
                             },
                         }
                     )

src/middlewares/logging.py CHANGED Viewed

@@ -9,7 +9,7 @@ import time
 def configure_logging():
     """Configure structured logging."""
-    logging.basicConfig(level=logging.INFO)
     logging.getLogger("tabular_executor").setLevel(logging.INFO)
     structlog.configure(
         processors=[

 def configure_logging():
     """Configure structured logging."""
+    logging.basicConfig(level=logging.WARNING)
     logging.getLogger("tabular_executor").setLevel(logging.INFO)
     structlog.configure(
         processors=[

src/retrieval/document.py CHANGED Viewed

@@ -1,44 +1,68 @@
-"""DocumentRetriever — dense similarity over prose chunks.
-For unstructured sources only (PDF / DOCX / TXT). Backed by PGVector via
-raw SQL to avoid LangChain ORM / asyncpg type-mapping issues (id UUID vs
-String mismatch, jsonb_path_match asyncpg binding quirks).
-Collection `document_embeddings`. Methods: cosine | manhattan.
 """
 import functools
 import math
 from langchain_openai import AzureOpenAIEmbeddings
 from sqlalchemy import text
 from src.config.settings import settings
 from src.db.postgres.connection import _pgvector_engine
 from src.middlewares.logging import get_logger
 from src.retrieval.base import BaseRetriever, RetrievalResult
 logger = get_logger("document_retriever")
 # Change this one line to switch retrieval method
-# Options: "cosine" | "manhattan"
-_RETRIEVAL_METHOD = "cosine"
 _TABULAR_TYPES = {"csv", "xlsx"}
-_COLLECTION_NAME = "documents"
-_COSINE_SQL = text("""
-    SELECT
-        lpe.document,
-        lpe.cmetadata,
-        lpe.embedding <=> CAST(:embedding AS vector) AS distance
-    FROM langchain_pg_embedding lpe
-    JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
-    WHERE lpc.name = :collection
-      AND lpe.cmetadata->>'user_id' = :user_id
-      AND lpe.cmetadata->>'source_type' = 'document'
-    ORDER BY distance ASC
-    LIMIT :k
-""")
 _MANHATTAN_SQL = text("""
     SELECT
@@ -55,32 +79,71 @@ _MANHATTAN_SQL = text("""
 """)
-@functools.cache
-def _get_embeddings() -> AzureOpenAIEmbeddings:
-    return AzureOpenAIEmbeddings(
-        azure_deployment=settings.azureai_deployment_name_embedding,
-        openai_api_version=settings.azureai_api_version_embedding,
-        azure_endpoint=settings.azureai_endpoint_url_embedding,
-        api_key=settings.azureai_api_key_embedding,
-    )
 class DocumentRetriever(BaseRetriever):
     async def retrieve(
         self, query: str, user_id: str, k: int = 5
     ) -> list[RetrievalResult]:
         query_vector = await _get_embeddings().aembed_query(query)
         if not all(math.isfinite(v) for v in query_vector):
             raise ValueError("Embedding vector contains NaN or Infinity values.")
         vector_str = "[" + ",".join(str(v) for v in query_vector) + "]"
-        fetch_k = k + len(_TABULAR_TYPES)
-        sql = _COSINE_SQL if _RETRIEVAL_METHOD == "cosine" else _MANHATTAN_SQL
-        logger.info("retrieve called", user_id=user_id, collection=_COLLECTION_NAME, fetch_k=fetch_k)
         async with _pgvector_engine.connect() as conn:
-            result = await conn.execute(sql, {
                 "embedding": vector_str,
                 "collection": _COLLECTION_NAME,
                 "user_id": user_id,
@@ -88,8 +151,6 @@ class DocumentRetriever(BaseRetriever):
             })
             rows = result.fetchall()
-        logger.info("raw rows from db", row_count=len(rows))
         results = []
         for row in rows:
             file_type = row.cmetadata.get("data", {}).get("file_type", "")
@@ -103,7 +164,7 @@ class DocumentRetriever(BaseRetriever):
             if len(results) == k:
                 break
-        logger.info("retrieved chunks", method=_RETRIEVAL_METHOD, count=len(results))
         return results

+"""DocumentRetriever — dense similarity over prose chunks (Cu).
+For unstructured sources only (PDF / DOCX / TXT). Backed by PGVector with
+collection `document_embeddings`. Methods: mmr | cosine | euclidean | inner_product | manhattan.
 """
 import functools
 import math
+from langchain_postgres import PGVector
+from langchain_postgres.vectorstores import DistanceStrategy
 from langchain_openai import AzureOpenAIEmbeddings
 from sqlalchemy import text
 from src.config.settings import settings
 from src.db.postgres.connection import _pgvector_engine
+from src.db.postgres.vector_store import get_vector_store
 from src.middlewares.logging import get_logger
 from src.retrieval.base import BaseRetriever, RetrievalResult
 logger = get_logger("document_retriever")
 # Change this one line to switch retrieval method
+# Options: "mmr" | "cosine" | "euclidean" | "inner_product" | "manhattan"
+_RETRIEVAL_METHOD = "mmr"
 _TABULAR_TYPES = {"csv", "xlsx"}
+_FETCH_K = 20
+_LAMBDA_MULT = 0.5
+_COLLECTION_NAME = "document_embeddings"
+@functools.cache
+def _get_embeddings() -> AzureOpenAIEmbeddings:
+    return AzureOpenAIEmbeddings(
+        azure_deployment=settings.azureai_deployment_name_embedding,
+        openai_api_version=settings.azureai_api_version_embedding,
+        azure_endpoint=settings.azureai_endpoint_url_embedding,
+        api_key=settings.azureai_api_key_embedding,
+    )
+@functools.cache
+def _get_euclidean_store() -> PGVector:
+    return PGVector(
+        embeddings=_get_embeddings(),
+        connection=_pgvector_engine,
+        collection_name=_COLLECTION_NAME,
+        distance_strategy=DistanceStrategy.EUCLIDEAN,
+        use_jsonb=True,
+        async_mode=True,
+        create_extension=False,
+    )
+@functools.cache
+def _get_ip_store() -> PGVector:
+    return PGVector(
+        embeddings=_get_embeddings(),
+        connection=_pgvector_engine,
+        collection_name=_COLLECTION_NAME,
+        distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
+        use_jsonb=True,
+        async_mode=True,
+        create_extension=False,
+    )
 _MANHATTAN_SQL = text("""
     SELECT
 """)
 class DocumentRetriever(BaseRetriever):
+    def __init__(self) -> None:
+        self.vector_store = get_vector_store()
     async def retrieve(
         self, query: str, user_id: str, k: int = 5
+    ) -> list[RetrievalResult]:
+        filter_ = {"user_id": user_id, "source_type": "document"}
+        fetch_k = k + len(_TABULAR_TYPES)
+        if _RETRIEVAL_METHOD == "manhattan":
+            return await self._retrieve_manhattan(query, user_id, k, fetch_k)
+        if _RETRIEVAL_METHOD == "mmr":
+            docs = await self.vector_store.amax_marginal_relevance_search(
+                query=query,
+                k=fetch_k,
+                fetch_k=_FETCH_K,
+                lambda_mult=_LAMBDA_MULT,
+                filter=filter_,
+            )
+            cosine = await self.vector_store.asimilarity_search_with_score(
+                query=query, k=fetch_k, filter=filter_,
+            )
+            score_map = {doc.page_content: score for doc, score in cosine}
+            docs_with_scores = [(doc, score_map.get(doc.page_content, 0.0)) for doc in docs]
+        elif _RETRIEVAL_METHOD == "euclidean":
+            docs_with_scores = await _get_euclidean_store().asimilarity_search_with_score(
+                query=query, k=fetch_k, filter=filter_,
+            )
+        elif _RETRIEVAL_METHOD == "inner_product":
+            docs_with_scores = await _get_ip_store().asimilarity_search_with_score(
+                query=query, k=fetch_k, filter=filter_,
+            )
+        else:  # cosine
+            docs_with_scores = await self.vector_store.asimilarity_search_with_score(
+                query=query, k=fetch_k, filter=filter_,
+            )
+        results = []
+        for doc, score in docs_with_scores:
+            file_type = doc.metadata.get("data", {}).get("file_type", "")
+            if file_type not in _TABULAR_TYPES:
+                results.append(RetrievalResult(
+                    content=doc.page_content,
+                    metadata=doc.metadata,
+                    score=score,
+                    source_type="document",
+                ))
+            if len(results) == k:
+                break
+        logger.info("retrieved chunks", method=_RETRIEVAL_METHOD, count=len(results))
+        return results
+    async def _retrieve_manhattan(
+        self, query: str, user_id: str, k: int, fetch_k: int
     ) -> list[RetrievalResult]:
         query_vector = await _get_embeddings().aembed_query(query)
         if not all(math.isfinite(v) for v in query_vector):
             raise ValueError("Embedding vector contains NaN or Infinity values.")
         vector_str = "[" + ",".join(str(v) for v in query_vector) + "]"
         async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(_MANHATTAN_SQL, {
                 "embedding": vector_str,
                 "collection": _COLLECTION_NAME,
                 "user_id": user_id,
             })
             rows = result.fetchall()
         results = []
         for row in rows:
             file_type = row.cmetadata.get("data", {}).get("file_type", "")
             if len(results) == k:
                 break
+        logger.info("retrieved chunks", method="manhattan", count=len(results))
         return results

uv.lock CHANGED Viewed

@@ -1,5 +1,5 @@
 version = 1
-revision = 2
 requires-python = "==3.12.*"
 resolution-markers = [
     "python_full_version >= '3.12.4'",
@@ -50,6 +50,7 @@ dependencies = [
     { name = "pyarrow" },
     { name = "pydantic" },
     { name = "pydantic-settings" },
     { name = "pymssql" },
     { name = "pymysql" },
     { name = "pypdf" },
@@ -137,6 +138,7 @@ requires-dist = [
     { name = "pyarrow", specifier = ">=24.0.0" },
     { name = "pydantic", specifier = "==2.10.3" },
     { name = "pydantic-settings", specifier = "==2.7.0" },
     { name = "pymssql", specifier = ">=2.3.0" },
     { name = "pymysql", specifier = ">=1.1.1" },
     { name = "pypdf", specifier = "==5.1.0" },
@@ -2558,6 +2560,27 @@ crypto = [
     { name = "cryptography" },
 ]
 [[package]]
 name = "pymssql"
 version = "2.3.13"

 version = 1
+revision = 3
 requires-python = "==3.12.*"
 resolution-markers = [
     "python_full_version >= '3.12.4'",
     { name = "pyarrow" },
     { name = "pydantic" },
     { name = "pydantic-settings" },
+    { name = "pymongo" },
     { name = "pymssql" },
     { name = "pymysql" },
     { name = "pypdf" },
     { name = "pyarrow", specifier = ">=24.0.0" },
     { name = "pydantic", specifier = "==2.10.3" },
     { name = "pydantic-settings", specifier = "==2.7.0" },
+    { name = "pymongo", specifier = ">=4.14.0" },
     { name = "pymssql", specifier = ">=2.3.0" },
     { name = "pymysql", specifier = ">=1.1.1" },
     { name = "pypdf", specifier = "==5.1.0" },
     { name = "cryptography" },
 ]
+[[package]]
+name = "pymongo"
+version = "4.16.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "dnspython" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/65/9c/a4895c4b785fc9865a84a56e14b5bd21ca75aadc3dab79c14187cdca189b/pymongo-4.16.0.tar.gz", hash = "sha256:8ba8405065f6e258a6f872fe62d797a28f383a12178c7153c01ed04e845c600c", size = 2495323, upload-time = "2026-01-07T18:05:48.107Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/6a/03/6dd7c53cbde98de469a3e6fb893af896dca644c476beb0f0c6342bcc368b/pymongo-4.16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bd4911c40a43a821dfd93038ac824b756b6e703e26e951718522d29f6eb166a8", size = 917619, upload-time = "2026-01-07T18:04:19.173Z" },
+    { url = "https://files.pythonhosted.org/packages/73/e1/328915f2734ea1f355dc9b0e98505ff670f5fab8be5e951d6ed70971c6aa/pymongo-4.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:25a6b03a68f9907ea6ec8bc7cf4c58a1b51a18e23394f962a6402f8e46d41211", size = 917364, upload-time = "2026-01-07T18:04:20.861Z" },
+    { url = "https://files.pythonhosted.org/packages/41/fe/4769874dd9812a1bc2880a9785e61eba5340da966af888dd430392790ae0/pymongo-4.16.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:91ac0cb0fe2bf17616c2039dac88d7c9a5088f5cb5829b27c9d250e053664d31", size = 1686901, upload-time = "2026-01-07T18:04:22.219Z" },
+    { url = "https://files.pythonhosted.org/packages/fa/8d/15707b9669fdc517bbc552ac60da7124dafe7ac1552819b51e97ed4038b4/pymongo-4.16.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cf0ec79e8ca7077f455d14d915d629385153b6a11abc0b93283ed73a8013e376", size = 1723034, upload-time = "2026-01-07T18:04:24.055Z" },
+    { url = "https://files.pythonhosted.org/packages/5b/af/3d5d16ff11d447d40c1472da1b366a31c7380d7ea2922a449c7f7f495567/pymongo-4.16.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2d0082631a7510318befc2b4fdab140481eb4b9dd62d9245e042157085da2a70", size = 1797161, upload-time = "2026-01-07T18:04:25.964Z" },
+    { url = "https://files.pythonhosted.org/packages/fb/04/725ab8664eeec73ec125b5a873448d80f5d8cf2750aaaf804cbc538a50a5/pymongo-4.16.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:85dc2f3444c346ea019a371e321ac868a4fab513b7a55fe368f0cc78de8177cc", size = 1780938, upload-time = "2026-01-07T18:04:28.745Z" },
+    { url = "https://files.pythonhosted.org/packages/22/50/dd7e9095e1ca35f93c3c844c92eb6eb0bc491caeb2c9bff3b32fe3c9b18f/pymongo-4.16.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dabbf3c14de75a20cc3c30bf0c6527157224a93dfb605838eabb1a2ee3be008d", size = 1714342, upload-time = "2026-01-07T18:04:30.331Z" },
+    { url = "https://files.pythonhosted.org/packages/03/c9/542776987d5c31ae8e93e92680ea2b6e5a2295f398b25756234cabf38a39/pymongo-4.16.0-cp312-cp312-win32.whl", hash = "sha256:60307bb91e0ab44e560fe3a211087748b2b5f3e31f403baf41f5b7b0a70bd104", size = 887868, upload-time = "2026-01-07T18:04:32.124Z" },
+    { url = "https://files.pythonhosted.org/packages/2e/d4/b4045a7ccc5680fb496d01edf749c7a9367cc8762fbdf7516cf807ef679b/pymongo-4.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:f513b2c6c0d5c491f478422f6b5b5c27ac1af06a54c93ef8631806f7231bd92e", size = 907554, upload-time = "2026-01-07T18:04:33.685Z" },
+    { url = "https://files.pythonhosted.org/packages/60/4c/33f75713d50d5247f2258405142c0318ff32c6f8976171c4fcae87a9dbdf/pymongo-4.16.0-cp312-cp312-win_arm64.whl", hash = "sha256:dfc320f08ea9a7ec5b2403dc4e8150636f0d6150f4b9792faaae539c88e7db3b", size = 892971, upload-time = "2026-01-07T18:04:35.594Z" },
+]
 [[package]]
 name = "pymssql"
 version = "2.3.13"