[KM-438][KM-439] Improve Retrieval and Querying feature

#15
by rhbt6767 - opened
Files changed (46)
  1. .dockerignore +6 -0
  2. .gitattributes +1 -0
  3. .gitignore +13 -1
  4. .vscode/launch.json +25 -0
  5. Dockerfile +2 -0
  6. README.md +2 -0
  7. main.py +2 -0
  8. pyproject.toml +19 -10
  9. src/agents/chatbot.py +11 -1
  10. src/agents/orchestration.py +5 -0
  11. src/api/v1/chat.py +80 -17
  12. src/api/v1/db_client.py +471 -3
  13. src/api/v1/document.py +43 -128
  14. src/config/agents/system_prompt.md +1 -2
  15. src/config/settings.py +5 -0
  16. src/database_client/database_client_service.py +164 -0
  17. src/db/postgres/init_db.py +43 -1
  18. src/db/postgres/models.py +16 -0
  19. src/document/document_service.py +17 -1
  20. src/knowledge/parquet_service.py +77 -0
  21. src/knowledge/processing_service.py +145 -56
  22. src/models/credentials.py +164 -0
  23. src/models/sql_query.py +8 -0
  24. src/models/structured_output.py +4 -0
  25. src/pipeline/db_pipeline/__init__.py +3 -0
  26. src/pipeline/db_pipeline/db_pipeline_service.py +302 -0
  27. src/pipeline/db_pipeline/extractor.py +283 -0
  28. src/pipeline/document_pipeline/__init__.py +0 -0
  29. src/pipeline/document_pipeline/document_pipeline.py +94 -0
  30. src/query/__init__.py +0 -0
  31. src/query/base.py +32 -0
  32. src/query/executors/__init__.py +0 -0
  33. src/query/executors/db_executor.py +648 -0
  34. src/query/executors/tabular.py +287 -0
  35. src/query/query_executor.py +42 -0
  36. src/rag/base.py +20 -0
  37. src/rag/retriever.py +24 -48
  38. src/rag/retrievers/__init__.py +0 -0
  39. src/rag/retrievers/baseline.py +76 -0
  40. src/rag/retrievers/document.py +158 -0
  41. src/rag/retrievers/schema.py +411 -0
  42. src/rag/router.py +179 -0
  43. src/storage/az_blob/az_blob.py +34 -0
  44. src/tools/search.py +3 -3
  45. src/utils/db_credential_encryption.py +70 -0
  46. uv.lock +440 -10
.dockerignore ADDED
@@ -0,0 +1,6 @@
+ .venv
+ software/
+ __pycache__
+ *.py[oc]
+ .env
+ .env.*
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ software/** filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -26,6 +26,10 @@ test/users/user_accounts.csv
  .env.prd
  .env.example

+ CLAUDE.md
+
+ /experiments
+ src/rag/experiments/
  erd/
  playground/
  playground_retriever.py
@@ -33,4 +37,12 @@ playground_chat.py
  playground_flush_cache.py
  playground_create_user.py
  API_CONTRACT.md
- context_engineering/
+ context_engineering/
+ sample_file/
+ test_tesseract.py
+
+ # Windows binaries — installed via apt in Docker instead
+ software/
+
+ tests/
+ .claude/
.vscode/launch.json ADDED
@@ -0,0 +1,25 @@
+ {
+   // Use IntelliSense to learn about possible attributes.
+   // Hover to view descriptions of existing attributes.
+   // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+   "version": "0.2.0",
+   "configurations": [
+     {
+       "name": "DataEyond: FastAPI (debug)",
+       "type": "debugpy",
+       "request": "launch",
+       "module": "uvicorn",
+       "args": [
+         "main:app",
+         "--host", "0.0.0.0",
+         "--port", "7860",
+         "--reload"
+       ],
+       "jinja": true,
+       "justMyCode": true,
+       "envFile": "${workspaceFolder}/.env",
+       "console": "integratedTerminal",
+       "cwd": "${workspaceFolder}"
+     }
+   ]
+ }
Dockerfile CHANGED
@@ -12,6 +12,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
      libpq-dev \
      gcc \
      libgomp1 \
+     tesseract-ocr \
+     poppler-utils \
      && rm -rf /var/lib/apt/lists/*

  RUN addgroup --system app && \
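The two new apt packages support the OCR path added to the processing pipeline. A minimal sketch of how they are typically used together (illustrative only, not code copied from `processing_service.py`):

```python
# pdf2image shells out to poppler (poppler-utils) to rasterize PDF pages,
# and pytesseract runs the Tesseract binary (tesseract-ocr) over each page image.
from pdf2image import convert_from_path
import pytesseract

def ocr_pdf(path: str) -> str:
    pages = convert_from_path(path, dpi=200)  # requires poppler-utils on PATH
    return "\n".join(pytesseract.image_to_string(page) for page in pages)  # requires tesseract-ocr
```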
README.md CHANGED
@@ -11,6 +11,8 @@ short_description: AI Agent core service
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference


+ # Agentic Service Data Eyond
+
  How to run:
  `uv run --no-sync uvicorn main:app --host 0.0.0.0 --port 7860`

main.py CHANGED
@@ -10,6 +10,7 @@ from src.api.v1.chat import router as chat_router
  from src.api.v1.room import router as room_router
  from src.api.v1.users import router as users_router
  from src.api.v1.knowledge import router as knowledge_router
+ from src.api.v1.db_client import router as db_client_router
  from src.db.postgres.init_db import init_db
  import uvicorn

@@ -35,6 +36,7 @@ app.include_router(document_router)
  app.include_router(knowledge_router)
  app.include_router(room_router)
  app.include_router(chat_router)
+ app.include_router(db_client_router)


  @app.on_event("startup")
pyproject.toml CHANGED
@@ -79,6 +79,18 @@ dependencies = [
      "jsonpatch>=1.33",
      "pymongo>=4.14.0",
      "psycopg2>=2.9.11",
+     # --- SQL parsing / guardrails ---
+     "sqlglot>=25.0.0",
+     # --- User-DB connectors (db_pipeline) ---
+     "pymysql>=1.1.1",
+     "pymssql>=2.3.0",
+     "sqlalchemy-bigquery>=1.11.0",
+     "snowflake-sqlalchemy>=1.7.0",
+     # --- OCR (pdf processing) ---
+     "pdf2image>=1.17.0",
+     "pytesseract>=0.3.13",
+     "pypdf2>=3.0.1",
+     "pyarrow>=24.0.0",
  ]

  [project.optional-dependencies]
@@ -92,16 +104,6 @@ dev = [
      "pre-commit==4.0.1",
  ]

- [tool.uv]
- dev-dependencies = [
-     "pytest==8.3.4",
-     "pytest-asyncio==0.24.0",
-     "pytest-cov==6.0.0",
-     "ruff==0.8.4",
-     "mypy==1.13.0",
-     "pre-commit==4.0.1",
- ]
-
  [tool.hatch.build.targets.wheel]
  packages = ["src/agent_service"]

@@ -133,3 +135,10 @@ testpaths = ["tests"]
  filterwarnings = [
      "ignore::DeprecationWarning",
  ]
+
+ [dependency-groups]
+ dev = [
+     "pytest>=8.3.4",
+     "pytest-asyncio>=0.24.0",
+     "ruff>=0.8.4",
+ ]
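`sqlglot` is listed under "SQL parsing / guardrails". A rough illustration of the kind of read-only gate it enables over model-generated SQL; the actual guardrails live in `src/query/executors/db_executor.py` (not reproduced here), and this sketch simply assumes a Postgres dialect:

```python
import sqlglot
from sqlglot import exp
from sqlglot.errors import ParseError

def is_read_only(sql: str, dialect: str = "postgres") -> bool:
    """Return True only for plain SELECT statements (no DML/DDL)."""
    try:
        tree = sqlglot.parse_one(sql, read=dialect)
    except ParseError:
        return False
    if not isinstance(tree, exp.Select):
        return False
    # Reject anything that nests a write or schema-changing operation.
    forbidden = (exp.Insert, exp.Update, exp.Delete, exp.Drop, exp.Create)
    return not any(tree.find(node) for node in forbidden)

assert is_read_only("SELECT id, name FROM customers LIMIT 10")
assert not is_read_only("DROP TABLE customers")
```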
src/agents/chatbot.py CHANGED
@@ -1,5 +1,6 @@
  """Chatbot agent with RAG capabilities."""

+ import tiktoken
  from langchain_openai import AzureChatOpenAI
  from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
  from langchain_core.output_parsers import StrOutputParser
@@ -9,6 +10,14 @@ from langchain_core.messages import HumanMessage, AIMessage

  logger = get_logger("chatbot")

+ _enc = tiktoken.get_encoding("cl100k_base")
+
+
+ def _count_tokens(messages: list, context: str) -> dict:
+     msg_tokens = sum(len(_enc.encode(m.content)) for m in messages)
+     ctx_tokens = len(_enc.encode(context))
+     return {"messages_tokens": msg_tokens, "context_tokens": ctx_tokens, "total": msg_tokens + ctx_tokens}
+

  class ChatbotAgent:
      """Chatbot agent with RAG capabilities."""
@@ -64,7 +73,8 @@ class ChatbotAgent:
      async def astream_response(self, messages: list, context: str = ""):
          """Stream response tokens as they are generated."""
          try:
-             logger.info("Streaming chatbot response")
+             token_counts = _count_tokens(messages, context)
+             logger.info("LLM input tokens", **token_counts)
              async for token in self.chain.astream({"messages": messages, "context": context}):
                  yield token
          except Exception as e:
src/agents/orchestration.py CHANGED
@@ -35,6 +35,11 @@ Intent Routing:
  - greeting -> needs_search=False, direct_response="Hello! How can I assist you today?"
  - goodbye -> needs_search=False, direct_response="Goodbye! Have a great day!"
  - other -> needs_search=True, search_query=<standalone rewritten query>
+
+ Source Routing (set source_hint):
+ - Columns, tables, sheets, data types, schema, row counts, statistics -> source_hint=schema
+ - Document content, paragraphs, reports, articles, text -> source_hint=document
+ - Unclear or spans both -> source_hint=both
  """),
  MessagesPlaceholder(variable_name="history"),
  ("user", "{message}")
src/api/v1/chat.py CHANGED
@@ -9,6 +9,9 @@ from src.db.postgres.models import ChatMessage, MessageSource
  from src.agents.orchestration import orchestrator
  from src.agents.chatbot import chatbot
  from src.rag.retriever import retriever
+ from src.rag.base import RetrievalResult
+ from src.query.query_executor import query_executor
+ from src.query.base import QueryResult
  from src.db.redis.connection import get_redis
  from src.config.settings import settings
  from src.middlewares.logging import get_logger, log_execution
@@ -45,34 +48,66 @@ class ChatRequest(BaseModel):
      message: str


- def _format_context(results: List[Dict[str, Any]]) -> str:
+ def _format_context(results: List[RetrievalResult]) -> str:
      """Format retrieval results as context string for the LLM."""
      lines = []
      for result in results:
-         filename = result["metadata"].get("filename", "Unknown")
-         page = result["metadata"].get("page_label")
+         data = result.metadata.get("data", {})
+         filename = data.get("filename", "Unknown")
+         page = data.get("page_label")
          source_label = f"{filename}, p.{page}" if page else filename
-         lines.append(f"[Source: {source_label}]\n{result['content']}\n")
+         lines.append(f"[Source: {source_label}]\n{result.content}\n")
      return "\n".join(lines)


- def _extract_sources(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+ def _extract_sources(results: List[RetrievalResult]) -> List[Dict[str, Any]]:
      """Extract deduplicated source references from retrieval results."""
      seen = set()
      sources = []
      for result in results:
-         meta = result["metadata"]
-         key = (meta.get("document_id"), meta.get("page_label"))
-         if key not in seen:
-             seen.add(key)
-             sources.append({
-                 "document_id": meta.get("document_id"),
-                 "filename": meta.get("filename", "Unknown"),
-                 "page_label": meta.get("page_label"),
-             })
+         meta = result.metadata
+         data = meta.get("data", {})
+         if "document_id" in data:
+             key = (data.get("document_id"), data.get("page_label"))
+             if key not in seen:
+                 seen.add(key)
+                 sources.append({
+                     "document_id": data.get("document_id"),
+                     "filename": data.get("filename", "Unknown"),
+                     "page_label": data.get("page_label", "Unknown"),
+                 })
+         else:
+             key = (data.get("table_name"), data.get("column_name"))
+             if key not in seen:
+                 seen.add(key)
+                 table_name = data.get("table_name")
+                 user_id = meta.get("user_id")
+                 sources.append({
+                     "document_id": f"{user_id}_{table_name}",
+                     "filename": data.get("table_name", "Unknown"),
+                     "page_label": data.get("column_name", "Unknown"),
+                 })
+
+     logger.debug(f"Extracted sources: {sources}")
      return sources


+ def _format_query_results(results: list[QueryResult]) -> str:
+     if not results:
+         return ""
+     lines = []
+     for r in results:
+         name = r.metadata.get("client_name", r.source_id)
+         lines.append(f"[Query result — {name}, tables: {r.table_or_file}]")
+         lines.append(f"SQL: {r.metadata.get('sql', '')}")
+         if r.columns and r.rows:
+             lines.append(" | ".join(r.columns))
+             for row in r.rows[:20]:
+                 lines.append(" | ".join(str(row.get(c, "")) for c in r.columns))
+         lines.append(f"({r.row_count} rows total)\n")
+     return "\n".join(lines)
+
+
  async def get_cached_response(redis, cache_key: str) -> Optional[str]:
      cached = await redis.get(cache_key)
      if cached:
@@ -155,9 +190,12 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
      sources: List[Dict[str, Any]] = []

      if intent_result is None:
-         # Step 2: Launch retrieval and history loading in parallel, then run orchestrator
+         # Step 2: Launch retrieval and history loading in parallel, then run orchestrator.
+         # k=5 tables — db_executor's FK expansion is one-hop and cannot bridge
+         # 2-hop gaps (e.g. customers -> order_items -> products) on its own.
          retrieval_task = asyncio.create_task(
-             retriever.retrieve(request.message, request.user_id, db)
+             retriever.retrieve(request.message, request.user_id, db, k=5)
          )
          history_task = asyncio.create_task(
              load_history(db, request.room_id, limit=6)  # 6 msgs (3 pairs) for orchestrator
@@ -165,18 +203,28 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
          history = await history_task  # fast DB query (<100ms), done before orchestrator finishes
          intent_result = await orchestrator.analyze_message(request.message, history)

+         search_query = intent_result.get("search_query", request.message) or request.message
          if not intent_result.get("needs_search"):
              retrieval_task.cancel()
+             try:
+                 await retrieval_task
+             except asyncio.CancelledError:
+                 pass
              raw_results = []
          else:
-             search_query = intent_result.get("search_query", request.message)
              logger.info(f"Searching for: {search_query}")
              if search_query != request.message:
                  retrieval_task.cancel()
+                 try:
+                     await retrieval_task
+                 except asyncio.CancelledError:
+                     pass
                  raw_results = await retriever.retrieve(
                      query=search_query,
                      user_id=request.user_id,
                      db=db,
+                     k=5,
+                     source_hint=intent_result.get("source_hint", "both"),
                  )
              else:
                  raw_results = await retrieval_task
@@ -184,6 +232,21 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
          context = _format_context(raw_results)
          sources = _extract_sources(raw_results)

+         source_hint = intent_result.get("source_hint", "both")
+         if source_hint in ("schema", "both"):
+             # Use search_query (orchestrator's standalone rewrite) so follow-up
+             # messages like "dive deeper" or "show me last year" resolve correctly.
+             # For first-turn questions search_query == request.message, so no change.
+             query_results = await query_executor.execute(
+                 results=raw_results,
+                 user_id=request.user_id,
+                 db=db,
+                 question=search_query,
+             )
+             query_context = _format_query_results(query_results)
+             if query_context:
+                 context = query_context + "\n\n" + context
+
      # Step 3: Direct response for greetings / non-document intents
      if intent_result.get("direct_response"):
          response = intent_result["direct_response"]
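The cancel-then-await blocks added above follow the standard asyncio pattern: cancelling a task only schedules the cancellation, and awaiting the task afterwards lets the CancelledError actually propagate, so the retrieval coroutine is cleaned up instead of being destroyed while still pending. A self-contained illustration (not project code):

```python
import asyncio

async def slow_retrieval() -> list:
    await asyncio.sleep(10)  # stands in for a vector-store query
    return ["chunk"]

async def main():
    task = asyncio.create_task(slow_retrieval())
    await asyncio.sleep(0)   # let the task start
    task.cancel()            # schedules cancellation only
    try:
        await task           # raises CancelledError once cancellation lands
    except asyncio.CancelledError:
        pass                 # task is now fully finalized

asyncio.run(main())
```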
src/api/v1/db_client.py CHANGED
@@ -1,5 +1,473 @@
- from typing import Literal, Dict
-
-
- dbtypes: Literal["postgresql", "mysql", "sqlite"] = Literal["postgresql", "mysql", "sqlite"]
- creds: Dict[str, str]
 
 
 
 
1
+ """API endpoints for user-registered database connections.
2
 
3
+ Credential schemas (DbType, PostgresCredentials, etc.) live in
4
+ `src/models/credentials.py` — they are imported below (with noqa: F401) so
5
+ FastAPI/Swagger picks them up for OpenAPI schema generation even though they
6
+ are not referenced by name in this file.
7
+ """
8
 
9
+ from typing import Any, Dict, List, Literal, Optional
10
+ from datetime import datetime
11
+
12
+ from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
13
+ from pydantic import BaseModel, Field
14
+ from sqlalchemy.ext.asyncio import AsyncSession
15
+
16
+ from src.database_client.database_client_service import database_client_service
17
+ from src.db.postgres.connection import get_db
18
+ from src.middlewares.logging import get_logger, log_execution
19
+ from src.middlewares.rate_limit import limiter
20
+ from src.models.credentials import ( # noqa: F401 — re-exported for Swagger schema discovery
21
+ BigQueryCredentials,
22
+ CredentialSchemas,
23
+ DbType,
24
+ MysqlCredentials,
25
+ PostgresCredentials,
26
+ SnowflakeCredentials,
27
+ SqlServerCredentials,
28
+ SupabaseCredentials,
29
+ )
30
+ from src.pipeline.db_pipeline import db_pipeline_service
31
+ from src.utils.db_credential_encryption import decrypt_credentials_dict
32
+
33
+ logger = get_logger("database_client_api")
34
+
35
+ router = APIRouter(prefix="/api/v1", tags=["Database Clients"])
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Request / Response schemas
40
+ # ---------------------------------------------------------------------------
41
+
42
+
43
+ class DatabaseClientCreate(BaseModel):
44
+ """
45
+ Payload to register a new external database connection.
46
+
47
+ The `credentials` object shape depends on `db_type`:
48
+
49
+ | db_type | Required fields |
50
+ |-------------|----------------------------------------------------------|
51
+ | postgres | host, port, database, username, password, ssl_mode |
52
+ | mysql | host, port, database, username, password, ssl |
53
+ | sqlserver | host, port, database, username, password, driver? |
54
+ | supabase | host, port, database, username, password, ssl_mode |
55
+ | bigquery | project_id, dataset_id, location?, service_account_json |
56
+ | snowflake | account, warehouse, database, schema?, username, password, role? |
57
+
58
+ Sensitive fields (`password`, `service_account_json`) are encrypted
59
+ at rest using Fernet symmetric encryption.
60
+ """
61
+
62
+ name: str = Field(..., description="Display name for this connection.", examples=["Production DB"])
63
+ db_type: DbType = Field(..., description="Type of the database engine.", examples=["postgres"])
64
+ credentials: Dict[str, Any] = Field(
65
+ ...,
66
+ description="Connection credentials. Shape depends on db_type. See schema descriptions above.",
67
+ examples=[
68
+ {
69
+ "host": "db.example.com",
70
+ "port": 5432,
71
+ "database": "mydb",
72
+ "username": "admin",
73
+ "password": "s3cr3t!",
74
+ "ssl_mode": "require",
75
+ }
76
+ ],
77
+ )
78
+
79
+
80
+ class DatabaseClientUpdate(BaseModel):
81
+ """
82
+ Payload to update an existing database connection.
83
+
84
+ All fields are optional — only provided fields will be updated.
85
+ If `credentials` is provided, it replaces the entire credentials object
86
+ and sensitive fields are re-encrypted.
87
+ """
88
+
89
+ name: Optional[str] = Field(None, description="New display name for this connection.", examples=["Staging DB"])
90
+ credentials: Optional[Dict[str, Any]] = Field(
91
+ None,
92
+ description="Updated credentials object. Replaces existing credentials entirely if provided.",
93
+ examples=[{"host": "new-host.example.com", "port": 5432, "database": "mydb", "username": "admin", "password": "n3wP@ss!", "ssl_mode": "require"}],
94
+ )
95
+ status: Optional[Literal["active", "inactive"]] = Field(
96
+ None,
97
+ description="Set to 'inactive' to soft-disable the connection without deleting it.",
98
+ examples=["inactive"],
99
+ )
100
+
101
+
102
+ class DatabaseClientResponse(BaseModel):
103
+ """
104
+ Database connection record returned by the API.
105
+
106
+ Credentials are **never** included in the response for security reasons.
107
+ """
108
+
109
+ id: str = Field(..., description="Unique identifier of the database connection.")
110
+ user_id: str = Field(..., description="ID of the user who owns this connection.")
111
+ name: str = Field(..., description="Display name of the connection.")
112
+ db_type: str = Field(..., description="Database engine type.")
113
+ status: str = Field(..., description="Connection status: 'active' or 'inactive'.")
114
+ created_at: datetime = Field(..., description="Timestamp when the connection was registered.")
115
+ updated_at: Optional[datetime] = Field(None, description="Timestamp of the last update, if any.")
116
+
117
+ model_config = {"from_attributes": True}
118
+
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # Supported DB types registry
122
+ # ---------------------------------------------------------------------------
123
+
124
+ _DB_TYPES: List[Dict[str, Any]] = [
125
+ {
126
+ "db_type": "postgres",
127
+ "display_name": "PostgreSQL",
128
+ "logo": "postgres",
129
+ "status": "active",
130
+ "message": None,
131
+ "fields": [
132
+ {"name": "host", "type": "string", "required": True, "default": None, "description": "Hostname or IP address"},
133
+ {"name": "port", "type": "integer", "required": False, "default": 5432, "description": "Port number"},
134
+ {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
135
+ {"name": "username", "type": "string", "required": True, "default": None, "description": "Database username"},
136
+ {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
137
+ {"name": "ssl_mode", "type": "select", "required": False, "default": "require", "description": "SSL mode", "options": ["disable", "require", "verify-ca", "verify-full"]},
138
+ ],
139
+ },
140
+ {
141
+ "db_type": "mysql",
142
+ "display_name": "MySQL",
143
+ "logo": "mysql",
144
+ "status": "active",
145
+ "message": None,
146
+ "fields": [
147
+ {"name": "host", "type": "string", "required": True, "default": None, "description": "Hostname or IP address"},
148
+ {"name": "port", "type": "integer", "required": False, "default": 3306, "description": "Port number"},
149
+ {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
150
+ {"name": "username", "type": "string", "required": True, "default": None, "description": "Database username"},
151
+ {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
152
+ {"name": "ssl", "type": "boolean", "required": False, "default": True, "description": "Enable SSL"},
153
+ ],
154
+ },
155
+ {
156
+ "db_type": "supabase",
157
+ "display_name": "Supabase",
158
+ "logo": "supabase",
159
+ "status": "active",
160
+ "message": None,
161
+ "fields": [
162
+ {"name": "host", "type": "string", "required": True, "default": None, "description": "Supabase database host"},
163
+ {"name": "port", "type": "integer", "required": False, "default": 5432, "description": "Port number (5432 direct, 6543 pooler)"},
164
+ {"name": "database", "type": "string", "required": False, "default": "postgres", "description": "Database name"},
165
+ {"name": "username", "type": "string", "required": True, "default": None, "description": "Database user"},
166
+ {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
167
+ {"name": "ssl_mode", "type": "select", "required": False, "default": "require", "description": "SSL mode", "options": ["require", "verify-ca", "verify-full"]},
168
+ ],
169
+ },
170
+ {
171
+ "db_type": "sqlserver",
172
+ "display_name": "SQL Server",
173
+ "logo": "sqlserver",
174
+ "status": "inactive",
175
+ "message": "Coming soon",
176
+ "fields": [
177
+ {"name": "host", "type": "string", "required": True, "default": None, "description": "Hostname or IP address"},
178
+ {"name": "port", "type": "integer", "required": False, "default": 1433, "description": "Port number"},
179
+ {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
180
+ {"name": "username", "type": "string", "required": True, "default": None, "description": "Database username"},
181
+ {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
182
+ {"name": "driver", "type": "string", "required": False, "default": None, "description": "ODBC driver name"},
183
+ ],
184
+ },
185
+ {
186
+ "db_type": "bigquery",
187
+ "display_name": "BigQuery",
188
+ "logo": "bigquery",
189
+ "status": "inactive",
190
+ "message": "Coming soon",
191
+ "fields": [
192
+ {"name": "project_id", "type": "string", "required": True, "default": None, "description": "GCP project ID"},
193
+ {"name": "dataset_id", "type": "string", "required": True, "default": None, "description": "BigQuery dataset name"},
194
+ {"name": "location", "type": "string", "required": False, "default": "US", "description": "Dataset location/region"},
195
+ {"name": "service_account_json", "type": "string", "required": True, "default": None, "description": "GCP Service Account key JSON", "sensitive": True},
196
+ ],
197
+ },
198
+ {
199
+ "db_type": "snowflake",
200
+ "display_name": "Snowflake",
201
+ "logo": "snowflake",
202
+ "status": "inactive",
203
+ "message": "Coming soon",
204
+ "fields": [
205
+ {"name": "account", "type": "string", "required": True, "default": None, "description": "Snowflake account identifier"},
206
+ {"name": "warehouse", "type": "string", "required": True, "default": None, "description": "Virtual warehouse name"},
207
+ {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
208
+ {"name": "schema", "type": "string", "required": False, "default": "PUBLIC", "description": "Schema name"},
209
+ {"name": "username", "type": "string", "required": True, "default": None, "description": "Snowflake username"},
210
+ {"name": "password", "type": "string", "required": True, "default": None, "description": "Snowflake password", "sensitive": True},
211
+ {"name": "role", "type": "string", "required": False, "default": None, "description": "Snowflake role"},
212
+ ],
213
+ },
214
+ ]
215
+
216
+
217
+ # ---------------------------------------------------------------------------
218
+ # Endpoints
219
+ # ---------------------------------------------------------------------------
220
+
221
+
222
+ @router.get(
223
+ "/database-clients/dbtypes",
224
+ summary="List supported database types",
225
+ response_description="All database types supported by DataEyond with their connection parameters.",
226
+ )
227
+ async def list_db_types():
228
+ """
229
+ Return every database type DataEyond can connect to, along with the
230
+ credential fields the frontend should render, a logo filename, and
231
+ an active/inactive status with an optional message.
232
+ """
233
+ return _DB_TYPES
234
+
235
+
236
+ @router.post(
237
+ "/database-clients",
238
+ response_model=DatabaseClientResponse,
239
+ status_code=status.HTTP_201_CREATED,
240
+ summary="Register a new database connection",
241
+ response_description="The newly created database connection record (credentials excluded).",
242
+ responses={
243
+ 201: {"description": "Connection registered successfully."},
244
+ 422: {"description": "Validation error — check the credentials shape for the given db_type."},
245
+ 500: {"description": "Internal server error."},
246
+ },
247
+ )
248
+ @limiter.limit("10/minute")
249
+ @log_execution(logger)
250
+ async def create_database_client(
251
+ request: Request,
252
+ payload: DatabaseClientCreate,
253
+ user_id: str = Query(..., description="ID of the user registering the connection."),
254
+ db: AsyncSession = Depends(get_db),
255
+ ):
256
+ """
257
+ Register a new external database connection for a user.
258
+
259
+ The `credentials` object must match the shape for the chosen `db_type`
260
+ (see **CredentialSchemas** in the schema section below for exact fields).
261
+ Sensitive fields (`password`, `service_account_json`) are encrypted
262
+ before being persisted — they are never returned in any response.
263
+ """
264
+ try:
265
+ client = await database_client_service.create(
266
+ db=db,
267
+ user_id=user_id,
268
+ name=payload.name,
269
+ db_type=payload.db_type,
270
+ credentials=payload.credentials,
271
+ )
272
+ return DatabaseClientResponse.model_validate(client)
273
+ except Exception as e:
274
+ logger.error(f"Failed to create database client for user {user_id}", error=str(e))
275
+ raise HTTPException(
276
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
277
+ detail=f"Failed to create database client: {str(e)}",
278
+ )
279
+
280
+
281
+ @router.get(
282
+ "/database-clients/{user_id}",
283
+ response_model=List[DatabaseClientResponse],
284
+ summary="List all database connections for a user",
285
+ response_description="List of database connections (credentials excluded).",
286
+ responses={
287
+ 200: {"description": "Returns an empty list if the user has no connections."},
288
+ },
289
+ )
290
+ @log_execution(logger)
291
+ async def list_database_clients(
292
+ user_id: str,
293
+ db: AsyncSession = Depends(get_db),
294
+ ):
295
+ """
296
+ Return all database connections registered by the specified user,
297
+ ordered by creation date (newest first).
298
+
299
+ Credentials are never included in the response.
300
+ """
301
+ clients = await database_client_service.get_user_clients(db, user_id)
302
+ return [DatabaseClientResponse.model_validate(c) for c in clients]
303
+
304
+
305
+ @router.get(
306
+ "/database-clients/{user_id}/{client_id}",
307
+ response_model=DatabaseClientResponse,
308
+ summary="Get a single database connection",
309
+ response_description="Database connection detail (credentials excluded).",
310
+ responses={
311
+ 404: {"description": "Connection not found."},
312
+ 403: {"description": "Access denied — user_id does not own this connection."},
313
+ },
314
+ )
315
+ @log_execution(logger)
316
+ async def get_database_client(
317
+ user_id: str,
318
+ client_id: str,
319
+ db: AsyncSession = Depends(get_db),
320
+ ):
321
+ """
322
+ Return the detail of a single database connection.
323
+
324
+ Returns **403** if the `user_id` in the path does not match the owner
325
+ of the requested connection.
326
+ """
327
+ client = await database_client_service.get(db, client_id)
328
+
329
+ if not client:
330
+ raise HTTPException(status_code=404, detail="Database client not found")
331
+
332
+ if client.user_id != user_id:
333
+ raise HTTPException(status_code=403, detail="Access denied")
334
+
335
+ return DatabaseClientResponse.model_validate(client)
336
+
337
+
338
+ @router.put(
339
+ "/database-clients/{client_id}",
340
+ response_model=DatabaseClientResponse,
341
+ summary="Update a database connection",
342
+ response_description="Updated database connection record (credentials excluded).",
343
+ responses={
344
+ 404: {"description": "Connection not found."},
345
+ 403: {"description": "Access denied — user_id does not own this connection."},
346
+ },
347
+ )
348
+ @log_execution(logger)
349
+ async def update_database_client(
350
+ client_id: str,
351
+ payload: DatabaseClientUpdate,
352
+ user_id: str = Query(..., description="ID of the user who owns the connection."),
353
+ db: AsyncSession = Depends(get_db),
354
+ ):
355
+ """
356
+ Update an existing database connection.
357
+
358
+ Only fields present in the request body are updated.
359
+ If `credentials` is provided it **replaces** the entire credentials object
360
+ and sensitive fields are re-encrypted automatically.
361
+ """
362
+ client = await database_client_service.get(db, client_id)
363
+
364
+ if not client:
365
+ raise HTTPException(status_code=404, detail="Database client not found")
366
+
367
+ if client.user_id != user_id:
368
+ raise HTTPException(status_code=403, detail="Access denied")
369
+
370
+ updated = await database_client_service.update(
371
+ db=db,
372
+ client_id=client_id,
373
+ name=payload.name,
374
+ credentials=payload.credentials,
375
+ status=payload.status,
376
+ )
377
+ return DatabaseClientResponse.model_validate(updated)
378
+
379
+
380
+ @router.delete(
381
+ "/database-clients/{client_id}",
382
+ status_code=status.HTTP_200_OK,
383
+ summary="Delete a database connection",
384
+ responses={
385
+ 200: {"description": "Connection deleted successfully."},
386
+ 404: {"description": "Connection not found."},
387
+ 403: {"description": "Access denied — user_id does not own this connection."},
388
+ },
389
+ )
390
+ @log_execution(logger)
391
+ async def delete_database_client(
392
+ client_id: str,
393
+ user_id: str = Query(..., description="ID of the user who owns the connection."),
394
+ db: AsyncSession = Depends(get_db),
395
+ ):
396
+ """
397
+ Permanently delete a database connection.
398
+
399
+ This action is irreversible. The stored credentials are also removed.
400
+ """
401
+ client = await database_client_service.get(db, client_id)
402
+
403
+ if not client:
404
+ raise HTTPException(status_code=404, detail="Database client not found")
405
+
406
+ if client.user_id != user_id:
407
+ raise HTTPException(status_code=403, detail="Access denied")
408
+
409
+ await database_client_service.delete(db, client_id)
410
+ return {"status": "success", "message": "Database client deleted successfully"}
411
+
412
+
413
+ @router.post(
414
+ "/database-clients/{client_id}/ingest",
415
+ status_code=status.HTTP_200_OK,
416
+ summary="Ingest schema from a registered database into the vector store",
417
+ response_description="Count of chunks ingested.",
418
+ responses={
419
+ 200: {"description": "Ingestion completed successfully."},
420
+ 403: {"description": "Access denied — user_id does not own this connection."},
421
+ 404: {"description": "Connection not found."},
422
+ 501: {"description": "The connection's db_type is not yet supported by the pipeline."},
423
+ 500: {"description": "Ingestion failed (connection error, profiling error, etc.)."},
424
+ },
425
+ )
426
+ @limiter.limit("5/minute")
427
+ @log_execution(logger)
428
+ async def ingest_database_client(
429
+ request: Request,
430
+ client_id: str,
431
+ user_id: str = Query(..., description="ID of the user who owns the connection."),
432
+ db: AsyncSession = Depends(get_db),
433
+ ):
434
+ """
435
+ Decrypt the stored credentials, connect to the user's database, introspect
436
+ its schema, profile each column, embed the descriptions, and store them in
437
+ the shared PGVector collection tagged with `source_type="database"`.
438
+
439
+ Chunks become retrievable via the same retriever used for document chunks.
440
+ """
441
+ client = await database_client_service.get(db, client_id)
442
+
443
+ if not client:
444
+ raise HTTPException(status_code=404, detail="Database client not found")
445
+
446
+ if client.user_id != user_id:
447
+ raise HTTPException(status_code=403, detail="Access denied")
448
+
449
+ if client.status != "active":
450
+ raise HTTPException(
451
+ status_code=status.HTTP_409_CONFLICT,
452
+ detail="Cannot ingest from an inactive database connection.",
453
+ )
454
+
455
+ try:
456
+ creds = decrypt_credentials_dict(client.credentials)
457
+ with db_pipeline_service.engine_scope(
458
+ db_type=client.db_type,
459
+ credentials=creds,
460
+ ) as engine:
461
+ total = await db_pipeline_service.run(user_id=user_id, client_id=client_id, engine=engine)
462
+ except NotImplementedError as e:
463
+ raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail=str(e))
464
+ except Exception as e:
465
+ logger.error(
466
+ f"Ingestion failed for client {client_id}", user_id=user_id, error=str(e)
467
+ )
468
+ raise HTTPException(
469
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
470
+ detail=f"Ingestion failed: {e}",
471
+ )
472
+
473
+ return {"status": "success", "client_id": client_id, "chunks_ingested": total}
src/api/v1/document.py CHANGED
@@ -1,21 +1,20 @@
1
  """Document management API endpoints."""
2
-
3
- from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File, status
4
  from sqlalchemy.ext.asyncio import AsyncSession
5
  from src.db.postgres.connection import get_db
6
  from src.document.document_service import document_service
7
- from src.knowledge.processing_service import knowledge_processor
8
- from src.storage.az_blob.az_blob import blob_storage
9
  from src.middlewares.logging import get_logger, log_execution
10
  from src.middlewares.rate_limit import limiter
 
11
  from pydantic import BaseModel
12
  from typing import List
13
-
14
  logger = get_logger("document_api")
15
-
16
  router = APIRouter(prefix="/api/v1", tags=["Documents"])
17
-
18
-
19
  class DocumentResponse(BaseModel):
20
  id: str
21
  filename: str
@@ -23,6 +22,27 @@ class DocumentResponse(BaseModel):
23
  file_size: int
24
  file_type: str
25
  created_at: str
 
 
26
 
27
 
28
  @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
@@ -44,8 +64,8 @@ async def list_documents(
44
  )
45
  for doc in documents
46
  ]
47
-
48
-
49
  @router.post("/document/upload")
50
  @limiter.limit("10/minute")
51
  @log_execution(logger)
@@ -57,57 +77,12 @@ async def upload_document(
57
  ):
58
  """Upload a document."""
59
  if not user_id:
60
- raise HTTPException(
61
- status_code=400,
62
- detail="user_id is required"
63
- )
64
-
65
- try:
66
- # Read file content
67
- content = await file.read()
68
- file_size = len(content)
69
-
70
- # Get file type
71
- filename = file.filename
72
- file_type = filename.split('.')[-1].lower() if '.' in filename else 'txt'
73
-
74
- if file_type not in ['pdf', 'docx', 'txt']:
75
- raise HTTPException(
76
- status_code=400,
77
- detail="Unsupported file type. Supported: pdf, docx, txt"
78
- )
79
-
80
- # Upload to blob storage
81
- blob_name = await blob_storage.upload_file(content, filename, user_id)
82
-
83
- # Create document record
84
- document = await document_service.create_document(
85
- db=db,
86
- user_id=user_id,
87
- filename=filename,
88
- blob_name=blob_name,
89
- file_size=file_size,
90
- file_type=file_type
91
- )
92
-
93
- return {
94
- "status": "success",
95
- "message": "Document uploaded successfully",
96
- "data": {
97
- "id": document.id,
98
- "filename": document.filename,
99
- "status": document.status
100
- }
101
- }
102
-
103
- except Exception as e:
104
- logger.error(f"Upload failed for user {user_id}", error=str(e))
105
- raise HTTPException(
106
- status_code=500,
107
- detail=f"Upload failed: {str(e)}"
108
- )
109
-
110
-
111
  @router.delete("/document/delete")
112
  @log_execution(logger)
113
  async def delete_document(
@@ -116,31 +91,10 @@ async def delete_document(
116
  db: AsyncSession = Depends(get_db)
117
  ):
118
  """Delete a document."""
119
- document = await document_service.get_document(db, document_id)
120
-
121
- if not document:
122
- raise HTTPException(
123
- status_code=404,
124
- detail="Document not found"
125
- )
126
-
127
- if document.user_id != user_id:
128
- raise HTTPException(
129
- status_code=403,
130
- detail="Access denied"
131
- )
132
-
133
- success = await document_service.delete_document(db, document_id)
134
-
135
- if success:
136
- return {"status": "success", "message": "Document deleted successfully"}
137
- else:
138
- raise HTTPException(
139
- status_code=500,
140
- detail="Failed to delete document"
141
- )
142
-
143
-
144
  @router.post("/document/process")
145
  @log_execution(logger)
146
  async def process_document(
@@ -149,45 +103,6 @@ async def process_document(
149
  db: AsyncSession = Depends(get_db)
150
  ):
151
  """Process document and ingest to vector index."""
152
- document = await document_service.get_document(db, document_id)
153
-
154
- if not document:
155
- raise HTTPException(
156
- status_code=404,
157
- detail="Document not found"
158
- )
159
-
160
- if document.user_id != user_id:
161
- raise HTTPException(
162
- status_code=403,
163
- detail="Access denied"
164
- )
165
-
166
- try:
167
- # Update status to processing
168
- await document_service.update_document_status(db, document_id, "processing")
169
-
170
- # Process document
171
- chunks_count = await knowledge_processor.process_document(document, db)
172
-
173
- # Update status to completed
174
- await document_service.update_document_status(db, document_id, "completed")
175
-
176
- return {
177
- "status": "success",
178
- "message": "Document processed successfully",
179
- "data": {
180
- "document_id": document_id,
181
- "chunks_processed": chunks_count
182
- }
183
- }
184
-
185
- except Exception as e:
186
- logger.error(f"Processing failed for document {document_id}", error=str(e))
187
- await document_service.update_document_status(
188
- db, document_id, "failed", str(e)
189
- )
190
- raise HTTPException(
191
- status_code=500,
192
- detail=f"Processing failed: {str(e)}"
193
- )
 
1
  """Document management API endpoints."""
2
+
3
+ from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File
4
  from sqlalchemy.ext.asyncio import AsyncSession
5
  from src.db.postgres.connection import get_db
6
  from src.document.document_service import document_service
 
 
7
  from src.middlewares.logging import get_logger, log_execution
8
  from src.middlewares.rate_limit import limiter
9
+ from src.pipeline.document_pipeline.document_pipeline import document_pipeline
10
  from pydantic import BaseModel
11
  from typing import List
12
+
13
  logger = get_logger("document_api")
14
+
15
  router = APIRouter(prefix="/api/v1", tags=["Documents"])
16
+
17
+
18
  class DocumentResponse(BaseModel):
19
  id: str
20
  filename: str
 
22
  file_size: int
23
  file_type: str
24
  created_at: str
25
+
26
+
27
+ # NOTE: Keep in sync with SUPPORTED_FILE_TYPES in src/pipeline/document_pipeline/document_pipeline.py
28
+ _DOC_TYPES = [
29
+ {"doc_type": "pdf", "max_size": 10, "status": "active", "message": None},
30
+ {"doc_type": "docx", "max_size": 10, "status": "active", "message": None},
31
+ {"doc_type": "txt", "max_size": 10, "status": "active", "message": None},
32
+ {"doc_type": "csv", "max_size": 10, "status": "active", "message": None},
33
+ {"doc_type": "xlsx", "max_size": 10, "status": "active", "message": None},
34
+ ]
35
+
36
+
37
+ @router.get(
38
+ "/documents/doctypes",
39
+ summary="List supported document types",
40
+ response_description="All document types supported by DataEyond with their size limits and status.",
41
+ )
42
+ @log_execution(logger)
43
+ async def get_document_types():
44
+ """Return every document type DataEyond can process, with max file size and active/inactive status."""
45
+ return {"status": "success", "data": _DOC_TYPES}
46
 
47
 
48
  @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
 
64
  )
65
  for doc in documents
66
  ]
67
+
68
+
69
  @router.post("/document/upload")
70
  @limiter.limit("10/minute")
71
  @log_execution(logger)
 
77
  ):
78
  """Upload a document."""
79
  if not user_id:
80
+ raise HTTPException(status_code=400, detail="user_id is required")
81
+
82
+ data = await document_pipeline.upload(file, user_id, db)
83
+ return {"status": "success", "message": "Document uploaded successfully", "data": data}
84
+
85
+
 
 
 
 
 
86
  @router.delete("/document/delete")
87
  @log_execution(logger)
88
  async def delete_document(
 
91
  db: AsyncSession = Depends(get_db)
92
  ):
93
  """Delete a document."""
94
+ await document_pipeline.delete(document_id, user_id, db)
95
+ return {"status": "success", "message": "Document deleted successfully"}
96
+
97
+
 
 
 
 
98
  @router.post("/document/process")
99
  @log_execution(logger)
100
  async def process_document(
 
103
  db: AsyncSession = Depends(get_db)
104
  ):
105
  """Process document and ingest to vector index."""
106
+ data = await document_pipeline.process(document_id, user_id, db)
107
+ return {"status": "success", "message": "Document processed successfully", "data": data}
108
+
 
 
 
 
src/config/agents/system_prompt.md CHANGED
@@ -3,8 +3,7 @@ You are a helpful AI assistant with access to user's uploaded documents. Your ro
  1. Answer questions based on provided document context
  2. If no relevant information is found in documents, acknowledge this honestly
  3. Be concise and direct in your responses
- 4. Cite source documents when providing information
- 5. If user's question is unclear, ask for clarification
+ 4. If user's question is unclear, ask for clarification

  When document context is provided:
  - Use information from documents to answer accurately
src/config/settings.py CHANGED
@@ -61,6 +61,11 @@ class Settings(BaseSettings):
      # Bcrypt salt (for users - existing)
      emarcal_bcrypt_salt: str = Field(alias="emarcal__bcrypt__salt", default="")

+     # DB credential encryption (Fernet key for user-registered database creds)
+     dataeyond_db_credential_key: str = Field(
+         alias="dataeyond__db__credential__key"
+     )
+

  # Singleton instance
  settings = Settings()
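The new setting is the Fernet key consumed by `src/utils/db_credential_encryption.py` (added in this PR but not shown in this hunk). As a rough sketch of what Fernet-based credential handling looks like, with illustrative helper names rather than that module's actual API:

```python
from cryptography.fernet import Fernet

# In the service the key comes from settings.dataeyond_db_credential_key;
# here we generate a throwaway key so the sketch runs standalone.
fernet = Fernet(Fernet.generate_key())

def encrypt_value(value: str) -> str:
    return fernet.encrypt(value.encode()).decode()

def decrypt_value(token: str) -> str:
    return fernet.decrypt(token.encode()).decode()

creds = {"host": "db.example.com", "password": "s3cr3t!"}
creds["password"] = encrypt_value(creds["password"])  # only sensitive fields are encrypted
assert decrypt_value(creds["password"]) == "s3cr3t!"
```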
src/database_client/database_client_service.py ADDED
@@ -0,0 +1,164 @@
 
 
 
1
+ """Service for managing user-registered external database connections."""
2
+
3
+ import uuid
4
+ from typing import List, Optional
5
+
6
+ from sqlalchemy import delete, select
7
+ from sqlalchemy.ext.asyncio import AsyncSession
8
+
9
+ from src.db.postgres.models import DatabaseClient
10
+ from src.middlewares.logging import get_logger
11
+ from src.utils.db_credential_encryption import (
12
+ decrypt_credentials_dict,
13
+ encrypt_credentials_dict,
14
+ )
15
+
16
+ logger = get_logger("database_client_service")
17
+
18
+
19
+ # Fields that identify the same physical database per db_type.
20
+ _CONNECTION_IDENTITY_KEYS: dict[str, tuple[str, ...]] = {
21
+ "postgres": ("host", "port", "database"),
22
+ "supabase": ("host", "port", "database"),
23
+ "mysql": ("host", "port", "database"),
24
+ "sqlserver": ("host", "port", "database"),
25
+ "bigquery": ("project_id", "dataset_id"),
26
+ "snowflake": ("account", "warehouse", "database"),
27
+ }
28
+
29
+
30
+ class DatabaseClientService:
31
+ """Service for managing user-registered external database connections."""
32
+
33
+ async def _find_duplicate(
34
+ self,
35
+ db: AsyncSession,
36
+ user_id: str,
37
+ db_type: str,
38
+ credentials: dict,
39
+ ) -> Optional[DatabaseClient]:
40
+ """Return an existing client if it points to the same physical database."""
41
+ identity_keys = _CONNECTION_IDENTITY_KEYS.get(db_type, ())
42
+ if not identity_keys:
43
+ return None
44
+
45
+ result = await db.execute(
46
+ select(DatabaseClient).where(
47
+ DatabaseClient.user_id == user_id,
48
+ DatabaseClient.db_type == db_type,
49
+ )
50
+ )
51
+ for existing in result.scalars().all():
52
+ decrypted = decrypt_credentials_dict(existing.credentials)
53
+ if all(
54
+ decrypted.get(k) == credentials.get(k) for k in identity_keys
55
+ ):
56
+ return existing
57
+ return None
58
+
59
+ async def create(
60
+ self,
61
+ db: AsyncSession,
62
+ user_id: str,
63
+ name: str,
64
+ db_type: str,
65
+ credentials: dict,
66
+ ) -> DatabaseClient:
67
+ """Register a new database client connection.
68
+
69
+ If a connection to the same physical database already exists for this
70
+ user, the existing record is returned instead of creating a duplicate.
71
+ Credentials are encrypted before being stored.
72
+ """
73
+ existing = await self._find_duplicate(db, user_id, db_type, credentials)
74
+ if existing:
75
+ logger.info(
76
+ f"Duplicate connection detected, returning existing client {existing.id}"
77
+ )
78
+ return existing
79
+
80
+ client = DatabaseClient(
81
+ id=str(uuid.uuid4()),
82
+ user_id=user_id,
83
+ name=name,
84
+ db_type=db_type,
85
+ credentials=encrypt_credentials_dict(credentials),
86
+ status="active",
87
+ )
88
+ db.add(client)
89
+ await db.commit()
90
+ await db.refresh(client)
91
+ logger.info(f"Created database client {client.id} for user {user_id}")
92
+ return client
93
+
94
+ async def get_user_clients(
95
+ self,
96
+ db: AsyncSession,
97
+ user_id: str,
98
+ ) -> List[DatabaseClient]:
99
+ """Return all active and inactive database clients for a user."""
100
+ result = await db.execute(
101
+ select(DatabaseClient)
102
+ .where(DatabaseClient.user_id == user_id)
103
+ .order_by(DatabaseClient.created_at.desc())
104
+ )
105
+ return result.scalars().all()
106
+
107
+ async def get(
108
+ self,
109
+ db: AsyncSession,
110
+ client_id: str,
111
+ ) -> Optional[DatabaseClient]:
112
+ """Return a single database client by its ID."""
113
+ result = await db.execute(
114
+ select(DatabaseClient).where(DatabaseClient.id == client_id)
115
+ )
116
+ return result.scalars().first()
117
+
118
+ async def update(
119
+ self,
120
+ db: AsyncSession,
121
+ client_id: str,
122
+ name: Optional[str] = None,
123
+ credentials: Optional[dict] = None,
124
+ status: Optional[str] = None,
125
+ ) -> Optional[DatabaseClient]:
126
+ """Update an existing database client connection.
127
+
128
+ Only non-None fields are updated.
129
+ Credentials are re-encrypted if provided.
130
+ """
131
+ client = await self.get(db, client_id)
132
+ if not client:
133
+ return None
134
+
135
+ if name is not None:
136
+ client.name = name
137
+ if credentials is not None:
138
+ client.credentials = encrypt_credentials_dict(credentials)
139
+ if status is not None:
140
+ client.status = status
141
+
142
+ await db.commit()
143
+ await db.refresh(client)
144
+ logger.info(f"Updated database client {client_id}")
145
+ return client
146
+
147
+ async def delete(
148
+ self,
149
+ db: AsyncSession,
150
+ client_id: str,
151
+ ) -> bool:
152
+ """Permanently delete a database client connection."""
153
+ result = await db.execute(
154
+ delete(DatabaseClient).where(DatabaseClient.id == client_id)
155
+ )
156
+ await db.commit()
157
+ deleted = result.rowcount > 0
158
+ if deleted:
159
+ logger.info(f"Deleted database client {client_id}")
160
+ return deleted
161
+
162
+
163
+ database_client_service = DatabaseClientService()
164
+
src/db/postgres/init_db.py CHANGED
@@ -2,7 +2,14 @@

  from sqlalchemy import text
  from src.db.postgres.connection import engine, Base
- from src.db.postgres.models import Document, Room, ChatMessage, User, MessageSource
+ from src.db.postgres.models import (
+     ChatMessage,
+     DatabaseClient,
+     Document,
+     MessageSource,
+     Room,
+     User,
+ )


  async def init_db():
@@ -21,3 +28,38 @@ async def init_db():
          await conn.execute(text(
              "ALTER TABLE rooms ADD COLUMN IF NOT EXISTS status VARCHAR NOT NULL DEFAULT 'active'"
          ))
+
+         # HNSW index for fast approximate vector similarity search.
+         # Only created when the embedding column has explicit dimensions (HNSW requirement).
+         # atttypmod > 0 means the vector column was created with a dimension (e.g. vector(1536));
+         # atttypmod = -1 means dimensionless — HNSW would fail with "column does not have dimensions".
+         await conn.execute(text("""
+             DO $$
+             BEGIN
+                 IF EXISTS (
+                     SELECT FROM pg_attribute a
+                     JOIN pg_class c ON c.oid = a.attrelid
+                     WHERE c.relname = 'langchain_pg_embedding'
+                       AND a.attname = 'embedding'
+                       AND a.atttypmod > 0
+                 ) THEN
+                     CREATE INDEX IF NOT EXISTS idx_langchain_pg_embedding_hnsw
+                     ON langchain_pg_embedding USING hnsw (embedding vector_cosine_ops);
+                 END IF;
+             END $$
+         """))
+
+         # GIN index for FTS on schema chunks — only created if the table exists
+         # (langchain_pg_embedding is created by PGVector on first use, not by create_all)
+         await conn.execute(text("""
+             DO $$
+             BEGIN
+                 IF EXISTS (
+                     SELECT FROM information_schema.tables
+                     WHERE table_name = 'langchain_pg_embedding'
+                 ) THEN
+                     CREATE INDEX IF NOT EXISTS idx_langchain_pg_embedding_fts
+                     ON langchain_pg_embedding USING GIN (to_tsvector('english', document));
+                 END IF;
+             END $$
+         """))
src/db/postgres/models.py CHANGED
@@ -4,6 +4,7 @@ from uuid import uuid4
4
  from sqlalchemy import Column, String, DateTime, Text, Integer, ForeignKey
5
  from sqlalchemy.orm import relationship
6
  from sqlalchemy.sql import func
 
7
  from src.db.postgres.connection import Base
8
 
9
 
@@ -81,3 +82,18 @@ class MessageSource(Base):
81
  created_at = Column(DateTime(timezone=True), server_default=func.now())
82
 
83
  message = relationship("ChatMessage", back_populates="sources")
4
  from sqlalchemy import Column, String, DateTime, Text, Integer, ForeignKey
5
  from sqlalchemy.orm import relationship
6
  from sqlalchemy.sql import func
7
+ from sqlalchemy.dialects.postgresql import JSONB
8
  from src.db.postgres.connection import Base
9
 
10
 
 
82
  created_at = Column(DateTime(timezone=True), server_default=func.now())
83
 
84
  message = relationship("ChatMessage", back_populates="sources")
85
+
86
+
87
+ class DatabaseClient(Base):
88
+ """User-registered external database connections."""
89
+ __tablename__ = "databases"
90
+
91
+ id = Column(String, primary_key=True, default=lambda: str(uuid4()))
92
+ user_id = Column(String, nullable=False, index=True)
93
+ name = Column(String, nullable=False) # display name, e.g. "Prod DB"
94
+ db_type = Column(String, nullable=False) # postgres|mysql|sqlserver|supabase|bigquery|snowflake
95
+ credentials = Column(JSONB, nullable=False) # per-type JSON; sensitive fields Fernet-encrypted
96
+ status = Column(String, nullable=False, default="active") # active | inactive
97
+ created_at = Column(DateTime(timezone=True), server_default=func.now())
98
+ updated_at = Column(DateTime(timezone=True), onupdate=func.now())
99
+
src/document/document_service.py CHANGED
@@ -1,8 +1,9 @@
1
  """Service for managing documents."""
2
 
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
- from sqlalchemy import select, delete
5
  from src.db.postgres.models import Document
 
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.middlewares.logging import get_logger
8
  from typing import List, Optional
@@ -77,6 +78,21 @@ class DocumentService:
77
  # Delete from blob storage
78
  await blob_storage.delete_file(document.blob_name)
79
 
80
  # Delete from database
81
  await db.execute(
82
  delete(Document).where(Document.id == document_id)
 
1
  """Service for managing documents."""
2
 
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
+ from sqlalchemy import select, delete, text
5
  from src.db.postgres.models import Document
6
+ from src.db.postgres.connection import _pgvector_engine
7
  from src.storage.az_blob.az_blob import blob_storage
8
  from src.middlewares.logging import get_logger
9
  from typing import List, Optional
 
78
  # Delete from blob storage
79
  await blob_storage.delete_file(document.blob_name)
80
 
81
+ # Delete vector embeddings from pgvector (scoped to user + collection to avoid cross-user over-delete)
82
+ async with _pgvector_engine.begin() as conn:
83
+ await conn.execute(
84
+ text("""
85
+ DELETE FROM langchain_pg_embedding
86
+ WHERE cmetadata->>'user_id' = :user_id
87
+ AND cmetadata->>'source_type' = 'document'
88
+ AND cmetadata->'data'->>'document_id' = :doc_id
89
+ AND collection_id = (
90
+ SELECT uuid FROM langchain_pg_collection WHERE name = 'document_embeddings'
91
+ )
92
+ """),
93
+ {"user_id": document.user_id, "doc_id": document_id},
94
+ )
95
+
96
  # Delete from database
97
  await db.execute(
98
  delete(Document).where(Document.id == document_id)
src/knowledge/parquet_service.py ADDED
@@ -0,0 +1,77 @@
1
+ """Parquet service — converts, uploads, downloads, and deletes Parquet files for CSV/XLSX.
2
+
3
+ Parquet files are stored in Azure Blob alongside the original document using
4
+ a deterministic naming convention based on document_id:
5
+
6
+ CSV: {user_id}/{document_id}.parquet
7
+ XLSX sheet: {user_id}/{document_id}__{safe_sheet_name}.parquet
8
+
9
+ This allows tabular.py to construct the correct blob name at retrieval time
10
+ without needing to store it separately, and allows document_pipeline.py to
11
+ delete all Parquet files for a document using a prefix delete.
12
+ """
13
+
14
+ import io
15
+
16
+ import pandas as pd
17
+
18
+ from src.middlewares.logging import get_logger
19
+ from src.storage.az_blob.az_blob import blob_storage
20
+
21
+ logger = get_logger("parquet_service")
22
+
23
+
24
+ def _safe_sheet_name(sheet_name: str) -> str:
25
+ return sheet_name.replace("/", "_").replace(" ", "_").replace("\\", "_")
26
+
27
+
28
+ def parquet_blob_name(user_id: str, document_id: str, sheet_name: str | None = None) -> str:
29
+ """Construct deterministic Parquet blob name."""
30
+ if sheet_name:
31
+ return f"{user_id}/{document_id}__{_safe_sheet_name(sheet_name)}.parquet"
32
+ return f"{user_id}/{document_id}.parquet"
33
+
34
+
35
+ def _to_parquet_bytes(df: pd.DataFrame) -> bytes:
36
+ buf = io.BytesIO()
37
+ df.to_parquet(buf, index=False)
38
+ return buf.getvalue()
39
+
40
+
41
+ async def upload_parquet(
42
+ df: pd.DataFrame,
43
+ user_id: str,
44
+ document_id: str,
45
+ sheet_name: str | None = None,
46
+ ) -> str:
47
+ """Convert DataFrame to Parquet and upload to Azure Blob. Returns blob_name."""
48
+ blob_name = parquet_blob_name(user_id, document_id, sheet_name)
49
+ parquet_bytes = _to_parquet_bytes(df)
50
+ await blob_storage.upload_bytes(parquet_bytes, blob_name)
51
+ logger.info(f"Uploaded Parquet {blob_name} ({len(parquet_bytes)} bytes)")
52
+ return blob_name
53
+
54
+
55
+ async def download_parquet(
56
+ user_id: str,
57
+ document_id: str,
58
+ sheet_name: str | None = None,
59
+ ) -> pd.DataFrame:
60
+ """Download Parquet from Azure Blob and return as DataFrame."""
61
+ blob_name = parquet_blob_name(user_id, document_id, sheet_name)
62
+ content = await blob_storage.download_file(blob_name)
63
+ df = pd.read_parquet(io.BytesIO(content))
64
+ logger.info(f"Downloaded Parquet {blob_name}: {len(df)} rows, {len(df.columns)} columns")
65
+ return df
66
+
67
+
68
+ async def delete_document_parquets(user_id: str, document_id: str) -> int:
69
+ """Delete all Parquet files for a document (CSV = 1 file, XLSX = one per sheet).
70
+
71
+ Uses prefix delete: {user_id}/{document_id} matches all Parquet variants
72
+ for this document without touching the original blob (which uses a random UUID name).
73
+ """
74
+ prefix = f"{user_id}/{document_id}"
75
+ deleted = await blob_storage.delete_blobs_with_prefix(prefix)
76
+ logger.info(f"Deleted {deleted} Parquet file(s) for document {document_id}")
77
+ return deleted
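A quick illustration of the naming convention (IDs below are made up):

```python
from src.knowledge.parquet_service import parquet_blob_name

parquet_blob_name("user-1", "doc-42")
# -> "user-1/doc-42.parquet"                  (CSV: one file per document)
parquet_blob_name("user-1", "doc-42", "Q1 / Revenue")
# -> "user-1/doc-42__Q1___Revenue.parquet"    (XLSX: one file per sheet;
#                                              slashes, spaces and backslashes become '_')
```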
src/knowledge/processing_service.py CHANGED
@@ -5,16 +5,20 @@ from langchain_core.documents import Document as LangChainDocument
5
  from src.db.postgres.vector_store import get_vector_store
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.db.postgres.models import Document as DBDocument
8
- from src.config.settings import settings
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
  from src.middlewares.logging import get_logger
11
- from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
12
- from azure.core.credentials import AzureKeyCredential
13
  from typing import List
14
- import pypdf
 
15
  import docx
 
 
 
16
  from io import BytesIO
17
 
 
 
18
  logger = get_logger("knowledge_processing")
19
 
20
 
@@ -40,6 +44,10 @@ class KnowledgeProcessingService:
40
 
41
  if db_doc.file_type == "pdf":
42
  documents = await self._build_pdf_documents(content, db_doc)
 
 
 
 
43
  else:
44
  text = self._extract_text(content, db_doc.file_type)
45
  if not text.strip():
@@ -49,10 +57,15 @@ class KnowledgeProcessingService:
49
  LangChainDocument(
50
  page_content=chunk,
51
  metadata={
52
- "document_id": db_doc.id,
53
  "user_id": db_doc.user_id,
54
- "filename": db_doc.filename,
55
- "chunk_index": i,
 
 
 
 
 
 
56
  }
57
  )
58
  for i, chunk in enumerate(chunks)
@@ -74,62 +87,138 @@ class KnowledgeProcessingService:
74
  async def _build_pdf_documents(
75
  self, content: bytes, db_doc: DBDocument
76
  ) -> List[LangChainDocument]:
77
- """Build LangChain documents from PDF with page_label metadata.
78
-
79
- Uses Azure Document Intelligence (per-page) when credentials are present,
80
- falls back to pypdf (also per-page) otherwise.
81
- """
82
  documents: List[LangChainDocument] = []
83
 
84
- if settings.azureai_docintel_endpoint and settings.azureai_docintel_key:
85
- async with DocumentIntelligenceClient(
86
- endpoint=settings.azureai_docintel_endpoint,
87
- credential=AzureKeyCredential(settings.azureai_docintel_key),
88
- ) as client:
89
- poller = await client.begin_analyze_document(
90
- model_id="prebuilt-read",
91
- body=BytesIO(content),
92
- content_type="application/pdf",
93
- )
94
- result = await poller.result()
95
- logger.info(f"Azure DI extracted {len(result.pages or [])} pages")
96
-
97
- for page in result.pages or []:
98
- page_text = "\n".join(
99
- line.content for line in (page.lines or [])
100
- )
101
- if not page_text.strip():
102
- continue
103
- for chunk in self.text_splitter.split_text(page_text):
104
- documents.append(LangChainDocument(
105
- page_content=chunk,
106
- metadata={
107
- "document_id": db_doc.id,
108
- "user_id": db_doc.user_id,
109
- "filename": db_doc.filename,
110
- "chunk_index": len(documents),
111
- "page_label": page.page_number,
112
- }
113
- ))
114
- else:
115
- logger.warning("Azure DI not configured, using pypdf")
116
- pdf_reader = pypdf.PdfReader(BytesIO(content))
117
- for page_num, page in enumerate(pdf_reader.pages, start=1):
118
- page_text = page.extract_text() or ""
119
- if not page_text.strip():
120
- continue
121
- for chunk in self.text_splitter.split_text(page_text):
122
- documents.append(LangChainDocument(
123
- page_content=chunk,
124
- metadata={
125
  "document_id": db_doc.id,
126
- "user_id": db_doc.user_id,
127
  "filename": db_doc.filename,
 
128
  "chunk_index": len(documents),
129
  "page_label": page_num,
130
- }
131
- ))
132
 
133
  return documents
134
 
135
  def _extract_text(self, content: bytes, file_type: str) -> str:
 
5
  from src.db.postgres.vector_store import get_vector_store
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.db.postgres.models import Document as DBDocument
 
8
  from sqlalchemy.ext.asyncio import AsyncSession
9
  from src.middlewares.logging import get_logger
10
+ from src.knowledge.parquet_service import upload_parquet
 
11
  from typing import List
12
+ from datetime import datetime, timezone, timedelta
13
+ import sys
14
  import docx
15
+ import pandas as pd
16
+ import pytesseract
17
+ from pdf2image import convert_from_bytes
18
  from io import BytesIO
19
 
20
+ _JAKARTA_TZ = timezone(timedelta(hours=7))
21
+
22
  logger = get_logger("knowledge_processing")
23
 
24
 
 
44
 
45
  if db_doc.file_type == "pdf":
46
  documents = await self._build_pdf_documents(content, db_doc)
47
+ elif db_doc.file_type == "csv":
48
+ documents = await self._build_csv_documents(content, db_doc)
49
+ elif db_doc.file_type == "xlsx":
50
+ documents = await self._build_excel_documents(content, db_doc)
51
  else:
52
  text = self._extract_text(content, db_doc.file_type)
53
  if not text.strip():
 
57
  LangChainDocument(
58
  page_content=chunk,
59
  metadata={
 
60
  "user_id": db_doc.user_id,
61
+ "source_type": "document",
62
+ "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
63
+ "data": {
64
+ "document_id": db_doc.id,
65
+ "filename": db_doc.filename,
66
+ "file_type": db_doc.file_type,
67
+ "chunk_index": i,
68
+ },
69
  }
70
  )
71
  for i, chunk in enumerate(chunks)
 
87
  async def _build_pdf_documents(
88
  self, content: bytes, db_doc: DBDocument
89
  ) -> List[LangChainDocument]:
90
+ """Build LangChain documents from PDF with page_label metadata using Tesseract OCR."""
 
 
 
 
91
  documents: List[LangChainDocument] = []
92
 
93
+ poppler_path = None
94
+ if sys.platform == "win32":
95
+ pytesseract.pytesseract.tesseract_cmd = r"./software/Tesseract-OCR/tesseract.exe"
96
+ poppler_path = "./software/poppler-24.08.0/Library/bin"
97
+
98
+ images = convert_from_bytes(content, poppler_path=poppler_path)
99
+ logger.info(f"Tesseract OCR: converting {len(images)} pages")
100
+
101
+ for page_num, image in enumerate(images, start=1):
102
+ page_text = pytesseract.image_to_string(image)
103
+ if not page_text.strip():
104
+ continue
105
+ for chunk in self.text_splitter.split_text(page_text):
106
+ documents.append(LangChainDocument(
107
+ page_content=chunk,
108
+ metadata={
109
+ "user_id": db_doc.user_id,
110
+ "source_type": "document",
111
+ "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
112
+ "data": {
 
113
  "document_id": db_doc.id,
 
114
  "filename": db_doc.filename,
115
+ "file_type": db_doc.file_type,
116
  "chunk_index": len(documents),
117
  "page_label": page_num,
118
+ },
119
+ }
120
+ ))
121
+
122
+ return documents
123
+
124
+ def _profile_dataframe(
125
+ self, df: pd.DataFrame, source_name: str, db_doc: DBDocument
126
+ ) -> List[LangChainDocument]:
127
+ """Profile each column of a dataframe → one chunk per column."""
128
+ documents = []
129
+ row_count = len(df)
130
+
131
+ for col_name in df.columns:
132
+ col = df[col_name]
133
+ is_numeric = pd.api.types.is_numeric_dtype(col)
134
+ null_count = int(col.isnull().sum())
135
+ distinct_count = int(col.nunique())
136
+ distinct_ratio = distinct_count / row_count if row_count > 0 else 0
137
+
138
+ text = f"Source: {source_name} ({row_count} rows)\n"
139
+ text += f"Column: {col_name} ({col.dtype})\n"
140
+ text += f"Null count: {null_count}\n"
141
+ text += f"Distinct count: {distinct_count} ({distinct_ratio:.1%})\n"
142
+
143
+ if is_numeric:
144
+ text += f"Min: {col.min()}, Max: {col.max()}\n"
145
+ text += f"Mean: {col.mean():.4f}, Median: {col.median():.4f}\n"
146
+
147
+ if 0 < distinct_ratio <= 0.05:
148
+ top_values = col.value_counts().head(10)
149
+ top_str = ", ".join(f"{v} ({c})" for v, c in top_values.items())
150
+ text += f"Top values: {top_str}\n"
151
+
152
+ text += f"Sample values: {col.dropna().head(5).tolist()}"
153
+
154
+ documents.append(LangChainDocument(
155
+ page_content=text,
156
+ metadata={
157
+ "user_id": db_doc.user_id,
158
+ "source_type": "document",
159
+ "chunk_level": "column",
160
+ "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
161
+ "data": {
162
+ "document_id": db_doc.id,
163
+ "filename": db_doc.filename,
164
+ "file_type": db_doc.file_type,
165
+ "source": source_name,
166
+ "column_name": col_name,
167
+ "column_type": str(col.dtype),
168
+ }
169
+ }
170
+ ))
171
+ return documents
172
+
173
+ def _to_sheet_document(
174
+ self, df: pd.DataFrame, db_doc: DBDocument, sheet_name: str | None, source_name: str
175
+ ) -> LangChainDocument:
176
+ col_summary = ", ".join(f"{c} ({df[c].dtype})" for c in df.columns)
177
+ text = (
178
+ f"Source: {source_name} ({len(df)} rows)\n"
179
+ f"Columns ({len(df.columns)}): {col_summary}"
180
+ )
181
+ return LangChainDocument(
182
+ page_content=text,
183
+ metadata={
184
+ "user_id": db_doc.user_id,
185
+ "source_type": "document",
186
+ "chunk_level": "sheet",
187
+ "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
188
+ "data": {
189
+ "document_id": db_doc.id,
190
+ "filename": db_doc.filename,
191
+ "file_type": db_doc.file_type,
192
+ "sheet_name": sheet_name,
193
+ "column_names": list(df.columns),
194
+ "row_count": len(df),
195
+ },
196
+ },
197
+ )
198
 
199
+ async def _build_csv_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
200
+ """Profile each column of a CSV file and upload Parquet to Azure Blob."""
201
+ df = pd.read_csv(BytesIO(content))
202
+ await upload_parquet(df, db_doc.user_id, db_doc.id)
203
+ logger.info(f"Uploaded Parquet for CSV {db_doc.id}")
204
+ docs = self._profile_dataframe(df, db_doc.filename, db_doc)
205
+ docs.append(self._to_sheet_document(df, db_doc, sheet_name=None, source_name=db_doc.filename))
206
+ return docs
207
+
208
+ async def _build_excel_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
209
+ """Profile each column of every sheet in an Excel file and upload one Parquet per sheet."""
210
+ sheets = pd.read_excel(BytesIO(content), sheet_name=None)
211
+ documents = []
212
+ for sheet_name, df in sheets.items():
213
+ source_name = f"{db_doc.filename} / sheet: {sheet_name}"
214
+ docs = self._profile_dataframe(df, source_name, db_doc)
215
+ for doc in docs:
216
+ doc.metadata["data"]["sheet_name"] = sheet_name
217
+ doc.metadata["chunk_level"] = "column"
218
+ documents.extend(docs)
219
+ documents.append(self._to_sheet_document(df, db_doc, sheet_name, source_name))
220
+ await upload_parquet(df, db_doc.user_id, db_doc.id, sheet_name)
221
+ logger.info(f"Uploaded Parquet for sheet '{sheet_name}' of {db_doc.id}")
222
  return documents
223
 
224
  def _extract_text(self, content: bytes, file_type: str) -> str:
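To make the new tabular chunk shape concrete, here is roughly what one column-level chunk looks like for a toy frame (values are illustrative; the real chunks also carry the nested `data` metadata shown above):

```python
import pandas as pd

df = pd.DataFrame({"status": ["open", "closed", "open", "open"]})

# _profile_dataframe emits one chunk per column, shaped roughly like:
#
#   Source: tickets.csv (4 rows)
#   Column: status (object)
#   Null count: 0
#   Distinct count: 2 (50.0%)
#   Sample values: ['open', 'closed', 'open', 'open']
#
# Top values only appear when the distinct ratio is <= 5%, so they are absent here.
```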
src/models/credentials.py ADDED
@@ -0,0 +1,164 @@
1
+ """Pydantic credential schemas for user-registered external databases.
2
+
3
+ Imported by the `/database-clients` API router (`src/api/v1/db_client.py`) and,
4
+ via `DbType`, by the db pipeline service (`src/pipeline/db_pipeline/db_pipeline_service.py`).
5
+
6
+ Sensitive fields (`password`, `service_account_json`) are Fernet-encrypted by
7
+ the database_client service before being stored in the JSONB column; these
8
+ schemas describe the plaintext wire format, not the stored shape.
9
+ """
10
+
11
+ from typing import Literal, Optional, Union
12
+
13
+ from pydantic import BaseModel, Field
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # Supported DB types
17
+ # ---------------------------------------------------------------------------
18
+
19
+ DbType = Literal["postgres", "mysql", "sqlserver", "supabase", "bigquery", "snowflake"]
20
+
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Typed credential schemas per DB type
24
+ # ---------------------------------------------------------------------------
25
+
26
+
27
+ class PostgresCredentials(BaseModel):
28
+ """Connection credentials for PostgreSQL."""
29
+
30
+ host: str = Field(..., description="Hostname or IP address of the PostgreSQL server.", examples=["db.example.com"])
31
+ port: int = Field(5432, description="Port number (default: 5432).", examples=[5432])
32
+ database: str = Field(..., description="Name of the target database.", examples=["mydb"])
33
+ username: str = Field(..., description="Database username.", examples=["admin"])
34
+ password: str = Field(..., description="Database password. Will be encrypted at rest.", examples=["s3cr3t!"])
35
+ ssl_mode: Literal["disable", "require", "verify-ca", "verify-full"] = Field(
36
+ "require",
37
+ description="SSL mode for the connection.",
38
+ examples=["require"],
39
+ )
40
+
41
+
42
+ class MysqlCredentials(BaseModel):
43
+ """Connection credentials for MySQL."""
44
+
45
+ host: str = Field(..., description="Hostname or IP address of the MySQL server.", examples=["db.example.com"])
46
+ port: int = Field(3306, description="Port number (default: 3306).", examples=[3306])
47
+ database: str = Field(..., description="Name of the target database.", examples=["mydb"])
48
+ username: str = Field(..., description="Database username.", examples=["admin"])
49
+ password: str = Field(..., description="Database password. Will be encrypted at rest.", examples=["s3cr3t!"])
50
+ ssl: bool = Field(True, description="Enable SSL for the connection.", examples=[True])
51
+
52
+
53
+ class SqlServerCredentials(BaseModel):
54
+ """Connection credentials for Microsoft SQL Server."""
55
+
56
+ host: str = Field(..., description="Hostname or IP address of the SQL Server.", examples=["sqlserver.example.com"])
57
+ port: int = Field(1433, description="Port number (default: 1433).", examples=[1433])
58
+ database: str = Field(..., description="Name of the target database.", examples=["mydb"])
59
+ username: str = Field(..., description="Database username.", examples=["sa"])
60
+ password: str = Field(..., description="Database password. Will be encrypted at rest.", examples=["s3cr3t!"])
61
+ driver: Optional[str] = Field(
62
+ None,
63
+ description="ODBC driver name. Leave empty to use the default driver.",
64
+ examples=["ODBC Driver 17 for SQL Server"],
65
+ )
66
+
67
+
68
+ class SupabaseCredentials(BaseModel):
69
+ """Connection credentials for Supabase (PostgreSQL-based).
70
+
71
+ Use the connection string details from your Supabase project dashboard
72
+ under Settings > Database.
73
+ """
74
+
75
+ host: str = Field(
76
+ ...,
77
+ description="Supabase database host (e.g. db.<project-ref>.supabase.co, or the pooler host).",
78
+ examples=["db.xxxx.supabase.co"],
79
+ )
80
+ port: int = Field(
81
+ 5432,
82
+ description="Port number. Use 5432 for direct connection, 6543 for the connection pooler.",
83
+ examples=[5432],
84
+ )
85
+ database: str = Field("postgres", description="Database name (always 'postgres' for Supabase).", examples=["postgres"])
86
+ username: str = Field(
87
+ ...,
88
+ description="Database user. Use 'postgres' for direct connection, or 'postgres.<project-ref>' for the pooler.",
89
+ examples=["postgres"],
90
+ )
91
+ password: str = Field(..., description="Database password (set in Supabase dashboard). Will be encrypted at rest.", examples=["s3cr3t!"])
92
+ ssl_mode: Literal["require", "verify-ca", "verify-full"] = Field(
93
+ "require",
94
+ description="SSL mode. Supabase always requires SSL.",
95
+ examples=["require"],
96
+ )
97
+
98
+
99
+ class BigQueryCredentials(BaseModel):
100
+ """Connection credentials for Google BigQuery.
101
+
102
+ Requires a GCP Service Account with at least BigQuery Data Viewer
103
+ and BigQuery Job User roles.
104
+ """
105
+
106
+ project_id: str = Field(..., description="GCP project ID where the BigQuery dataset resides.", examples=["my-gcp-project"])
107
+ dataset_id: str = Field(..., description="BigQuery dataset name to connect to.", examples=["my_dataset"])
108
+ location: Optional[str] = Field(
109
+ "US",
110
+ description="Dataset location/region (default: US).",
111
+ examples=["US", "EU", "asia-southeast1"],
112
+ )
113
+ service_account_json: str = Field(
114
+ ...,
115
+ description=(
116
+ "Full content of the GCP Service Account key JSON file as a string. "
117
+ "Will be encrypted at rest."
118
+ ),
119
+ examples=['{"type":"service_account","project_id":"my-gcp-project","private_key_id":"..."}'],
120
+ )
121
+
122
+
123
+ class SnowflakeCredentials(BaseModel):
124
+ """Connection credentials for Snowflake."""
125
+
126
+ account: str = Field(
127
+ ...,
128
+ description="Snowflake account identifier, including region if applicable (e.g. myaccount.us-east-1).",
129
+ examples=["myaccount.us-east-1"],
130
+ )
131
+ warehouse: str = Field(..., description="Name of the virtual warehouse to use for queries.", examples=["COMPUTE_WH"])
132
+ database: str = Field(..., description="Name of the target Snowflake database.", examples=["MY_DB"])
133
+ db_schema: Optional[str] = Field("PUBLIC", alias="schema", description="Schema name (default: PUBLIC).", examples=["PUBLIC"])
134
+ username: str = Field(..., description="Snowflake username.", examples=["admin"])
135
+ password: str = Field(..., description="Snowflake password. Will be encrypted at rest.", examples=["s3cr3t!"])
136
+ role: Optional[str] = Field(None, description="Snowflake role to assume for the session.", examples=["SYSADMIN"])
137
+
138
+
139
+ # Union of all credential shapes — reserved for future typed validation on
140
+ # DatabaseClientCreate.credentials (currently Dict[str, Any]). Kept exported
141
+ # so downstream code can reference it without re-declaring.
142
+ CredentialsUnion = Union[
143
+ PostgresCredentials,
144
+ MysqlCredentials,
145
+ SqlServerCredentials,
146
+ SupabaseCredentials,
147
+ BigQueryCredentials,
148
+ SnowflakeCredentials,
149
+ ]
150
+
151
+
152
+ # Doc-only helper: surfaces per-type credential shapes in the Swagger "Schemas"
153
+ # panel so API consumers can discover the exact field set for each db_type.
154
+ # Not referenced by any endpoint — importing it in db_client.py is enough for
155
+ # FastAPI's OpenAPI generator to pick it up.
156
+ class CredentialSchemas(BaseModel):
157
+ """Reference schemas for `credentials` per `db_type` (Swagger-only, not used by endpoints)."""
158
+
159
+ postgres: PostgresCredentials
160
+ mysql: MysqlCredentials
161
+ sqlserver: SqlServerCredentials
162
+ supabase: SupabaseCredentials
163
+ bigquery: BigQueryCredentials
164
+ snowflake: SnowflakeCredentials
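A minimal example of the wire format these schemas describe (Pydantic v2 API assumed; on v1 use `.dict()` instead of `.model_dump()`):

```python
from src.models.credentials import PostgresCredentials

creds = PostgresCredentials(
    host="db.example.com",
    database="mydb",
    username="admin",
    password="s3cr3t!",
)
assert creds.port == 5432 and creds.ssl_mode == "require"  # defaults applied

# Plaintext dict as it reaches the service; the password is Fernet-encrypted before storage.
payload = creds.model_dump()
```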
src/models/sql_query.py ADDED
@@ -0,0 +1,8 @@
1
+ """Structured output model for LLM-generated SQL queries."""
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
+ class SQLQuery(BaseModel):
7
+ sql: str = Field(description="A single SQL SELECT statement. No markdown, no explanation inline.")
8
+ reasoning: str = Field(description="One sentence: what this query answers.")
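For reference, the structured object the executor expects back from the LLM (contents are illustrative):

```python
from src.models.sql_query import SQLQuery

SQLQuery(
    sql="SELECT status, COUNT(*) AS order_count FROM orders GROUP BY status LIMIT 100",
    reasoning="Counts orders per status.",
)
```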
src/models/structured_output.py CHANGED
@@ -19,3 +19,7 @@ class IntentClassification(BaseModel):
19
  default="",
20
  description="Direct response if no search needed (for greetings, etc.)"
21
  )
19
  default="",
20
  description="Direct response if no search needed (for greetings, etc.)"
21
  )
22
+ source_hint: str = Field(
23
+ default="both",
24
+ description="Which sources to search: 'document' (PDF/DOCX/TXT), 'schema' (DB/CSV/XLSX), or 'both'"
25
+ )
src/pipeline/db_pipeline/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from src.pipeline.db_pipeline.db_pipeline_service import DbPipelineService, db_pipeline_service
2
+
3
+ __all__ = ["DbPipelineService", "db_pipeline_service"]
src/pipeline/db_pipeline/db_pipeline_service.py ADDED
@@ -0,0 +1,302 @@
1
+ """Service for ingesting a user's external database into the vector store.
2
+
3
+ End-to-end flow: connect -> introspect schema -> profile columns -> build text
4
+ -> embed + store in the shared PGVector collection (tagged with
5
+ `source_type="database"`, retrievable via the same retriever used for docs).
6
+
7
+ Sync DB work (SQLAlchemy inspect, pandas read_sql) runs in a threadpool;
8
+ async vector writes stay on the event loop.
9
+ """
10
+
11
+ import asyncio
12
+ from contextlib import contextmanager
13
+ from datetime import datetime, timezone, timedelta
14
+ from typing import Any, Iterator, Optional
15
+
16
+ from langchain_core.documents import Document as LangChainDocument
17
+ from sqlalchemy import URL, create_engine, text
18
+ from sqlalchemy.engine import Engine
19
+
20
+ from src.db.postgres.connection import _pgvector_engine
21
+ from src.db.postgres.vector_store import get_vector_store
22
+ from src.middlewares.logging import get_logger
23
+ from src.models.credentials import DbType
24
+ from src.pipeline.db_pipeline.extractor import (
25
+ build_table_chunk,
26
+ fetch_sample_row,
27
+ get_row_count,
28
+ get_schema,
29
+ profile_table,
30
+ )
31
+
32
+ logger = get_logger("db_pipeline")
33
+
34
+
35
+ class DbPipelineService:
36
+ """End-to-end DB ingestion: connect -> introspect -> profile -> embed -> store."""
37
+
38
+ def connect(self, db_type: DbType, credentials: dict[str, Any]) -> Engine:
39
+ """Build a SQLAlchemy engine for the user's database.
40
+
41
+ `credentials` is the plaintext dict matching the per-type schema in
42
+ `src/models/credentials.py`. BigQuery/Snowflake auth models differ
43
+ from host/port/user/pass, so every shape flows through one dict.
44
+
45
+ Optional driver imports (snowflake-sqlalchemy, json for BigQuery) are
46
+ done lazily so an env missing one driver doesn't break module import.
47
+ """
48
+ logger.info("connecting to user db", db_type=db_type)
49
+
50
+ if db_type in ("postgres", "supabase"):
51
+ query = (
52
+ {"sslmode": credentials["ssl_mode"]} if credentials.get("ssl_mode") else {}
53
+ )
54
+ url = URL.create(
55
+ drivername="postgresql+psycopg2",
56
+ username=credentials["username"],
57
+ password=credentials["password"],
58
+ host=credentials["host"],
59
+ port=credentials["port"],
60
+ database=credentials["database"],
61
+ query=query,
62
+ )
63
+ return create_engine(url)
64
+
65
+ if db_type == "mysql":
66
+ url = URL.create(
67
+ drivername="mysql+pymysql",
68
+ username=credentials["username"],
69
+ password=credentials["password"],
70
+ host=credentials["host"],
71
+ port=credentials["port"],
72
+ database=credentials["database"],
73
+ )
74
+ # pymysql only activates TLS when the `ssl` dict is truthy
75
+ # (empty dict is falsy and silently disables TLS). Use system-
76
+ # default CAs via certifi + hostname verification — required by
77
+ # managed MySQL providers like TiDB Cloud / PlanetScale / Aiven.
78
+ if credentials.get("ssl", True):
79
+ import certifi
80
+
81
+ connect_args = {
82
+ "ssl": {
83
+ "ca": certifi.where(),
84
+ "check_hostname": True,
85
+ }
86
+ }
87
+ else:
88
+ connect_args = {}
89
+ return create_engine(url, connect_args=connect_args)
90
+
91
+ if db_type == "sqlserver":
92
+ # `driver` applies to pyodbc only; we ship pymssql. Accept-and-ignore
93
+ # keeps the credential schema stable.
94
+ if credentials.get("driver"):
95
+ logger.info(
96
+ "sqlserver driver hint ignored (using pymssql)",
97
+ driver=credentials["driver"],
98
+ )
99
+ url = URL.create(
100
+ drivername="mssql+pymssql",
101
+ username=credentials["username"],
102
+ password=credentials["password"],
103
+ host=credentials["host"],
104
+ port=credentials["port"],
105
+ database=credentials["database"],
106
+ )
107
+ return create_engine(url)
108
+
109
+ if db_type == "bigquery":
110
+ import json
111
+
112
+ sa_info = json.loads(credentials["service_account_json"])
113
+ # sqlalchemy-bigquery URL shape: bigquery://<project>/<dataset>
114
+ url = f"bigquery://{credentials['project_id']}/{credentials['dataset_id']}"
115
+ return create_engine(
116
+ url,
117
+ credentials_info=sa_info,
118
+ location=credentials.get("location", "US"),
119
+ )
120
+
121
+ if db_type == "snowflake":
122
+ from snowflake.sqlalchemy import URL as SnowflakeURL
123
+
124
+ url = SnowflakeURL(
125
+ account=credentials["account"],
126
+ user=credentials["username"],
127
+ password=credentials["password"],
128
+ database=credentials["database"],
129
+ schema=(
130
+ credentials.get("db_schema")
131
+ or credentials.get("schema")
132
+ or "PUBLIC"
133
+ ),
134
+ warehouse=credentials["warehouse"],
135
+ role=credentials.get("role") or "",
136
+ )
137
+ return create_engine(url)
138
+
139
+ raise NotImplementedError(f"Unsupported db_type: {db_type}")
140
+
141
+ @contextmanager
142
+ def engine_scope(
143
+ self, db_type: DbType, credentials: dict[str, Any]
144
+ ) -> Iterator[Engine]:
145
+ """Yield a connected Engine and dispose its pool on exit.
146
+
147
+ API callers should prefer this over raw `connect(...)` so user DB
148
+ connection pools do not leak between pipeline runs.
149
+ """
150
+ engine = self.connect(db_type, credentials)
151
+ try:
152
+ yield engine
153
+ finally:
154
+ engine.dispose()
155
+
156
+ def _to_document(
157
+ self, user_id: str, client_id: str, table_name: str, entry: dict, updated_at: str
158
+ ) -> LangChainDocument:
159
+ col = entry["col"]
160
+ return LangChainDocument(
161
+ page_content=entry["text"],
162
+ metadata={
163
+ "user_id": user_id,
164
+ "source_type": "database",
165
+ "chunk_level": "column",
166
+ "database_client_id": client_id,
167
+ "updated_at": updated_at,
168
+ "data": {
169
+ "table_name": table_name,
170
+ "column_name": col["name"],
171
+ "column_type": col["type"],
172
+ "is_primary_key": col.get("is_primary_key", False),
173
+ "foreign_key": col.get("foreign_key"),
174
+ },
175
+ },
176
+ )
177
+
178
+ def _to_table_document(
179
+ self,
180
+ user_id: str,
181
+ client_id: str,
182
+ table_name: str,
183
+ columns: list[dict],
184
+ row_count: int,
185
+ text: str,
186
+ updated_at: str,
187
+ ) -> LangChainDocument:
188
+ foreign_keys = []
189
+ for c in columns:
190
+ fk = c.get("foreign_key")
191
+ if not fk:
192
+ continue
193
+ target_table, _, target_column = fk.partition(".")
194
+ foreign_keys.append({
195
+ "column": c["name"],
196
+ "target_table": target_table,
197
+ "target_column": target_column,
198
+ })
199
+
200
+ return LangChainDocument(
201
+ page_content=text,
202
+ metadata={
203
+ "user_id": user_id,
204
+ "source_type": "database",
205
+ "chunk_level": "table",
206
+ "database_client_id": client_id,
207
+ "updated_at": updated_at,
208
+ "data": {
209
+ "table_name": table_name,
210
+ "row_count": row_count,
211
+ "primary_key": [c["name"] for c in columns if c.get("is_primary_key")],
212
+ "foreign_keys": foreign_keys,
213
+ "column_names": [c["name"] for c in columns],
214
+ },
215
+ },
216
+ )
217
+
218
+ async def run(
219
+ self,
220
+ user_id: str,
221
+ client_id: str,
222
+ engine: Engine,
223
+ exclude_tables: Optional[frozenset[str]] = None,
224
+ ) -> int:
225
+ """Introspect the user's DB, profile columns, embed descriptions, store in PGVector.
226
+
227
+ Returns:
228
+ Total number of chunks ingested.
229
+ """
230
+ vector_store = get_vector_store()
231
+ logger.info("db pipeline start", user_id=user_id)
232
+
233
+ # Profile first — if this fails, old embeddings are untouched
234
+ schema = await asyncio.to_thread(get_schema, engine, exclude_tables)
235
+
236
+ updated_at = datetime.now(timezone(timedelta(hours=7))).isoformat()
237
+ all_docs: list = []
238
+ for table_name, columns in schema.items():
239
+ logger.info("profiling table", table=table_name, columns=len(columns))
240
+ entries = await asyncio.to_thread(profile_table, engine, table_name, columns)
241
+ docs = [self._to_document(user_id, client_id, table_name, e, updated_at) for e in entries]
242
+ all_docs.extend(docs)
243
+
244
+ # Table-level chunk. Failures here are logged and skipped — column
245
+ # chunks above are already in all_docs and will still be written.
246
+ try:
247
+ row_count = await asyncio.to_thread(get_row_count, engine, table_name)
248
+ sample_row = (
249
+ await asyncio.to_thread(fetch_sample_row, engine, table_name)
250
+ if row_count > 0
251
+ else None
252
+ )
253
+ table_text = build_table_chunk(
254
+ table_name, row_count, columns, entries, sample_row
255
+ )
256
+ all_docs.append(
257
+ self._to_table_document(
258
+ user_id, client_id, table_name, columns, row_count, table_text, updated_at
259
+ )
260
+ )
261
+ except Exception as e:
262
+ logger.error(
263
+ "table chunk generation failed", table=table_name, error=str(e)
264
+ )
265
+
266
+ logger.info("profiled table", table=table_name, count=len(docs))
267
+
268
+ # Insert new chunks first; only delete stale chunks after the insert succeeds.
269
+ # Prevents data loss if aadd_documents fails — old embeddings stay queryable
270
+ # until they're proven replaceable. Stale rows are identified by an older
271
+ # updated_at than this run.
272
+ if not all_docs:
273
+ logger.warning(
274
+ "no docs produced from schema; skipping delete to preserve existing embeddings",
275
+ user_id=user_id,
276
+ client_id=client_id,
277
+ )
278
+ return 0
279
+
280
+ await vector_store.aadd_documents(all_docs)
281
+
282
+ async with _pgvector_engine.begin() as conn:
283
+ result = await conn.execute(
284
+ text(
285
+ "DELETE FROM langchain_pg_embedding "
286
+ "WHERE cmetadata->>'user_id' = :user_id "
287
+ " AND cmetadata->>'source_type' = 'database' "
288
+ " AND cmetadata->>'database_client_id' = :client_id "
289
+ " AND cmetadata->>'updated_at' < :updated_at "
290
+ " AND collection_id = ("
291
+ " SELECT uuid FROM langchain_pg_collection WHERE name = 'document_embeddings'"
292
+ " )"
293
+ ),
294
+ {"user_id": user_id, "client_id": client_id, "updated_at": updated_at},
295
+ )
296
+ logger.info("cleared stale db embeddings", user_id=user_id, deleted=result.rowcount)
297
+
298
+ logger.info("db pipeline complete", user_id=user_id, total=len(all_docs))
299
+ return len(all_docs)
300
+
301
+
302
+ db_pipeline_service = DbPipelineService()
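Sketch of how the API layer is expected to drive a (re)ingestion run. The route and session wiring around it are assumptions; `engine_scope`, `run`, and `decrypt_credentials_dict` are names added in this PR:

```python
from src.pipeline.db_pipeline import db_pipeline_service
from src.utils.db_credential_encryption import decrypt_credentials_dict


async def reingest(client) -> int:
    """`client` is a DatabaseClient row; returns the number of chunks ingested."""
    # decrypt_credentials_dict is assumed to accept the stored JSONB dict and return plaintext.
    credentials = decrypt_credentials_dict(client.credentials)
    # engine_scope disposes the user's connection pool when the block exits.
    with db_pipeline_service.engine_scope(client.db_type, credentials) as engine:
        return await db_pipeline_service.run(
            user_id=client.user_id,
            client_id=client.id,
            engine=engine,
        )
```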
src/pipeline/db_pipeline/extractor.py ADDED
@@ -0,0 +1,283 @@
1
+ """Schema introspection and per-column profiling for a user's database.
2
+
3
+ Identifiers (table/column names) are quoted via the engine's dialect preparer,
4
+ which handles reserved words, mixed case, and embedded quotes correctly across
5
+ dialects. Values used in SQL come from SQLAlchemy inspection of the DB itself,
6
+ not user input.
7
+ """
8
+
9
+ from typing import Optional
10
+
11
+ import pandas as pd
12
+ from sqlalchemy import Float, Integer, Numeric, inspect
13
+ from sqlalchemy.engine import Engine
14
+
15
+ from src.middlewares.logging import get_logger
16
+
17
+ logger = get_logger("db_extractor")
18
+
19
+ TOP_VALUES_THRESHOLD = 0.05 # show top values if distinct_ratio <= 5%
20
+
21
+ # Dialects where PERCENTILE_CONT(...) WITHIN GROUP is supported as an aggregate.
22
+ # MySQL has no percentile aggregate; BigQuery has PERCENTILE_CONT only as an
23
+ # analytic (window) function — both drop median and keep min/max/mean.
24
+ _MEDIAN_DIALECTS = frozenset({"postgresql", "mssql", "snowflake"})
25
+
26
+
27
+ def _supports_median(engine: Engine) -> bool:
28
+ return engine.dialect.name in _MEDIAN_DIALECTS
29
+
30
+
31
+ def _head_query(
32
+ engine: Engine,
33
+ select_clause: str,
34
+ from_clause: str,
35
+ n: int,
36
+ order_by: str = "",
37
+ ) -> str:
38
+ """LIMIT/TOP-equivalent head query for the engine's dialect."""
39
+ if engine.dialect.name == "mssql":
40
+ return f"SELECT TOP {n} {select_clause} FROM {from_clause} {order_by}".strip()
41
+ return f"SELECT {select_clause} FROM {from_clause} {order_by} LIMIT {n}".strip()
42
+
43
+
44
+ def _qi(engine: Engine, name: str) -> str:
45
+ """Dialect-correct identifier quoting (schema.table also handled if dotted)."""
46
+ preparer = engine.dialect.identifier_preparer
47
+ if "." in name:
48
+ schema, _, table = name.partition(".")
49
+ return f"{preparer.quote(schema)}.{preparer.quote(table)}"
50
+ return preparer.quote(name)
51
+
52
+
53
+ def get_schema(
54
+ engine: Engine, exclude_tables: Optional[frozenset[str]] = None
55
+ ) -> dict[str, list[dict]]:
56
+ """Returns {table_name: [{name, type, is_numeric, is_primary_key, foreign_key}, ...]}."""
57
+ exclude = exclude_tables or frozenset()
58
+ inspector = inspect(engine)
59
+ schema = {}
60
+ for table_name in inspector.get_table_names():
61
+ if table_name in exclude:
62
+ continue
63
+
64
+ pk = inspector.get_pk_constraint(table_name)
65
+ pk_cols = set(pk["constrained_columns"]) if pk else set()
66
+
67
+ fk_map = {}
68
+ for fk in inspector.get_foreign_keys(table_name):
69
+ for col, ref_col in zip(fk["constrained_columns"], fk["referred_columns"]):
70
+ fk_map[col] = f"{fk['referred_table']}.{ref_col}"
71
+
72
+ cols = inspector.get_columns(table_name)
73
+ schema[table_name] = [
74
+ {
75
+ "name": c["name"],
76
+ "type": str(c["type"]),
77
+ "is_numeric": isinstance(c["type"], (Integer, Numeric, Float)),
78
+ "is_primary_key": c["name"] in pk_cols,
79
+ "foreign_key": fk_map.get(c["name"]),
80
+ }
81
+ for c in cols
82
+ ]
83
+ logger.info("extracted schema", table_count=len(schema))
84
+ return schema
85
+
86
+
87
+ def get_row_count(engine: Engine, table_name: str) -> int:
88
+ # Cast to plain int — pandas returns numpy.int64 which fails JSONB serialization
89
+ # when the value lands in PGVector cmetadata via the table-level chunk.
90
+ return int(pd.read_sql(f"SELECT COUNT(*) FROM {_qi(engine, table_name)}", engine).iloc[0, 0])
91
+
92
+
93
+ def profile_column(
94
+ engine: Engine,
95
+ table_name: str,
96
+ col_name: str,
97
+ is_numeric: bool,
98
+ row_count: int,
99
+ ) -> dict:
100
+ """Returns null_count, distinct_count, min/max, top values, and sample values."""
101
+ if row_count == 0:
102
+ return {
103
+ "null_count": 0,
104
+ "distinct_count": 0,
105
+ "distinct_ratio": 0.0,
106
+ "sample_values": [],
107
+ }
108
+
109
+ qt = _qi(engine, table_name)
110
+ qc = _qi(engine, col_name)
111
+
112
+ # Combined stats query: null_count, distinct_count, and min/max (if numeric).
113
+ # One round-trip instead of two.
114
+ select_cols = [
115
+ f"COUNT(*) - COUNT({qc}) AS nulls",
116
+ f"COUNT(DISTINCT {qc}) AS distincts",
117
+ ]
118
+ if is_numeric:
119
+ select_cols.append(f"MIN({qc}) AS min_val")
120
+ select_cols.append(f"MAX({qc}) AS max_val")
121
+ select_cols.append(f"AVG({qc}) AS mean_val")
122
+ if _supports_median(engine):
123
+ select_cols.append(
124
+ f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {qc}) AS median_val"
125
+ )
126
+ stats = pd.read_sql(f"SELECT {', '.join(select_cols)} FROM {qt}", engine)
127
+
128
+ null_count = int(stats.iloc[0]["nulls"])
129
+ distinct_count = int(stats.iloc[0]["distincts"])
130
+ distinct_ratio = distinct_count / row_count if row_count > 0 else 0
131
+
132
+ profile = {
133
+ "null_count": null_count,
134
+ "distinct_count": distinct_count,
135
+ "distinct_ratio": round(distinct_ratio, 4),
136
+ }
137
+
138
+ if is_numeric:
139
+ profile["min"] = stats.iloc[0]["min_val"]
140
+ profile["max"] = stats.iloc[0]["max_val"]
141
+ profile["mean"] = stats.iloc[0]["mean_val"]
142
+ if _supports_median(engine):
143
+ profile["median"] = stats.iloc[0]["median_val"]
144
+
145
+ if 0 < distinct_ratio <= TOP_VALUES_THRESHOLD:
146
+ top_sql = _head_query(
147
+ engine,
148
+ select_clause=f"{qc}, COUNT(*) AS cnt",
149
+ from_clause=f"{qt} GROUP BY {qc}",
150
+ n=10,
151
+ order_by="ORDER BY cnt DESC",
152
+ )
153
+ top = pd.read_sql(top_sql, engine)
154
+ profile["top_values"] = list(zip(top.iloc[:, 0].tolist(), top["cnt"].tolist()))
155
+
156
+ sample = pd.read_sql(_head_query(engine, qc, qt, 5), engine)
157
+ profile["sample_values"] = sample.iloc[:, 0].tolist()
158
+
159
+ return profile
160
+
161
+
162
+ def profile_table(engine: Engine, table_name: str, columns: list[dict]) -> list[dict]:
163
+ """Profile every column in a table. Returns [{col, profile, text}, ...].
164
+
165
+ Per-column errors are logged and skipped so one bad column doesn't abort
166
+ the whole table.
167
+ """
168
+ row_count = get_row_count(engine, table_name)
169
+ if row_count == 0:
170
+ logger.info("skipping empty table", table=table_name)
171
+ return []
172
+
173
+ results = []
174
+ for col in columns:
175
+ try:
176
+ profile = profile_column(
177
+ engine, table_name, col["name"], col.get("is_numeric", False), row_count
178
+ )
179
+ text = build_text(table_name, row_count, col, profile)
180
+ results.append({"col": col, "profile": profile, "text": text})
181
+ except Exception as e:
182
+ logger.error(
183
+ "column profiling failed",
184
+ table=table_name,
185
+ column=col["name"],
186
+ error=str(e),
187
+ )
188
+ continue
189
+ return results
190
+
191
+
192
+ def fetch_sample_row(engine: Engine, table_name: str) -> Optional[dict]:
193
+ """First row of the table as a dict, or None if the table is empty.
194
+
195
+ Reuses _qi for dialect-correct quoting and _head_query for TOP/LIMIT.
196
+ """
197
+ qt = _qi(engine, table_name)
198
+ sql = _head_query(engine, "*", qt, 1)
199
+ df = pd.read_sql(sql, engine)
200
+ if df.empty:
201
+ return None
202
+ return df.iloc[0].to_dict()
203
+
204
+
205
+ def build_table_chunk(
206
+ table_name: str,
207
+ row_count: int,
208
+ columns: list[dict],
209
+ column_profiles: list[dict],
210
+ sample_row: Optional[dict],
211
+ ) -> str:
212
+ """Build the table-level chunk text.
213
+
214
+ Format (lines omitted when not applicable):
215
+ Table: {name} ({row_count} rows)
216
+ Primary key: {pk_cols}
217
+ Foreign keys: {col} -> {target_table}.{target_col}, ...
218
+ Columns ({n}): {col1}, {col2}, ...
219
+ Numeric ranges: {col} [{min}-{max}], ...
220
+ Sample row: {dict}
221
+
222
+ Pure formatter — no DB I/O. column_profiles is the output of profile_table
223
+ and is reused so we don't re-introspect.
224
+ """
225
+ lines = [f"Table: {table_name} ({row_count} rows)"]
226
+
227
+ pk_cols = [c["name"] for c in columns if c.get("is_primary_key")]
228
+ if pk_cols:
229
+ lines.append(f"Primary key: {', '.join(pk_cols)}")
230
+
231
+ fk_parts = [
232
+ f"{c['name']} -> {c['foreign_key']}" for c in columns if c.get("foreign_key")
233
+ ]
234
+ if fk_parts:
235
+ lines.append(f"Foreign keys: {', '.join(fk_parts)}")
236
+
237
+ col_names = [c["name"] for c in columns]
238
+ lines.append(f"Columns ({len(col_names)}): {', '.join(col_names)}")
239
+
240
+ range_parts = []
241
+ for entry in column_profiles:
242
+ col = entry["col"]
243
+ profile = entry["profile"]
244
+ if not col.get("is_numeric"):
245
+ continue
246
+ mn = profile.get("min")
247
+ mx = profile.get("max")
248
+ if mn is None or mx is None:
249
+ continue
250
+ range_parts.append(f"{col['name']} [{mn}-{mx}]")
251
+ if range_parts:
252
+ lines.append(f"Numeric ranges: {', '.join(range_parts)}")
253
+
254
+ if sample_row is not None:
255
+ lines.append(f"Sample row: {sample_row}")
256
+
257
+ return "\n".join(lines)
258
+
259
+
260
+ def build_text(table_name: str, row_count: int, col: dict, profile: dict) -> str:
261
+ col_name = col["name"]
262
+ col_type = col["type"]
263
+
264
+ key_label = ""
265
+ if col.get("is_primary_key"):
266
+ key_label = " [PRIMARY KEY]"
267
+ elif col.get("foreign_key"):
268
+ key_label = f" [FK -> {col['foreign_key']}]"
269
+
270
+ text = f"Table: {table_name} ({row_count} rows)\n"
271
+ text += f"Column: {col_name} ({col_type}){key_label}\n"
272
+ text += f"Null count: {profile['null_count']}\n"
273
+ text += f"Distinct count: {profile['distinct_count']} ({profile['distinct_ratio']:.1%})\n"
274
+ if "min" in profile:
275
+ text += f"Min: {profile['min']}, Max: {profile['max']}\n"
276
+ text += f"Mean: {profile['mean']}\n"
277
+ if profile.get("median") is not None:
278
+ text += f"Median: {profile['median']}\n"
279
+ if "top_values" in profile:
280
+ top_str = ", ".join(f"{v} ({c})" for v, c in profile["top_values"])
281
+ text += f"Top values: {top_str}\n"
282
+ text += f"Sample values: {profile['sample_values']}"
283
+ return text
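To show what ends up embedded, here is the column-level chunk text `build_text` produces for made-up inputs:

```python
from src.pipeline.db_pipeline.extractor import build_text

col = {"name": "status", "type": "VARCHAR", "is_primary_key": False, "foreign_key": None}
profile = {
    "null_count": 0,
    "distinct_count": 3,
    "distinct_ratio": 0.0003,
    "top_values": [("shipped", 5200), ("pending", 3100), ("cancelled", 1700)],
    "sample_values": ["shipped", "pending", "shipped", "shipped", "cancelled"],
}
print(build_text("orders", 10000, col, profile))
# Table: orders (10000 rows)
# Column: status (VARCHAR)
# Null count: 0
# Distinct count: 3 (0.0%)
# Top values: shipped (5200), pending (3100), cancelled (1700)
# Sample values: ['shipped', 'pending', 'shipped', 'shipped', 'cancelled']
```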
src/pipeline/document_pipeline/__init__.py ADDED
File without changes
src/pipeline/document_pipeline/document_pipeline.py ADDED
@@ -0,0 +1,94 @@
1
+ """Document upload and processing pipeline."""
2
+
3
+ from fastapi import HTTPException, UploadFile
4
+ from sqlalchemy.ext.asyncio import AsyncSession
5
+
6
+ from src.document.document_service import document_service
7
+ from src.knowledge.processing_service import knowledge_processor
8
+ from src.knowledge.parquet_service import delete_document_parquets
9
+ from src.middlewares.logging import get_logger
10
+ from src.storage.az_blob.az_blob import blob_storage
11
+
12
+ logger = get_logger("document_pipeline")
13
+
14
+ # NOTE: Keep in sync with _DOC_TYPES in src/api/v1/document.py
15
+ SUPPORTED_FILE_TYPES = ["pdf", "docx", "txt", "csv", "xlsx"]
16
+ MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024 # 10 MB
17
+
18
+
19
+ class DocumentPipeline:
20
+ """Orchestrates the full document upload, process, and delete flows."""
21
+
22
+ async def upload(self, file: UploadFile, user_id: str, db: AsyncSession) -> dict:
23
+ """Validate → upload to blob → save to DB."""
24
+ content = await file.read()
25
+ if not file.filename:
26
+ raise HTTPException(status_code=400, detail="Filename is required.")
27
+ file_type = file.filename.split(".")[-1].lower() if "." in file.filename else "txt"
28
+
29
+ if len(content) > MAX_FILE_SIZE_BYTES:
30
+ raise HTTPException(
31
+ status_code=400,
32
+ detail="File size exceeds maximum allowed size of 10 MB.",
33
+ )
34
+
35
+ if file_type not in SUPPORTED_FILE_TYPES:
36
+ raise HTTPException(
37
+ status_code=400,
38
+ detail=f"Unsupported file type. Supported: {', '.join(SUPPORTED_FILE_TYPES)}",
39
+ )
40
+
41
+ blob_name = await blob_storage.upload_file(content, file.filename, user_id)
42
+ document = await document_service.create_document(
43
+ db=db,
44
+ user_id=user_id,
45
+ filename=file.filename,
46
+ blob_name=blob_name,
47
+ file_size=len(content),
48
+ file_type=file_type,
49
+ )
50
+
51
+ logger.info(f"Uploaded document {document.id} for user {user_id}")
52
+ return {"id": document.id, "filename": document.filename, "status": document.status}
53
+
54
+ async def process(self, document_id: str, user_id: str, db: AsyncSession) -> dict:
55
+ """Validate ownership → extract text → chunk → ingest to vector store."""
56
+ document = await document_service.get_document(db, document_id)
57
+
58
+ if not document:
59
+ raise HTTPException(status_code=404, detail="Document not found")
60
+ if document.user_id != user_id:
61
+ raise HTTPException(status_code=403, detail="Access denied")
62
+
63
+ try:
64
+ await document_service.update_document_status(db, document_id, "processing")
65
+ chunks_count = await knowledge_processor.process_document(document, db)
66
+ await document_service.update_document_status(db, document_id, "completed")
67
+
68
+ logger.info(f"Processed document {document_id}: {chunks_count} chunks")
69
+ return {"document_id": document_id, "chunks_processed": chunks_count}
70
+
71
+ except Exception as e:
72
+ logger.error(f"Processing failed for document {document_id}", error=str(e))
73
+ await document_service.update_document_status(db, document_id, "failed", str(e))
74
+ raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
75
+
76
+ async def delete(self, document_id: str, user_id: str, db: AsyncSession) -> dict:
77
+ """Validate ownership → delete from blob and DB."""
78
+ document = await document_service.get_document(db, document_id)
79
+
80
+ if not document:
81
+ raise HTTPException(status_code=404, detail="Document not found")
82
+ if document.user_id != user_id:
83
+ raise HTTPException(status_code=403, detail="Access denied")
84
+
85
+ await document_service.delete_document(db, document_id)
86
+
87
+ if document.file_type in ("csv", "xlsx"):
88
+ await delete_document_parquets(user_id, document_id)
89
+
90
+ logger.info(f"Deleted document {document_id} for user {user_id}")
91
+ return {"document_id": document_id}
92
+
93
+
94
+ document_pipeline = DocumentPipeline()
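Calling-order sketch for the routes in `src/api/v1/document.py` (not shown in this hunk); `file` and `db` come from FastAPI in the real routes:

```python
from src.pipeline.document_pipeline.document_pipeline import document_pipeline


async def upload_then_process(file, user_id: str, db) -> dict:
    uploaded = await document_pipeline.upload(file, user_id, db)          # validate -> blob -> DB row
    return await document_pipeline.process(uploaded["id"], user_id, db)   # extract/profile -> embed
```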
src/query/__init__.py ADDED
File without changes
src/query/base.py ADDED
@@ -0,0 +1,32 @@
1
+ """Shared contract for query executors."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass, field
5
+
6
+ from sqlalchemy.ext.asyncio import AsyncSession
7
+
8
+ from src.rag.base import RetrievalResult
9
+
10
+
11
+ @dataclass
12
+ class QueryResult:
13
+ source_type: str # "database" or "document"
14
+ source_id: str # database_client_id or document_id
15
+ table_or_file: str
16
+ columns: list[str]
17
+ rows: list[dict]
18
+ row_count: int
19
+ metadata: dict = field(default_factory=dict)
20
+ # metadata should include "column_types": {"col_name": "dtype"} when available
21
+
22
+
23
+ class BaseExecutor(ABC):
24
+ @abstractmethod
25
+ async def execute(
26
+ self,
27
+ results: list[RetrievalResult],
28
+ user_id: str,
29
+ db: AsyncSession,
30
+ question: str,
31
+ limit: int = 100,
32
+ ) -> list[QueryResult]: ...
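A minimal stub that satisfies the executor contract, only to show the expected `QueryResult` shape (the real executors live under `src/query/executors/`):

```python
from sqlalchemy.ext.asyncio import AsyncSession

from src.query.base import BaseExecutor, QueryResult
from src.rag.base import RetrievalResult


class EchoExecutor(BaseExecutor):
    """Toy executor: returns the question itself as a one-row result."""

    async def execute(
        self,
        results: list[RetrievalResult],
        user_id: str,
        db: AsyncSession,
        question: str,
        limit: int = 100,
    ) -> list[QueryResult]:
        rows = [{"question": question}]
        return [
            QueryResult(
                source_type="document",
                source_id="example-doc-id",
                table_or_file="example.csv",
                columns=["question"],
                rows=rows,
                row_count=len(rows),
                metadata={"column_types": {"question": "object"}},
            )
        ]
```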
src/query/executors/__init__.py ADDED
File without changes
src/query/executors/db_executor.py ADDED
@@ -0,0 +1,648 @@
1
+ """Executor for registered database sources (source_type="database").
2
+
3
+ Flow per (client_id, question):
4
+ 1. Collect all relevant (table_name, column_name) pairs from retrieval results.
5
+ 2. Fetch the FULL schema for those tables from PGVector (not just top-k columns).
6
+ 3. Build a schema context string and send to LLM → structured SQLQuery output.
7
+ 4. Validate via sqlglot: SELECT-only, schema-grounded, LIMIT enforced.
8
+ 5. Execute on the user's DB via engine_scope + asyncio.to_thread.
9
+ 6. Return QueryResult per client_id (may span multiple tables via JOINs).
10
+
11
+ Supported db_types: postgres, supabase, mysql.
12
+ Other types are skipped with a warning — they do not raise.
13
+ """
14
+
15
+ import asyncio
16
+ from collections import defaultdict
17
+ from typing import Any
18
+
19
+ import sqlglot
20
+ import sqlglot.expressions as exp
21
+ import tiktoken
22
+ from langchain_core.prompts import ChatPromptTemplate
23
+ from langchain_openai import AzureChatOpenAI
24
+ from sqlalchemy import text
25
+ from sqlalchemy.ext.asyncio import AsyncSession
26
+
27
+ from src.config.settings import settings
28
+ from src.database_client.database_client_service import database_client_service
29
+ from src.db.postgres.connection import _pgvector_engine
30
+ from src.middlewares.logging import get_logger
31
+ from src.models.sql_query import SQLQuery
32
+ from src.pipeline.db_pipeline import db_pipeline_service
33
+ from src.query.base import BaseExecutor, QueryResult
34
+ from src.rag.base import RetrievalResult
35
+ from src.utils.db_credential_encryption import decrypt_credentials_dict
36
+
37
+ logger = get_logger("db_executor")
38
+
39
+ _enc = tiktoken.get_encoding("cl100k_base")
40
+
41
+ _SUPPORTED_DB_TYPES = {"postgres", "supabase", "mysql"}
42
+ _MAX_RETRIES = 3
43
+ _MAX_LIMIT = 500
44
+ _FK_EXPANSION_MAX_TABLES = 5
45
+
46
+ _SQL_SYSTEM_PROMPT = """\
47
+ You are a SQL data analyst working with a user's database.
48
+ Generate a single SQL SELECT statement that answers the user's question.
49
+
50
+ Database dialect: {dialect}
51
+
52
+ Rules:
53
+ - ONLY reference tables and columns listed in the schema below. Do not invent names.
54
+ - Always include a LIMIT clause (max {limit}).
55
+ - Do not use DELETE, UPDATE, INSERT, DROP, TRUNCATE, ALTER, CREATE, or any DDL.
56
+ - Prefer explicit JOINs over subqueries when combining tables.
57
+ - For aggregations, always alias the result column (e.g. COUNT(*) AS order_count).
58
+ - For date filtering, use dialect-appropriate functions ({dialect} syntax).
59
+
60
+ Schema:
61
+ {schema}
62
+
63
+ {error_section}"""
64
+
65
+
66
+ class DbExecutor(BaseExecutor):
67
+ def __init__(self) -> None:
68
+ self._llm = AzureChatOpenAI(
69
+ azure_deployment=settings.azureai_deployment_name_4o,
70
+ openai_api_version=settings.azureai_api_version_4o,
71
+ azure_endpoint=settings.azureai_endpoint_url_4o,
72
+ api_key=settings.azureai_api_key_4o,
73
+ temperature=0,
74
+ )
75
+ self._prompt = ChatPromptTemplate.from_messages([
76
+ ("system", _SQL_SYSTEM_PROMPT),
77
+ ("human", "{question}"),
78
+ ])
79
+ self._chain = self._prompt | self._llm.with_structured_output(SQLQuery)
80
+
81
+ # ------------------------------------------------------------------
82
+ # Public interface
83
+ # ------------------------------------------------------------------
84
+
85
+ async def execute(
86
+ self,
87
+ results: list[RetrievalResult],
88
+ user_id: str,
89
+ db: AsyncSession,
90
+ question: str,
91
+ limit: int = 100,
92
+ ) -> list[QueryResult]:
93
+ db_results = [r for r in results if r.source_type == "database"]
94
+ if not db_results:
95
+ return []
96
+
97
+ # Group by client_id — one SQL generation + execution pass per client
98
+ by_client: dict[str, list[RetrievalResult]] = defaultdict(list)
99
+ for r in db_results:
100
+ client_id = r.metadata.get("database_client_id", "")
101
+ if client_id:
102
+ by_client[client_id].append(r)
103
+ else:
104
+ logger.warning("db result missing database_client_id, skipping")
105
+
106
+ query_results: list[QueryResult] = []
107
+ for client_id, client_results in by_client.items():
108
+ try:
109
+ qr = await self._execute_for_client(client_id, client_results, user_id, db, question, limit)
110
+ if qr:
111
+ query_results.append(qr)
112
+ except Exception as e:
113
+ logger.error("db executor failed for client", client_id=client_id, error=str(e))
114
+
115
+ return query_results
116
+
117
+ # ------------------------------------------------------------------
118
+ # Per-client execution
119
+ # ------------------------------------------------------------------
120
+
121
+ async def _execute_for_client(
122
+ self,
123
+ client_id: str,
124
+ results: list[RetrievalResult],
125
+ user_id: str,
126
+ db: AsyncSession,
127
+ question: str,
128
+ limit: int,
129
+ ) -> QueryResult | None:
130
+ client = await database_client_service.get(db, client_id)
131
+ if not client:
132
+ logger.warning("database client not found", client_id=client_id)
133
+ return None
134
+ if client.user_id != user_id:
135
+ logger.warning("client ownership mismatch", client_id=client_id)
136
+ return None
137
+ if client.db_type not in _SUPPORTED_DB_TYPES:
138
+ logger.warning("unsupported db_type for query execution", db_type=client.db_type)
139
+ return None
140
+
141
+ # Hit tables = tables retrieval pointed at directly. Get full per-column
142
+ # schema for these. Related tables (one FK hop away, both directions) are
143
+ # fetched separately in abbreviated form to give the LLM enough context
144
+ # to JOIN without paying the per-column profile token cost.
145
+ hit_tables = list({
146
+ r.metadata.get("data", {}).get("table_name")
147
+ for r in results
148
+ if r.metadata.get("data", {}).get("table_name")
149
+ })
150
+ if not hit_tables:
151
+ logger.warning("no table_name on any retrieval result", client_id=client_id)
152
+ return None
153
+
154
+ full_schema = await self._fetch_full_schema(client_id, hit_tables, user_id)
155
+ if not full_schema:
156
+ logger.warning("no schema found in vector store", client_id=client_id, tables=hit_tables)
157
+ return None
158
+
159
+ related_tables = await self._find_related_tables(client_id, user_id, hit_tables)
160
+ related_schema = (
161
+ await self._fetch_abbreviated_schema(client_id, user_id, related_tables)
162
+ if related_tables else {}
163
+ )
164
+
165
+ schema_ctx = self._build_schema_context(full_schema, related_schema)
166
+ capped_limit = min(limit, _MAX_LIMIT)
167
+ dialect = client.db_type
168
+
169
+ # SQL generation with retry
170
+ validated_sql: str | None = None
171
+ prev_error: str = ""
172
+ prev_reasoning: str = ""
173
+ for attempt in range(_MAX_RETRIES):
174
+ if prev_error:
175
+ error_section = (
176
+ f"Previous attempt reasoning: {prev_reasoning}\n"
177
+ f"Previous attempt failed: {prev_error}\n"
178
+ "Fix the issue above."
179
+ )
180
+ else:
181
+ error_section = ""
182
+ try:
183
+ prompt_text = schema_ctx + error_section + question
184
+ input_tokens = len(_enc.encode(prompt_text))
185
+ logger.info("sql generation input tokens", attempt=attempt + 1, tokens=input_tokens)
186
+
187
+ result: SQLQuery = await self._chain.ainvoke({
188
+ "schema": schema_ctx,
189
+ "dialect": dialect,
190
+ "limit": capped_limit,
191
+ "error_section": error_section,
192
+ "question": question,
193
+ })
194
+ sql = result.sql.strip()
195
+ allowed_tables = set(full_schema) | set(related_schema)
196
+ column_map: dict[str, set[str]] = {
197
+ t: {c["name"] for c in cols} for t, cols in full_schema.items()
198
+ }
199
+ for t, info in related_schema.items():
200
+ column_map[t] = set(info.get("column_names") or [])
201
+ validation_error = self._validate(sql, allowed_tables, capped_limit, column_map)
202
+ if validation_error:
203
+ prev_error = validation_error
204
+ prev_reasoning = result.reasoning
205
+ logger.warning("sql validation failed", attempt=attempt + 1, error=validation_error)
206
+ continue
207
+ validated_sql = self._enforce_limit(sql, capped_limit)
208
+ output_tokens = len(_enc.encode(result.sql)) + len(_enc.encode(result.reasoning))
209
+ logger.info(
210
+ "sql generated",
211
+ attempt=attempt + 1,
212
+ input_tokens=input_tokens,
213
+ output_tokens=output_tokens,
214
+ total_tokens=input_tokens + output_tokens,
215
+ reasoning=result.reasoning,
216
+ )
217
+ break
218
+ except Exception as e:
219
+ prev_error = str(e)
220
+ logger.warning("sql generation error", attempt=attempt + 1, error=prev_error)
221
+
222
+ if not validated_sql:
223
+ logger.error("sql generation failed after retries", client_id=client_id)
224
+ return None
225
+
226
+ # Execute on user's DB
227
+ creds = decrypt_credentials_dict(client.credentials)
228
+ with db_pipeline_service.engine_scope(client.db_type, creds) as engine:
229
+ rows = await asyncio.to_thread(self._run_sql, engine, validated_sql)
230
+
231
+ column_types = {
232
+ col["name"]: col["type"]
233
+ for cols in full_schema.values()
234
+ for col in cols
235
+ }
236
+ columns = list(rows[0].keys()) if rows else []
237
+
238
+ return QueryResult(
239
+ source_type="database",
240
+ source_id=client_id,
241
+ table_or_file=", ".join(hit_tables),
242
+ columns=columns,
243
+ rows=rows,
244
+ row_count=len(rows),
245
+ metadata={
246
+ "db_type": client.db_type,
247
+ "client_name": client.name,
248
+ "sql": validated_sql,
249
+ "column_types": {c: column_types.get(c, "unknown") for c in columns},
250
+ },
251
+ )
252
+
253
+ # ------------------------------------------------------------------
254
+ # Schema helpers
255
+ # ------------------------------------------------------------------
256
+
257
+ async def _find_related_tables(
258
+ self,
259
+ client_id: str,
260
+ user_id: str,
261
+ hit_tables: list[str],
262
+ ) -> list[str]:
263
+ """One-hop FK neighbours of `hit_tables`, both directions, excluding hits.
264
+
265
+ Prefers chunk_level='table' rows; if none exist for the client (legacy
266
+ ingest predating Phase 1), falls back to aggregating from column-chunk
267
+ metadata. Returns [] when no FK metadata is available.
268
+
269
+ Capped at _FK_EXPANSION_MAX_TABLES, ranked by edge count desc then
270
+ table name asc. A warning is logged when the cap kicks in.
271
+ """
272
+ if not hit_tables:
273
+ return []
274
+
275
+ hit_set = set(hit_tables)
276
+ # edge_counts[related_table] = number of FK edges connecting it to the hit set
277
+ edge_counts: dict[str, int] = defaultdict(int)
278
+
279
+ # ---- Primary path: table-level chunks ----
280
+ sql = text("""
281
+ SELECT lpe.cmetadata
282
+ FROM langchain_pg_embedding lpe
283
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
284
+ WHERE lpc.name = 'document_embeddings'
285
+ AND lpe.cmetadata->>'user_id' = :user_id
286
+ AND lpe.cmetadata->>'source_type' = 'database'
287
+ AND lpe.cmetadata->>'database_client_id' = :client_id
288
+ AND lpe.cmetadata->>'chunk_level' = 'table'
289
+ """)
290
+ async with _pgvector_engine.connect() as conn:
291
+ result = await conn.execute(sql, {"user_id": user_id, "client_id": client_id})
292
+ table_rows = result.fetchall()
293
+
294
+ if table_rows:
295
+ for row in table_rows:
296
+ data = row.cmetadata.get("data", {})
297
+ table = data.get("table_name")
298
+ fks = data.get("foreign_keys") or []
299
+ if not table:
300
+ continue
301
+ if table in hit_set:
302
+ # Outgoing: this hit's FKs point at related tables
303
+ for fk in fks:
304
+ target = fk.get("target_table")
305
+ if target and target not in hit_set:
306
+ edge_counts[target] += 1
307
+ else:
308
+ # Incoming: this non-hit table's FKs point into the hit set
309
+ for fk in fks:
310
+ target = fk.get("target_table")
311
+ if target in hit_set:
312
+ edge_counts[table] += 1
313
+ else:
314
+ # ---- Fallback: aggregate from column chunks ----
315
+ sql = text("""
316
+ SELECT lpe.cmetadata->'data'->>'table_name' AS src_table,
317
+ lpe.cmetadata->'data'->>'foreign_key' AS fk
318
+ FROM langchain_pg_embedding lpe
319
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
320
+ WHERE lpc.name = 'document_embeddings'
321
+ AND lpe.cmetadata->>'user_id' = :user_id
322
+ AND lpe.cmetadata->>'source_type' = 'database'
323
+ AND lpe.cmetadata->>'database_client_id' = :client_id
324
+ AND lpe.cmetadata->>'chunk_level' = 'column'
325
+ AND lpe.cmetadata->'data'->>'foreign_key' IS NOT NULL
326
+ """)
327
+ async with _pgvector_engine.connect() as conn:
328
+ result = await conn.execute(sql, {"user_id": user_id, "client_id": client_id})
329
+ col_rows = result.fetchall()
330
+
331
+ for row in col_rows:
332
+ src = row.src_table
333
+ fk = row.fk
334
+ if not src or not fk:
335
+ continue
336
+ target = fk.split(".", 1)[0]
337
+ if src in hit_set and target and target not in hit_set:
338
+ edge_counts[target] += 1
339
+ elif src not in hit_set and target in hit_set:
340
+ edge_counts[src] += 1
341
+
342
+ if not edge_counts:
343
+ return []
344
+
345
+ ranked = sorted(edge_counts.items(), key=lambda kv: (-kv[1], kv[0]))
346
+ if len(ranked) > _FK_EXPANSION_MAX_TABLES:
347
+ logger.warning(
348
+ "fk expansion cap hit",
349
+ client_id=client_id,
350
+ total=len(ranked),
351
+ cap=_FK_EXPANSION_MAX_TABLES,
352
+ dropped=[t for t, _ in ranked[_FK_EXPANSION_MAX_TABLES:]],
353
+ )
354
+ ranked = ranked[:_FK_EXPANSION_MAX_TABLES]
355
+
356
+ related = [t for t, _ in ranked]
357
+ logger.info("fk-related tables", hit=sorted(hit_set), related=related)
358
+ return related
359
+
360
+ async def _fetch_abbreviated_schema(
361
+ self,
362
+ client_id: str,
363
+ user_id: str,
364
+ table_names: list[str],
365
+ ) -> dict[str, dict[str, Any]]:
366
+ """Abbreviated schema: name, row_count, PK, FKs, column names — no profiles.
367
+
368
+ Prefers chunk_level='table' rows. Falls back to aggregating column-chunk
369
+ metadata when table chunks are missing for a given table_name.
370
+
371
+ Returns {table_name: {"row_count": int|None, "primary_key": [str],
372
+ "foreign_keys": [{column, target_table, target_column}],
373
+ "column_names": [str]}}.
374
+ """
375
+ if not table_names:
376
+ return {}
377
+
378
+ placeholders = ", ".join(f":t{i}" for i in range(len(table_names)))
379
+ params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
380
+ for i, name in enumerate(table_names):
381
+ params[f"t{i}"] = name
382
+
383
+ # Primary path: one row per table from chunk_level='table'
384
+ sql_table = text(f"""
385
+ SELECT lpe.cmetadata
386
+ FROM langchain_pg_embedding lpe
387
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
388
+ WHERE lpc.name = 'document_embeddings'
389
+ AND lpe.cmetadata->>'user_id' = :user_id
390
+ AND lpe.cmetadata->>'source_type' = 'database'
391
+ AND lpe.cmetadata->>'database_client_id' = :client_id
392
+ AND lpe.cmetadata->>'chunk_level' = 'table'
393
+ AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
394
+ """)
395
+ async with _pgvector_engine.connect() as conn:
396
+ result = await conn.execute(sql_table, params)
397
+ t_rows = result.fetchall()
398
+
399
+ out: dict[str, dict[str, Any]] = {}
400
+ for row in t_rows:
401
+ data = row.cmetadata.get("data", {})
402
+ tname = data.get("table_name")
403
+ if not tname:
404
+ continue
405
+ out[tname] = {
406
+ "row_count": data.get("row_count"),
407
+ "primary_key": list(data.get("primary_key") or []),
408
+ "foreign_keys": list(data.get("foreign_keys") or []),
409
+ "column_names": list(data.get("column_names") or []),
410
+ }
411
+
412
+ # Fallback for tables with no table-chunk: aggregate column chunks
413
+ missing = [t for t in table_names if t not in out]
414
+ if missing:
415
+ placeholders_m = ", ".join(f":m{i}" for i in range(len(missing)))
416
+ params_m: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
417
+ for i, name in enumerate(missing):
418
+ params_m[f"m{i}"] = name
419
+ sql_col = text(f"""
420
+ SELECT lpe.cmetadata
421
+ FROM langchain_pg_embedding lpe
422
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
423
+ WHERE lpc.name = 'document_embeddings'
424
+ AND lpe.cmetadata->>'user_id' = :user_id
425
+ AND lpe.cmetadata->>'source_type' = 'database'
426
+ AND lpe.cmetadata->>'database_client_id' = :client_id
427
+ AND lpe.cmetadata->>'chunk_level' = 'column'
428
+ AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders_m})
429
+ ORDER BY lpe.cmetadata->'data'->>'table_name', lpe.cmetadata->'data'->>'column_name'
430
+ """)
431
+ async with _pgvector_engine.connect() as conn:
432
+ result = await conn.execute(sql_col, params_m)
433
+ c_rows = result.fetchall()
434
+
435
+ agg: dict[str, dict[str, Any]] = {
436
+ t: {"row_count": None, "primary_key": [], "foreign_keys": [], "column_names": []}
437
+ for t in missing
438
+ }
439
+ for row in c_rows:
440
+ data = row.cmetadata.get("data", {})
441
+ tname = data.get("table_name")
442
+ cname = data.get("column_name")
443
+ if not tname or tname not in agg or not cname:
444
+ continue
445
+ bucket = agg[tname]
446
+ bucket["column_names"].append(cname)
447
+ if data.get("is_primary_key"):
448
+ bucket["primary_key"].append(cname)
449
+ fk = data.get("foreign_key")
450
+ if fk:
451
+ target_table, _, target_col = fk.partition(".")
452
+ bucket["foreign_keys"].append({
453
+ "column": cname,
454
+ "target_table": target_table,
455
+ "target_column": target_col,
456
+ })
457
+ for t, v in agg.items():
458
+ if v["column_names"]:
459
+ out[t] = v
460
+
461
+ return out
462
+
463
+ async def _fetch_full_schema(
464
+ self,
465
+ client_id: str,
466
+ table_names: list[str],
467
+ user_id: str,
468
+ ) -> dict[str, list[dict[str, Any]]]:
469
+ """Fetch ALL column chunks for the given tables from PGVector.
470
+
471
+ Returns {table_name: [{"name": ..., "type": ..., "is_primary_key": ...,
472
+ "foreign_key": ..., "content": ...}]}
473
+ """
474
+ placeholders = ", ".join(f":t{i}" for i in range(len(table_names)))
475
+ sql = text(f"""
476
+ SELECT lpe.cmetadata, lpe.document
477
+ FROM langchain_pg_embedding lpe
478
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
479
+ WHERE lpc.name = 'document_embeddings'
480
+ AND lpe.cmetadata->>'user_id' = :user_id
481
+ AND lpe.cmetadata->>'source_type' = 'database'
482
+ AND lpe.cmetadata->>'chunk_level' = 'column'
483
+ AND lpe.cmetadata->>'database_client_id' = :client_id
484
+ AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
485
+ ORDER BY lpe.cmetadata->'data'->>'table_name', lpe.cmetadata->'data'->>'column_name'
486
+ """)
487
+
488
+ params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
489
+ for i, name in enumerate(table_names):
490
+ params[f"t{i}"] = name
491
+
492
+ async with _pgvector_engine.connect() as conn:
493
+ result = await conn.execute(sql, params)
494
+ rows = result.fetchall()
495
+
496
+ schema: dict[str, list[dict[str, Any]]] = defaultdict(list)
497
+ for row in rows:
498
+ data = row.cmetadata.get("data", {})
499
+ table = data.get("table_name")
500
+ if table:
501
+ schema[table].append({
502
+ "name": data.get("column_name", ""),
503
+ "type": data.get("column_type", ""),
504
+ "is_primary_key": data.get("is_primary_key", False),
505
+ "foreign_key": data.get("foreign_key"),
506
+ "content": row.document, # chunk text includes top values / samples
507
+ })
508
+ return dict(schema)
509
+
510
+ def _build_schema_context(
511
+ self,
512
+ schema: dict[str, list[dict[str, Any]]],
513
+ related_schema: dict[str, dict[str, Any]] | None = None,
514
+ ) -> str:
515
+ lines: list[str] = []
516
+ for table, columns in schema.items():
517
+ lines.append(f"Table: {table}")
518
+ for col in columns:
519
+ flags = []
520
+ if col["is_primary_key"]:
521
+ flags.append("PRIMARY KEY")
522
+ if col["foreign_key"]:
523
+ flags.append(f"FK -> {col['foreign_key']}")
524
+ flag_str = f" [{', '.join(flags)}]" if flags else ""
525
+ lines.append(f" - {col['name']} {col['type']}{flag_str}")
526
+ # Include sample/top-values line from chunk content if present
527
+ for line in col["content"].splitlines():
528
+ if line.startswith(("Top values:", "Sample values:")):
529
+ lines.append(f" {line}")
530
+ break
531
+ lines.append("")
532
+
533
+ related_block = self._build_related_schema_block(related_schema or {})
534
+ if related_block:
535
+ lines.append(related_block)
536
+
537
+ return "\n".join(lines).strip()
538
+
539
+ def _build_related_schema_block(self, related_schema: dict[str, dict[str, Any]]) -> str:
540
+ """Format the abbreviated FK-related-tables section. Empty string when no related."""
541
+ if not related_schema:
542
+ return ""
543
+ lines: list[str] = ["Related tables (one hop via FK, abbreviated — use for JOINs only):"]
544
+ for table, info in related_schema.items():
545
+ row_count = info.get("row_count")
546
+ header = f"- {table} ({row_count} rows)" if row_count is not None else f"- {table}"
547
+ lines.append(header)
548
+ pk = info.get("primary_key") or []
549
+ lines.append(f" Primary key: {', '.join(pk) if pk else '(none)'}")
550
+ fks = info.get("foreign_keys") or []
551
+ if fks:
552
+ fk_strs = [
553
+ f"{fk.get('column')} -> {fk.get('target_table')}.{fk.get('target_column')}"
554
+ for fk in fks
555
+ ]
556
+ lines.append(f" Foreign keys: {', '.join(fk_strs)}")
557
+ else:
558
+ lines.append(" Foreign keys: (none)")
559
+ cols = info.get("column_names") or []
560
+ lines.append(f" Columns: {', '.join(cols)}")
561
+ return "\n".join(lines)
562
+
563
+ # ------------------------------------------------------------------
564
+ # Guardrails
565
+ # ------------------------------------------------------------------
566
+
567
+ def _validate(
568
+ self,
569
+ sql: str,
570
+ allowed_tables: set[str],
571
+ limit: int,
572
+ column_map: dict[str, set[str]] | None = None,
573
+ ) -> str:
574
+ """Return an error string if validation fails, empty string if OK.
575
+
576
+ `allowed_tables` is the union of hit-table names and FK-related table
577
+ names — both are legal targets for SELECT/JOIN.
578
+
579
+ `column_map` maps table_name → set of valid column names. When provided,
580
+ any qualified table.column reference not found in the map triggers a retry
581
+ with an informative error so the LLM can self-correct without hallucinating.
582
+ """
583
+ # Layer 1: sqlglot parse + SELECT-only check
584
+ try:
585
+ parsed = sqlglot.parse_one(sql)
586
+ except sqlglot.errors.ParseError as e:
587
+ return f"SQL parse error: {e}"
588
+
589
+ if not isinstance(parsed, exp.Select):
590
+ return f"Only SELECT statements are allowed. Got: {type(parsed).__name__}"
591
+
592
+ # Check for DML anywhere in the AST (including writeable CTEs)
593
+ for node in parsed.find_all((exp.Insert, exp.Update, exp.Delete)):
594
+ return f"DML ({type(node).__name__}) is not allowed."
595
+
596
+ # Layer 2: schema grounding — table names
597
+ known_tables = {t.lower() for t in allowed_tables}
598
+ alias_to_table: dict[str, str] = {}
599
+ for tbl in parsed.find_all(exp.Table):
600
+ name = tbl.name.lower()
601
+ if name and name not in known_tables:
602
+ return f"Unknown table '{tbl.name}'. Only use tables from the schema."
603
+ alias = (tbl.alias or tbl.name).lower()
604
+ alias_to_table[alias] = name
605
+
606
+ # Layer 3: column grounding — qualified references only (table.column)
607
+ if column_map:
608
+ normalized_map = {t.lower(): {c.lower() for c in cols} for t, cols in column_map.items()}
609
+ for col_node in parsed.find_all(exp.Column):
610
+ tbl_ref = col_node.table
611
+ if not tbl_ref:
612
+ continue # unqualified — skip, can't resolve without full alias tracking
613
+ tbl_name = alias_to_table.get(tbl_ref.lower(), tbl_ref.lower())
614
+ col_name = col_node.name.lower()
615
+ if tbl_name in normalized_map and col_name not in normalized_map[tbl_name]:
616
+ available = ", ".join(sorted(normalized_map[tbl_name]))
617
+ return (
618
+ f"Column '{col_node.name}' does not exist on table '{tbl_name}'. "
619
+ f"Available columns: {available}."
620
+ )
621
+
622
+ # Layer 4: LIMIT enforcement (inject if missing — done before execution)
623
+ return ""
624
+
625
+ # ------------------------------------------------------------------
626
+ # SQL execution
627
+ # ------------------------------------------------------------------
628
+
629
+ def _enforce_limit(self, sql: str, limit: int) -> str:
630
+ """Inject or cap LIMIT using sqlglot AST manipulation."""
631
+ parsed = sqlglot.parse_one(sql)
632
+ existing = parsed.find(exp.Limit)
633
+ if existing:
634
+ current = int(existing.expression.this)
635
+ if current > limit:
636
+ return parsed.limit(limit).sql()
637
+ else:
638
+ return parsed.limit(limit).sql()
639
+ return parsed.sql()
640
+
641
+ def _run_sql(self, engine: Any, sql: str) -> list[dict]:
642
+ # Ensure the user DB connection is a read-only credential — sqlglot validation alone is not sufficient.
643
+ with engine.connect() as conn:
644
+ result = conn.execute(text(sql))
645
+ return [dict(row) for row in result.mappings()]
646
+
647
+
648
+ db_executor = DbExecutor()
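Reviewer note: to make the guardrail layers easier to check in isolation, here is a runnable sketch of the same sqlglot pattern (SELECT-only parse, table grounding, LIMIT capping) outside the class. The `check_select_only`/`cap_limit` helper names and the `orders` table are illustrative, not part of the PR.

```python
"""Standalone sketch of the sqlglot guardrail pattern used by DbExecutor (illustrative names)."""
import sqlglot
import sqlglot.expressions as exp


def check_select_only(sql: str, allowed_tables: set[str]) -> str:
    """Return an error message, or "" when the statement passes."""
    try:
        parsed = sqlglot.parse_one(sql)
    except sqlglot.errors.ParseError as e:
        return f"SQL parse error: {e}"
    if not isinstance(parsed, exp.Select):
        return f"Only SELECT statements are allowed. Got: {type(parsed).__name__}"
    for tbl in parsed.find_all(exp.Table):
        if tbl.name.lower() not in allowed_tables:
            return f"Unknown table '{tbl.name}'."
    return ""


def cap_limit(sql: str, limit: int) -> str:
    """Inject or tighten the LIMIT clause via the AST, mirroring _enforce_limit."""
    parsed = sqlglot.parse_one(sql)
    existing = parsed.find(exp.Limit)
    if existing is None or int(existing.expression.this) > limit:
        return parsed.limit(limit).sql()
    return parsed.sql()


print(check_select_only("SELECT id FROM orders", {"orders"}))  # passes -> ""
print(check_select_only("DELETE FROM orders", {"orders"}))     # rejected: not a SELECT
print(cap_limit("SELECT id FROM orders", 100))                  # LIMIT 100 injected
```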
src/query/executors/tabular.py ADDED
@@ -0,0 +1,287 @@
1
+ """Executor for tabular document sources (source_type="document", file_type csv/xlsx).
2
+
3
+ Flow:
4
+ 1. Group RetrievalResult chunks by (document_id, sheet_name).
5
+ 2. Per group: download Parquet from Azure Blob → pandas DataFrame.
6
+ 3. Build schema context from DataFrame columns + sample values.
7
+ 4. LLM decides operation (groupby_sum, filter, top_n, etc.) via structured output.
8
+ 5. Pandas runs the operation; retry up to 3x on error with feedback to LLM.
9
+ 6. Fall back to raw rows if all retries fail.
10
+ 7. Return QueryResult per group.
11
+ """
12
+ import asyncio
13
+ from typing import Literal, TypedDict
14
+
15
+ import pandas as pd
16
+ from langchain_core.prompts import ChatPromptTemplate
17
+ from langchain_openai import AzureChatOpenAI
18
+ from pydantic import BaseModel
19
+ from sqlalchemy.ext.asyncio import AsyncSession
20
+
21
+ from src.config.settings import settings
22
+ from src.knowledge.parquet_service import download_parquet
23
+ from src.middlewares.logging import get_logger
24
+ from src.query.base import BaseExecutor, QueryResult
25
+ from src.rag.base import RetrievalResult
26
+
27
+ logger = get_logger("tabular_executor")
28
+
29
+
30
+ class _GroupInfo(TypedDict):
31
+ filename: str
32
+ file_type: str
33
+
34
+
35
+ _TABULAR_FILE_TYPES = ("csv", "xlsx")
36
+ _MAX_RETRIES = 3
37
+
38
+ _SYSTEM_PROMPT = """\
39
+ You are a data analyst. Given a DataFrame schema and a user question, \
40
+ decide which pandas operation to perform.
41
+
42
+ IMPORTANT rules:
43
+ - Use ONLY the exact column names as written in the schema below. Never translate or rename them.
44
+ - For top_n: always set value_col to the column to sort by. Do NOT use sort_col for top_n.
45
+ - For sort: use sort_col for the column to sort by.
46
+ - For filter with comparison (>, <, >=, <=, !=): set filter_operator accordingly (gt, lt, gte, lte, ne). Default is eq (==).
47
+ - For multi-condition filters (AND logic), use the filters field as a list of {{"col", "value", "op"}} dicts instead of filter_col/filter_value.
48
+ Example: status=SUCCESS AND amount_paid>200000 → filters=[{{"col":"status","value":"SUCCESS","op":"eq"}},{{"col":"amount_paid","value":"200000","op":"gt"}}]
49
+ - For OR conditions on a column (e.g. value is A or B), use or_filters. Combine with filters for mixed AND+OR logic.
50
+ Example: (status=FAILED OR status=REVERSED) AND payment_channel=X → or_filters=[{{"col":"status","value":"FAILED","op":"eq"}},{{"col":"status","value":"REVERSED","op":"eq"}}], filters=[{{"col":"payment_channel","value":"X","op":"eq"}}]
51
+ - For groupby with a pre-filter (e.g. count SUCCESS per channel): use filters or or_filters to narrow rows first, then use groupby_count/groupby_sum/groupby_avg on the filtered data by setting both filters and group_col.
52
+
53
+ Schema:
54
+ {schema}
55
+
56
+ {error_section}"""
57
+
58
+
59
+ class TabularOperation(BaseModel):
60
+ operation: Literal[
61
+ "filter", "groupby_sum", "groupby_avg", "groupby_count",
62
+ "top_n", "sort", "aggregate", "raw"
63
+ ]
64
+ group_col: str | None = None # for groupby_*
65
+ value_col: str | None = None # for groupby_*, top_n, aggregate
66
+ filter_col: str | None = None # for single filter
67
+ filter_value: str | None = None # for single filter
68
+ filter_operator: Literal["eq", "ne", "gt", "gte", "lt", "lte"] = "eq" # for single filter
69
+ filters: list[dict] | None = None # for multi-condition AND: [{"col": ..., "value": ..., "op": ...}]
70
+ or_filters: list[dict] | None = None # for OR conditions, applied before AND filters
71
+ sort_col: str | None = None # for sort
72
+ ascending: bool = True # for sort
73
+ n: int | None = None # for top_n
74
+ agg_func: Literal["sum", "avg", "min", "max", "count"] | None = None # for aggregate
75
+ reasoning: str
76
+
77
+
78
+ def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.Series:
79
+ numeric = pd.to_numeric(df[col], errors="coerce")
80
+ if operator == "eq":
81
+ return df[col].astype(str) == str(value)
82
+ elif operator == "ne":
83
+ return df[col].astype(str) != str(value)
84
+ elif operator == "gt":
85
+ return numeric > float(value)
86
+ elif operator == "gte":
87
+ return numeric >= float(value)
88
+ elif operator == "lt":
89
+ return numeric < float(value)
90
+ elif operator == "lte":
91
+ return numeric <= float(value)
92
+ raise ValueError(f"Unknown operator: {operator}")
93
+
94
+
95
+ def _apply_single_filter(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.DataFrame:
96
+ return df[_get_filter_mask(df, col, value, operator)]
97
+
98
+
99
+ def _build_schema_context(df: pd.DataFrame) -> str:
100
+ lines = []
101
+ for col in df.columns:
102
+ sample = df[col].dropna().head(3).tolist()
103
+ lines.append(f"- {col} ({df[col].dtype}): sample values: {sample}")
104
+ return "\n".join(lines)
105
+
106
+
107
+ def _apply_operation(df: pd.DataFrame, op: TabularOperation, limit: int) -> pd.DataFrame:
108
+ if op.operation == "groupby_sum":
109
+ if not op.group_col or not op.value_col:
110
+ raise ValueError(f"groupby_sum requires group_col and value_col, got {op}")
111
+ return df.groupby(op.group_col)[op.value_col].sum().reset_index().nlargest(limit, op.value_col)
112
+ elif op.operation == "groupby_avg":
113
+ if not op.group_col or not op.value_col:
114
+ raise ValueError(f"groupby_avg requires group_col and value_col, got {op}")
115
+ return df.groupby(op.group_col)[op.value_col].mean().reset_index().nlargest(limit, op.value_col)
116
+ elif op.operation == "groupby_count":
117
+ if not op.group_col:
118
+ raise ValueError(f"groupby_count requires group_col, got {op}")
119
+ df_filtered = df.copy()
120
+ if op.or_filters:
121
+ or_mask = pd.Series([False] * len(df_filtered), index=df_filtered.index)
122
+ for f in op.or_filters:
123
+ or_mask = or_mask | _get_filter_mask(df_filtered, f["col"], f["value"], f.get("op", "eq"))
124
+ df_filtered = df_filtered[or_mask]
125
+ if op.filters:
126
+ for f in op.filters:
127
+ df_filtered = _apply_single_filter(df_filtered, f["col"], f["value"], f.get("op", "eq"))
128
+ elif op.filter_col and op.filter_value is not None:
129
+ df_filtered = _apply_single_filter(df_filtered, op.filter_col, op.filter_value, op.filter_operator)
130
+ return df_filtered.groupby(op.group_col).size().reset_index(name="count").nlargest(limit, "count")
131
+ elif op.operation == "filter":
132
+ result = df.copy()
133
+ if op.or_filters:
134
+ or_mask = pd.Series([False] * len(result), index=result.index)
135
+ for f in op.or_filters:
136
+ or_mask = or_mask | _get_filter_mask(result, f["col"], f["value"], f.get("op", "eq"))
137
+ result = result[or_mask]
138
+ if op.filters:
139
+ for f in op.filters:
140
+ result = _apply_single_filter(result, f["col"], f["value"], f.get("op", "eq"))
141
+ elif op.filter_col and op.filter_value is not None and not op.or_filters:
142
+ result = _apply_single_filter(result, op.filter_col, op.filter_value, op.filter_operator)
143
+ elif not op.or_filters and not op.filters and (not op.filter_col or op.filter_value is None):
144
+ raise ValueError(f"filter requires filter_col/filter_value or filters or or_filters, got {op}")
145
+ return result.head(limit)
146
+ elif op.operation == "top_n":
147
+ col = op.value_col
148
+ if not col:
149
+ raise ValueError(f"top_n requires value_col, got {op}")
150
+ n = op.n or limit
151
+ return df.nlargest(n, col)
152
+ elif op.operation == "sort":
153
+ if not op.sort_col:
154
+ raise ValueError(f"sort requires sort_col, got {op}")
155
+ return df.sort_values(op.sort_col, ascending=op.ascending).head(limit)
156
+ elif op.operation == "aggregate":
157
+ if not op.value_col or not op.agg_func:
158
+ raise ValueError(f"aggregate requires value_col and agg_func, got {op}")
159
+ funcs = {"sum": "sum", "avg": "mean", "min": "min", "max": "max", "count": "count"}
160
+ value = getattr(df[op.value_col], funcs[op.agg_func])()
161
+ return pd.DataFrame([{op.value_col: value, "operation": op.agg_func}])
162
+ else: # "raw"
163
+ return df.head(limit)
164
+
165
+
166
+ class TabularExecutor(BaseExecutor):
167
+ def __init__(self) -> None:
168
+ self._llm = AzureChatOpenAI(
169
+ azure_deployment=settings.azureai_deployment_name_4o,
170
+ openai_api_version=settings.azureai_api_version_4o,
171
+ azure_endpoint=settings.azureai_endpoint_url_4o,
172
+ api_key=settings.azureai_api_key_4o,
173
+ temperature=0,
174
+ )
175
+ self._prompt = ChatPromptTemplate.from_messages([
176
+ ("system", _SYSTEM_PROMPT),
177
+ ("human", "{question}"),
178
+ ])
179
+ self._chain = self._prompt | self._llm.with_structured_output(TabularOperation)
180
+
181
+ async def execute(
182
+ self,
183
+ results: list[RetrievalResult],
184
+ user_id: str,
185
+ _db: AsyncSession,
186
+ question: str,
187
+ limit: int = 100,
188
+ ) -> list[QueryResult]:
189
+ tabular = [
190
+ r for r in results
191
+ if r.source_type == "document"
192
+ and r.metadata.get("data", {}).get("file_type") in _TABULAR_FILE_TYPES
193
+ ]
194
+
195
+ if not tabular:
196
+ return []
197
+
198
+ # Group by (document_id, sheet_name) — one parquet download per group
199
+ groups: dict[tuple[str, str | None], _GroupInfo] = {}
200
+ for r in tabular:
201
+ data = r.metadata.get("data", {})
202
+ doc_id = data.get("document_id")
203
+ if not doc_id:
204
+ continue
205
+ sheet_name = data.get("sheet_name") # None for CSV
206
+ key = (doc_id, sheet_name)
207
+ if key not in groups:
208
+ groups[key] = {
209
+ "filename": data.get("filename", ""),
210
+ "file_type": data.get("file_type", ""),
211
+ }
212
+
213
+ async def _process_group(
214
+ doc_id: str, sheet_name: str | None, info: _GroupInfo
215
+ ) -> QueryResult | None:
216
+ try:
217
+ df = await download_parquet(user_id, doc_id, sheet_name)
218
+ df_result = await self._query_with_agent(df, question, limit)
219
+
220
+ table_label = info["filename"]
221
+ if sheet_name:
222
+ table_label += f" / sheet: {sheet_name}"
223
+
224
+ logger.info(
225
+ "tabular query complete",
226
+ document_id=doc_id,
227
+ sheet=sheet_name,
228
+ file_type=info["file_type"],
229
+ rows=len(df_result),
230
+ columns=len(df_result.columns),
231
+ )
232
+ return QueryResult(
233
+ source_type="document",
234
+ source_id=doc_id,
235
+ table_or_file=table_label,
236
+ columns=list(df_result.columns),
237
+ rows=df_result.to_dict(orient="records"),
238
+ row_count=len(df_result),
239
+ )
240
+ except Exception as e:
241
+ logger.error(
242
+ "tabular query failed",
243
+ document_id=doc_id,
244
+ sheet=sheet_name,
245
+ error=str(e),
246
+ )
247
+ return None
248
+
249
+ gathered = await asyncio.gather(*[
250
+ _process_group(doc_id, sheet_name, info)
251
+ for (doc_id, sheet_name), info in groups.items()
252
+ ])
253
+ return [r for r in gathered if r is not None]
254
+
255
+ async def _query_with_agent(
256
+ self, df: pd.DataFrame, question: str, limit: int
257
+ ) -> pd.DataFrame:
258
+ schema_ctx = _build_schema_context(df)
259
+ prev_error = ""
260
+
261
+ for attempt in range(_MAX_RETRIES):
262
+ error_section = (
263
+ f"Previous attempt failed: {prev_error}\nFix the issue."
264
+ if prev_error else ""
265
+ )
266
+ try:
267
+ op: TabularOperation = await self._chain.ainvoke({
268
+ "schema": schema_ctx,
269
+ "error_section": error_section,
270
+ "question": question,
271
+ })
272
+ logger.info(
273
+ "tabular operation decided",
274
+ operation=op.operation,
275
+ reasoning=op.reasoning,
276
+ )
277
+ return _apply_operation(df, op, limit)
278
+ except Exception as e:
279
+ prev_error = str(e)
280
+ logger.warning("tabular agent error", attempt=attempt + 1, error=prev_error)
281
+
282
+ # Fallback: return raw rows
283
+ logger.warning("tabular agent failed after retries, returning raw rows")
284
+ return df.head(limit)
285
+
286
+
287
+ tabular_executor = TabularExecutor()
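Reviewer note: the structured-output rules above are easier to follow with a concrete case. A small sketch of what the filters-plus-groupby_count branch of `_apply_operation` computes, written as plain pandas; the sample data is invented.

```python
"""Illustration of the prefilter + groupby_count path TabularExecutor asks the LLM for."""
import pandas as pd

df = pd.DataFrame({
    "status": ["SUCCESS", "SUCCESS", "FAILED", "REVERSED", "SUCCESS"],
    "payment_channel": ["card", "bank", "card", "card", "bank"],
    "amount_paid": [250_000, 120_000, 80_000, 300_000, 410_000],
})

# Equivalent of filters=[{"col": "status", "value": "SUCCESS", "op": "eq"}]
# followed by groupby_count on payment_channel, capped by the limit:
mask = df["status"].astype(str) == "SUCCESS"
result = (
    df[mask]
    .groupby("payment_channel")
    .size()
    .reset_index(name="count")
    .nlargest(10, "count")
)
print(result)  # bank: 2, card: 1
```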
src/query/query_executor.py ADDED
@@ -0,0 +1,42 @@
1
+ """QueryExecutor — dispatches retrieval results to the appropriate executor by source_type."""
2
+
3
+ import asyncio
4
+
5
+ from sqlalchemy.ext.asyncio import AsyncSession
6
+
7
+ from src.middlewares.logging import get_logger
8
+ from src.query.base import QueryResult
9
+ from src.query.executors.db_executor import db_executor
10
+ from src.query.executors.tabular import tabular_executor
11
+ from src.rag.base import RetrievalResult
12
+
13
+ logger = get_logger("query_executor")
14
+
15
+
16
+ class QueryExecutor:
17
+ async def execute(
18
+ self,
19
+ results: list[RetrievalResult],
20
+ user_id: str,
21
+ db: AsyncSession,
22
+ question: str,
23
+ limit: int = 100,
24
+ ) -> list[QueryResult]:
25
+ batches = await asyncio.gather(
26
+ db_executor.execute(results, user_id, db, question, limit),
27
+ tabular_executor.execute(results, user_id, db, question, limit),
28
+ return_exceptions=True,
29
+ )
30
+
31
+ query_results: list[QueryResult] = []
32
+ for batch in batches:
33
+ if isinstance(batch, Exception):
34
+ logger.error("executor failed", error=str(batch))
35
+ continue
36
+ query_results.extend(batch)
37
+
38
+ logger.info("query execution complete", total=len(query_results))
39
+ return query_results
40
+
41
+
42
+ query_executor = QueryExecutor()
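Reviewer note: a sketch of the call shape this dispatcher expects. The `answer` helper is illustrative; the real wiring lives in the chat flow, this only shows how retrieval feeds the executor.

```python
"""Hypothetical call site for the dispatcher (illustrative helper name)."""
from sqlalchemy.ext.asyncio import AsyncSession

from src.query.base import QueryResult
from src.query.query_executor import query_executor
from src.rag.retriever import RetrieverService


async def answer(question: str, user_id: str, db: AsyncSession) -> list[QueryResult]:
    retriever = RetrieverService()
    hits = await retriever.retrieve(question, user_id, db, k=5)
    # Both executors run concurrently; one failing does not drop the other's results.
    return await query_executor.execute(hits, user_id, db, question, limit=100)
```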
src/rag/base.py ADDED
@@ -0,0 +1,20 @@
1
+ """Shared contract for all retriever implementations."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass
5
+ from typing import Any
6
+
7
+
8
+ @dataclass
9
+ class RetrievalResult:
10
+ content: str
11
+ metadata: dict[str, Any]
12
+ score: float
13
+ source_type: str # "document" | "database"
14
+
15
+
16
+ class BaseRetriever(ABC):
17
+ @abstractmethod
18
+ async def retrieve(
19
+ self, query: str, user_id: str, k: int = 5
20
+ ) -> list[RetrievalResult]: ...
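Reviewer note: a sketch of the metadata shape a database-schema hit carries when the executors consume it, matching the cmetadata fields queried elsewhere in this PR (user_id, source_type, database_client_id, chunk_level, data.table_name). All field values below are invented.

```python
"""Shape of a database-schema RetrievalResult as the executors read it (values invented)."""
from src.rag.base import RetrievalResult

hit = RetrievalResult(
    content="Column orders.total_amount: numeric. Top values: 120000, 54000",
    metadata={
        "user_id": "user-123",
        "source_type": "database",
        "database_client_id": "client-abc",
        "chunk_level": "column",
        "data": {"table_name": "orders", "column_name": "total_amount"},
    },
    score=0.83,
    source_type="database",
)
```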
src/rag/retriever.py CHANGED
@@ -1,69 +1,45 @@
1
- """Service for retrieving relevant documents from vector store."""
2
 
3
- import hashlib
4
- import json
5
- from src.db.postgres.vector_store import get_vector_store
6
- from src.db.redis.connection import get_redis
7
  from sqlalchemy.ext.asyncio import AsyncSession
 
8
  from src.middlewares.logging import get_logger
9
- from typing import List, Dict, Any
 
 
 
10
 
11
  logger = get_logger("retriever")
12
 
13
- _RETRIEVAL_CACHE_TTL = 3600 # 1 hour
14
-
15
 
16
  class RetrieverService:
17
- """Service for retrieving relevant documents."""
 
 
 
 
 
 
 
 
18
 
19
  def __init__(self):
20
- self.vector_store = get_vector_store()
 
 
 
21
 
22
  async def retrieve(
23
  self,
24
  query: str,
25
  user_id: str,
26
  db: AsyncSession,
27
- k: int = 5
28
- ) -> List[Dict[str, Any]]:
29
- """Retrieve relevant chunks for a query, scoped to the user's documents.
30
-
31
- Returns:
32
- List of dicts with keys: content, metadata
33
- metadata includes: document_id, user_id, filename, chunk_index, page_label (if PDF)
34
- """
35
  try:
36
- redis = await get_redis()
37
- query_hash = hashlib.md5(query.encode()).hexdigest()
38
- cache_key = f"retrieval:{user_id}:{query_hash}:{k}"
39
-
40
- cached = await redis.get(cache_key)
41
- if cached:
42
- logger.info("Returning cached retrieval results")
43
- return json.loads(cached)
44
-
45
- logger.info(f"Retrieving for user {user_id}, query: {query[:50]}...")
46
-
47
- docs = await self.vector_store.asimilarity_search(
48
- query=query,
49
- k=k,
50
- filter={"user_id": user_id}
51
- )
52
-
53
- results = [
54
- {
55
- "content": doc.page_content,
56
- "metadata": doc.metadata,
57
- }
58
- for doc in docs
59
- ]
60
-
61
- logger.info(f"Retrieved {len(results)} chunks")
62
- await redis.setex(cache_key, _RETRIEVAL_CACHE_TTL, json.dumps(results))
63
- return results
64
-
65
  except Exception as e:
66
- logger.error("Retrieval failed", error=str(e))
67
  return []
68
 
69
 
 
1
+ """Public retrieval API thin wrapper around RetrievalRouter."""
2
 
 
 
 
 
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
+
5
  from src.middlewares.logging import get_logger
6
+ from src.rag.base import RetrievalResult
7
+ from src.rag.retrievers.document import document_retriever
8
+ from src.rag.retrievers.schema import schema_retriever
9
+ from src.rag.router import RetrievalRouter, SourceHint
10
 
11
  logger = get_logger("retriever")
12
 
 
 
13
 
14
  class RetrieverService:
15
+ """Public retrieval service used by chat.py and search tools.
16
+
17
+ Delegates to RetrievalRouter which dispatches based on source_hint.
18
+ Returns RetrievalResult objects directly so downstream consumers
19
+ (db_executor, tabular_executor) can be fed without lossy dict
20
+ conversion. The `db` parameter is accepted for call-site compatibility
21
+ but currently unused — retrieval reads PGVector via _pgvector_engine
22
+ inside each retriever.
23
+ """
24
 
25
  def __init__(self):
26
+ self._router = RetrievalRouter(
27
+ schema_retriever=schema_retriever,
28
+ document_retriever=document_retriever,
29
+ )
30
 
31
  async def retrieve(
32
  self,
33
  query: str,
34
  user_id: str,
35
  db: AsyncSession,
36
+ k: int = 5,
37
+ source_hint: SourceHint = "both",
38
+ ) -> list[RetrievalResult]:
 
 
 
 
 
39
  try:
40
+ return await self._router.retrieve(query, user_id, source_hint, k)
41
  except Exception as e:
42
+ logger.error("retrieval failed", error=str(e))
43
  return []
44
 
45
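Reviewer note: a sketch of the new call shape with `source_hint`. The `"database"` literal is an assumption about the `SourceHint` values defined in src/rag/router.py (not shown in this file); only `"both"` is confirmed by the default above.

```python
"""Illustrative usage of the updated RetrieverService signature."""
from sqlalchemy.ext.asyncio import AsyncSession

from src.rag.retriever import RetrieverService


async def demo(db: AsyncSession) -> None:
    retriever = RetrieverService()
    # Narrow to schema retrieval for a SQL-style question.
    db_hits = await retriever.retrieve(
        "total revenue per region", user_id="user-123", db=db, k=5,
        source_hint="database",  # assumed SourceHint literal; see src/rag/router.py
    )
    # Default hint "both" fans out to document and schema retrievers.
    all_hits = await retriever.retrieve(
        "summarise the onboarding policy", user_id="user-123", db=db, k=5
    )
    print(len(db_hits), len(all_hits))
```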
 
src/rag/retrievers/__init__.py ADDED
File without changes
src/rag/retrievers/baseline.py ADDED
@@ -0,0 +1,76 @@
1
+ """Service for retrieving relevant documents from vector store."""
2
+
3
+ import hashlib
4
+ import json
5
+ from src.db.postgres.vector_store import get_vector_store
6
+ from src.db.redis.connection import get_redis
7
+ from sqlalchemy.ext.asyncio import AsyncSession
8
+ from src.middlewares.logging import get_logger
9
+ from typing import List, Dict, Any
10
+
11
+ logger = get_logger("retriever")
12
+
13
+ _RETRIEVAL_CACHE_TTL = 3600 # 1 hour
14
+
15
+
16
+ class BaselineRetrieverService:
17
+ """Baseline (pre-Phase-1) retriever — preserved for benchmark comparison.
18
+
19
+ Renamed from RetrieverService so it doesn't shadow the production wrapper
20
+ at src/rag/retriever.py. Production code imports from src.rag.retriever;
21
+ benchmark scripts that want this baseline must import explicitly from
22
+ src.rag.retrievers.baseline.
23
+ """
24
+
25
+ def __init__(self):
26
+ self.vector_store = get_vector_store()
27
+
28
+ async def retrieve(
29
+ self,
30
+ query: str,
31
+ user_id: str,
32
+ db: AsyncSession,
33
+ k: int = 5
34
+ ) -> List[Dict[str, Any]]:
35
+ """Retrieve relevant chunks for a query, scoped to the user's documents.
36
+
37
+ Returns:
38
+ List of dicts with keys: content, metadata
39
+ metadata includes: document_id, user_id, filename, chunk_index, page_label (if PDF)
40
+ """
41
+ try:
42
+ redis = await get_redis()
43
+ query_hash = hashlib.md5(query.encode()).hexdigest()
44
+ cache_key = f"retrieval:{user_id}:{query_hash}:{k}"
45
+
46
+ cached = await redis.get(cache_key)
47
+ if cached:
48
+ logger.info("Returning cached retrieval results")
49
+ return json.loads(cached)
50
+
51
+ logger.info(f"Retrieving for user {user_id}, query: {query[:50]}...")
52
+
53
+ docs = await self.vector_store.asimilarity_search(
54
+ query=query,
55
+ k=k,
56
+ filter={"user_id": user_id}
57
+ )
58
+
59
+ results = [
60
+ {
61
+ "content": doc.page_content,
62
+ "metadata": doc.metadata,
63
+ }
64
+ for doc in docs
65
+ ]
66
+
67
+ logger.info(f"Retrieved {len(results)} chunks")
68
+ await redis.setex(cache_key, _RETRIEVAL_CACHE_TTL, json.dumps(results))
69
+ return results
70
+
71
+ except Exception as e:
72
+ logger.error("Retrieval failed", error=str(e))
73
+ return []
74
+
75
+
76
+ baseline_retriever = BaselineRetrieverService()
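Reviewer note: per the docstring above, benchmark code has to import the baseline explicitly and reconcile the two return shapes. A minimal sketch, with the `compare` helper name invented for illustration:

```python
"""Sketch of a benchmark comparing the baseline and router-backed retrieval paths."""
from sqlalchemy.ext.asyncio import AsyncSession

from src.rag.retriever import RetrieverService              # production, returns RetrievalResult
from src.rag.retrievers.baseline import baseline_retriever  # legacy, returns list[dict]


async def compare(query: str, user_id: str, db: AsyncSession) -> tuple[int, int]:
    """Count hits from each path; note the shapes differ (dicts vs RetrievalResult)."""
    old = await baseline_retriever.retrieve(query, user_id, db, k=5)
    new = await RetrieverService().retrieve(query, user_id, db, k=5)
    return len(old), len(new)
```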
src/rag/retrievers/document.py ADDED
@@ -0,0 +1,158 @@
1
+ """Document retriever — handles PDF, DOCX, TXT chunks (source_type="document", non-tabular)."""
2
+
3
+ import math
4
+
5
+ from langchain_postgres import PGVector
6
+ from langchain_postgres.vectorstores import DistanceStrategy
7
+ from langchain_openai import AzureOpenAIEmbeddings
8
+ from sqlalchemy import text
9
+
10
+ from src.config.settings import settings
11
+ from src.db.postgres.connection import _pgvector_engine
12
+ from src.db.postgres.vector_store import get_vector_store
13
+ from src.middlewares.logging import get_logger
14
+ from src.rag.base import BaseRetriever, RetrievalResult
15
+
16
+ logger = get_logger("document_retriever")
17
+
18
+ # Change this one line to switch retrieval method
19
+ # Options: "mmr" | "cosine" | "euclidean" | "inner_product" | "manhattan"
20
+ _RETRIEVAL_METHOD = "mmr"
21
+
22
+ _TABULAR_TYPES = {"csv", "xlsx"}
23
+ _FETCH_K = 20
24
+ _LAMBDA_MULT = 0.5
25
+ _COLLECTION_NAME = "document_embeddings"
26
+
27
+ _embeddings = AzureOpenAIEmbeddings(
28
+ azure_deployment=settings.azureai_deployment_name_embedding,
29
+ openai_api_version=settings.azureai_api_version_embedding,
30
+ azure_endpoint=settings.azureai_endpoint_url_embedding,
31
+ api_key=settings.azureai_api_key_embedding,
32
+ )
33
+
34
+ _euclidean_store = PGVector(
35
+ embeddings=_embeddings,
36
+ connection=_pgvector_engine,
37
+ collection_name=_COLLECTION_NAME,
38
+ distance_strategy=DistanceStrategy.EUCLIDEAN,
39
+ use_jsonb=True,
40
+ async_mode=True,
41
+ create_extension=False,
42
+ )
43
+
44
+ _ip_store = PGVector(
45
+ embeddings=_embeddings,
46
+ connection=_pgvector_engine,
47
+ collection_name=_COLLECTION_NAME,
48
+ distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
49
+ use_jsonb=True,
50
+ async_mode=True,
51
+ create_extension=False,
52
+ )
53
+
54
+ _MANHATTAN_SQL = text("""
55
+ SELECT
56
+ lpe.document,
57
+ lpe.cmetadata,
58
+ lpe.embedding <+> CAST(:embedding AS vector) AS distance
59
+ FROM langchain_pg_embedding lpe
60
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
61
+ WHERE lpc.name = :collection
62
+ AND lpe.cmetadata->>'user_id' = :user_id
63
+ AND lpe.cmetadata->>'source_type' = 'document'
64
+ ORDER BY distance ASC
65
+ LIMIT :k
66
+ """)
67
+
68
+
69
+ class DocumentRetriever(BaseRetriever):
70
+ def __init__(self) -> None:
71
+ self.vector_store = get_vector_store()
72
+
73
+ async def retrieve(
74
+ self, query: str, user_id: str, k: int = 5
75
+ ) -> list[RetrievalResult]:
76
+ filter_ = {"user_id": user_id, "source_type": "document"}
77
+ fetch_k = k + len(_TABULAR_TYPES)
78
+
79
+ if _RETRIEVAL_METHOD == "manhattan":
80
+ return await self._retrieve_manhattan(query, user_id, k, fetch_k)
81
+
82
+ if _RETRIEVAL_METHOD == "mmr":
83
+ docs = await self.vector_store.amax_marginal_relevance_search(
84
+ query=query,
85
+ k=fetch_k,
86
+ fetch_k=_FETCH_K,
87
+ lambda_mult=_LAMBDA_MULT,
88
+ filter=filter_,
89
+ )
90
+ cosine = await self.vector_store.asimilarity_search_with_score(
91
+ query=query, k=fetch_k, filter=filter_,
92
+ )
93
+ score_map = {doc.page_content: score for doc, score in cosine}
94
+ docs_with_scores = [(doc, score_map.get(doc.page_content, 0.0)) for doc in docs]
95
+ elif _RETRIEVAL_METHOD == "euclidean":
96
+ docs_with_scores = await _euclidean_store.asimilarity_search_with_score(
97
+ query=query, k=fetch_k, filter=filter_,
98
+ )
99
+ elif _RETRIEVAL_METHOD == "inner_product":
100
+ docs_with_scores = await _ip_store.asimilarity_search_with_score(
101
+ query=query, k=fetch_k, filter=filter_,
102
+ )
103
+ else: # cosine
104
+ docs_with_scores = await self.vector_store.asimilarity_search_with_score(
105
+ query=query, k=fetch_k, filter=filter_,
106
+ )
107
+
108
+ results = []
109
+ for doc, score in docs_with_scores:
110
+ file_type = doc.metadata.get("data", {}).get("file_type", "")
111
+ if file_type not in _TABULAR_TYPES:
112
+ results.append(RetrievalResult(
113
+ content=doc.page_content,
114
+ metadata=doc.metadata,
115
+ score=score,
116
+ source_type="document",
117
+ ))
118
+ if len(results) == k:
119
+ break
120
+
121
+ logger.info("retrieved chunks", method=_RETRIEVAL_METHOD, count=len(results))
122
+ return results
123
+
124
+ async def _retrieve_manhattan(
125
+ self, query: str, user_id: str, k: int, fetch_k: int
126
+ ) -> list[RetrievalResult]:
127
+ query_vector = await _embeddings.aembed_query(query)
128
+ if not all(math.isfinite(v) for v in query_vector):
129
+ raise ValueError("Embedding vector contains NaN or Infinity values.")
130
+ vector_str = "[" + ",".join(str(v) for v in query_vector) + "]"
131
+
132
+ async with _pgvector_engine.connect() as conn:
133
+ result = await conn.execute(_MANHATTAN_SQL, {
134
+ "embedding": vector_str,
135
+ "collection": _COLLECTION_NAME,
136
+ "user_id": user_id,
137
+ "k": fetch_k,
138
+ })
139
+ rows = result.fetchall()
140
+
141
+ results = []
142
+ for row in rows:
143
+ file_type = row.cmetadata.get("data", {}).get("file_type", "")
144
+ if file_type not in _TABULAR_TYPES:
145
+ results.append(RetrievalResult(
146
+ content=row.document,
147
+ metadata=row.cmetadata,
148
+ score=float(row.distance),
149
+ source_type="document",
150
+ ))
151
+ if len(results) == k:
152
+ break
153
+
154
+ logger.info("retrieved chunks", method="manhattan", count=len(results))
155
+ return results
156
+
157
+
158
+ document_retriever = DocumentRetriever()
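Reviewer note: the method switch above only names the strategies, so here is a quick reference for how each maps onto a pgvector operator, plus the vector-literal formatting `_retrieve_manhattan` builds. The embedding values are invented.

```python
"""Sketch of the pgvector literal and operator mapping behind _RETRIEVAL_METHOD (values invented)."""
import math

query_vector = [0.12, -0.05, 0.33]  # stand-in for the real embedding output
assert all(math.isfinite(v) for v in query_vector)
vector_str = "[" + ",".join(str(v) for v in query_vector) + "]"

# pgvector operators behind each _RETRIEVAL_METHOD value:
#   cosine        -> <=>  (default PGVector store)
#   euclidean     -> <->  (DistanceStrategy.EUCLIDEAN)
#   inner_product -> <#>  (DistanceStrategy.MAX_INNER_PRODUCT)
#   manhattan     -> <+>  (raw SQL above; requires pgvector >= 0.7)
print(vector_str)  # "[0.12,-0.05,0.33]"
```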
src/rag/retrievers/schema.py ADDED
@@ -0,0 +1,411 @@
1
+ """Schema retriever — handles DB schemas (source_type="database") and tabular file
2
+ columns stored as source_type="document" with file_type in ("csv","xlsx").
3
+
4
+ Strategy: hybrid_bm25 — RRF merge of dense cosine search (DB columns + DB tables
5
+ + tabular columns + tabular sheets) and PostgreSQL full-text search (DB columns only).
6
+ Embeds the query once, fans out five legs in parallel.
7
+
8
+ The DB-tables leg surfaces table-level summary chunks (chunk_level='table') as
9
+ a recall signal for multi-table questions: when a relevant table's columns
10
+ don't individually win on similarity, the table chunk can still pull the table
11
+ into the hit set, where db_executor's downstream full-schema fetch picks up
12
+ the per-column detail.
13
+
14
+ FTS requires a GIN index on langchain_pg_embedding.document (created by init_db.py).
15
+ """
16
+
17
+ import asyncio
18
+
19
+ from sqlalchemy import text
20
+
21
+ from src.db.postgres.connection import _pgvector_engine
22
+ from src.db.postgres.vector_store import get_vector_store
23
+ from src.middlewares.logging import get_logger
24
+ from src.rag.base import BaseRetriever, RetrievalResult
25
+
26
+ logger = get_logger("schema_retriever")
27
+
28
+ _TABULAR_FILE_TYPES = ("csv", "xlsx")
29
+ _TABLE_CHUNK_K_MULTIPLIER = 2 # how many table chunks to pull before RRF
30
+
31
+
32
+ class SchemaRetriever(BaseRetriever):
33
+ def __init__(self):
34
+ self.vector_store = get_vector_store()
35
+
36
+ # ------------------------------------------------------------------
37
+ # Internal helpers
38
+ # ------------------------------------------------------------------
39
+
40
+ async def _embed_query(self, query: str) -> list[float]:
41
+ return await asyncio.to_thread(self.vector_store.embeddings.embed_query, query)
42
+
43
+ async def _search_db(
44
+ self, embedding: list[float], user_id: str, k: int
45
+ ) -> list[RetrievalResult]:
46
+ """Cosine vector search over database chunks."""
47
+ emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
48
+
49
+ sql = text(f"""
50
+ SELECT lpe.document, lpe.cmetadata,
51
+ 1.0 - (lpe.embedding <=> '{emb_str}'::vector) AS score
52
+ FROM langchain_pg_embedding lpe
53
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
54
+ WHERE lpc.name = 'document_embeddings'
55
+ AND lpe.cmetadata->>'user_id' = :user_id
56
+ AND lpe.cmetadata->>'source_type' = 'database'
57
+ AND lpe.cmetadata->>'chunk_level' = 'column'
58
+ ORDER BY lpe.embedding <=> '{emb_str}'::vector ASC
59
+ LIMIT :k
60
+ """)
61
+
62
+ async with _pgvector_engine.connect() as conn:
63
+ result = await conn.execute(sql, {"user_id": user_id, "k": k * 4})
64
+ rows = result.fetchall()
65
+
66
+ return [
67
+ RetrievalResult(
68
+ content=row.document,
69
+ metadata=row.cmetadata,
70
+ score=float(row.score),
71
+ source_type="database",
72
+ )
73
+ for row in rows
74
+ ]
75
+
76
+ async def _search_db_tables(
77
+ self, embedding: list[float], user_id: str, k: int
78
+ ) -> list[RetrievalResult]:
79
+ """Cosine vector search over database TABLE-level chunks.
80
+
81
+ Recall channel for multi-table questions. The chunk's content is
82
+ discarded downstream — db_executor only consumes its `data.table_name`
83
+ to seed full-schema fetch.
84
+ """
85
+ emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
86
+
87
+ sql = text(f"""
88
+ SELECT lpe.document, lpe.cmetadata,
89
+ 1.0 - (lpe.embedding <=> '{emb_str}'::vector) AS score
90
+ FROM langchain_pg_embedding lpe
91
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
92
+ WHERE lpc.name = 'document_embeddings'
93
+ AND lpe.cmetadata->>'user_id' = :user_id
94
+ AND lpe.cmetadata->>'source_type' = 'database'
95
+ AND lpe.cmetadata->>'chunk_level' = 'table'
96
+ ORDER BY lpe.embedding <=> '{emb_str}'::vector ASC
97
+ LIMIT :k
98
+ """)
99
+
100
+ async with _pgvector_engine.connect() as conn:
101
+ result = await conn.execute(
102
+ sql, {"user_id": user_id, "k": k * _TABLE_CHUNK_K_MULTIPLIER}
103
+ )
104
+ rows = result.fetchall()
105
+
106
+ return [
107
+ RetrievalResult(
108
+ content=row.document,
109
+ metadata=row.cmetadata,
110
+ score=float(row.score),
111
+ source_type="database",
112
+ )
113
+ for row in rows
114
+ ]
115
+
116
+ async def _search_tabular(
117
+ self, embedding: list[float], user_id: str, k: int
118
+ ) -> list[RetrievalResult]:
119
+ """Cosine vector search over tabular document chunks (csv/xlsx)."""
120
+ emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
121
+
122
+ sql = text(f"""
123
+ SELECT lpe.document, lpe.cmetadata,
124
+ 1.0 - (lpe.embedding <=> '{emb_str}'::vector) AS score
125
+ FROM langchain_pg_embedding lpe
126
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
127
+ WHERE lpc.name = 'document_embeddings'
128
+ AND lpe.cmetadata->>'user_id' = :user_id
129
+ AND lpe.cmetadata->>'source_type' = 'document'
130
+ AND lpe.cmetadata->>'chunk_level' = 'column'
131
+ AND (lpe.cmetadata->'data'->>'file_type' = 'csv'
132
+ OR lpe.cmetadata->'data'->>'file_type' = 'xlsx')
133
+ ORDER BY lpe.embedding <=> '{emb_str}'::vector ASC
134
+ LIMIT :k
135
+ """)
136
+
137
+ async with _pgvector_engine.connect() as conn:
138
+ result = await conn.execute(sql, {"user_id": user_id, "k": k * 4})
139
+ rows = result.fetchall()
140
+
141
+ return [
142
+ RetrievalResult(
143
+ content=row.document,
144
+ metadata=row.cmetadata,
145
+ score=float(row.score),
146
+ source_type="document",
147
+ )
148
+ for row in rows
149
+ ]
150
+
151
+ async def _search_tabular_sheets(
152
+ self, embedding: list[float], user_id: str, k: int
153
+ ) -> list[RetrievalResult]:
154
+ """Leg 5: sheet-level summary chunks from CSV/XLSX files."""
155
+ emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
156
+
157
+ sql = text(f"""
158
+ SELECT lpe.document, lpe.cmetadata,
159
+ 1.0 - (lpe.embedding <=> '{emb_str}'::vector) AS score
160
+ FROM langchain_pg_embedding lpe
161
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
162
+ WHERE lpc.name = 'document_embeddings'
163
+ AND lpe.cmetadata->>'user_id' = :user_id
164
+ AND lpe.cmetadata->>'source_type' = 'document'
165
+ AND lpe.cmetadata->>'chunk_level' = 'sheet'
166
+ AND (lpe.cmetadata->'data'->>'file_type' = 'csv'
167
+ OR lpe.cmetadata->'data'->>'file_type' = 'xlsx')
168
+ ORDER BY lpe.embedding <=> '{emb_str}'::vector ASC
169
+ LIMIT :k
170
+ """)
171
+
172
+ async with _pgvector_engine.connect() as conn:
173
+ result = await conn.execute(sql, {"user_id": user_id, "k": k})
174
+ rows = result.fetchall()
175
+
176
+ return [
177
+ RetrievalResult(
178
+ content=row.document,
179
+ metadata=row.cmetadata,
180
+ score=float(row.score),
181
+ source_type="document",
182
+ )
183
+ for row in rows
184
+ ]
185
+
186
+ async def _search_fts_db(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
187
+ """Full-text search over DB schema chunks using PostgreSQL tsvector."""
188
+ sql = text("""
189
+ SELECT lpe.document, lpe.cmetadata,
190
+ ts_rank(to_tsvector('english', lpe.document),
191
+ plainto_tsquery('english', :query)) AS rank
192
+ FROM langchain_pg_embedding lpe
193
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
194
+ WHERE lpc.name = 'document_embeddings'
195
+ AND lpe.cmetadata->>'user_id' = :user_id
196
+ AND lpe.cmetadata->>'source_type' = 'database'
197
+ AND lpe.cmetadata->>'chunk_level' = 'column'
198
+ AND to_tsvector('english', lpe.document) @@ plainto_tsquery('english', :query)
199
+ ORDER BY rank DESC
200
+ LIMIT :k
201
+ """)
202
+
203
+ async with _pgvector_engine.connect() as conn:
204
+ result = await conn.execute(sql, {"query": query, "user_id": user_id, "k": k})
205
+ rows = result.fetchall()
206
+
207
+ return [
208
+ RetrievalResult(
209
+ content=row.document,
210
+ metadata=row.cmetadata,
211
+ score=float(row.rank),
212
+ source_type="database",
213
+ )
214
+ for row in rows
215
+ ]
216
+
217
+ def _rank_tabular_sheets(
218
+ self,
219
+ sheet_results: list[RetrievalResult],
220
+ column_results: list[RetrievalResult],
221
+ top_k: int,
222
+ k_rrf: int = 60,
223
+ ) -> list[RetrievalResult]:
224
+ """Rank tabular sheets by RRF across two voting legs:
225
+ L1 (primary): sheet-chunk cosine score
226
+ L2 (vote): best column-chunk position per (doc_id, sheet_name)
227
+
228
+ Returns top-k sheet-level RetrievalResults. The full column list of
229
+ each sheet is already in the sheet chunk's data.column_names from
230
+ ingestion, so downstream tabular_executor can read full sheet context.
231
+
232
+ For sheets surfaced by column votes but missing a sheet chunk (rare —
233
+ ingestion always creates one), a minimal stub is returned and
234
+ tabular_executor falls back to reading columns from the parquet.
235
+ """
236
+ # L1: sheets indexed by (doc_id, sheet_name) from sheet chunks
237
+ sheet_index: dict[tuple, RetrievalResult] = {}
238
+ sheet_ranked: list[tuple] = []
239
+ for r in sheet_results:
240
+ d = r.metadata.get("data", {})
241
+ key = (d.get("document_id"), d.get("sheet_name"))
242
+ if key[0] and key not in sheet_index:
243
+ sheet_index[key] = r
244
+ sheet_ranked.append(key)
245
+
246
+ # L2: sheets ranked by first-appearance in column-chunk results
247
+ col_sheet_ranked: list[tuple] = []
248
+ seen: set[tuple] = set()
249
+ for r in column_results:
250
+ d = r.metadata.get("data", {})
251
+ key = (d.get("document_id"), d.get("sheet_name"))
252
+ if key[0] and key not in seen:
253
+ col_sheet_ranked.append(key)
254
+ seen.add(key)
255
+
256
+ # RRF over (doc_id, sheet_name) across the two legs
257
+ rrf_scores: dict[tuple, float] = {}
258
+ for ranked_list in [sheet_ranked, col_sheet_ranked]:
259
+ for rank, key in enumerate(ranked_list):
260
+ rrf_scores[key] = rrf_scores.get(key, 0.0) + 1.0 / (k_rrf + rank + 1)
261
+
262
+ top_sheets = sorted(rrf_scores, key=lambda k: rrf_scores[k], reverse=True)[:top_k]
263
+
264
+ results: list[RetrievalResult] = []
265
+ for key in top_sheets:
266
+ if key in sheet_index:
267
+ r = sheet_index[key]
268
+ r.score = rrf_scores[key]
269
+ results.append(r)
270
+ else:
271
+ # Surfaced by column votes only — build stub from a representative
272
+ # column result so tabular_executor can group correctly.
273
+ doc_id, sheet_name = key
274
+ rep = next(
275
+ (r for r in column_results
276
+ if r.metadata.get("data", {}).get("document_id") == doc_id
277
+ and r.metadata.get("data", {}).get("sheet_name") == sheet_name),
278
+ None,
279
+ )
280
+ if rep is None:
281
+ continue
282
+ stub_data = dict(rep.metadata.get("data", {}))
283
+ stub_data.pop("column_name", None)
284
+ stub_data.pop("column_type", None)
285
+ results.append(RetrievalResult(
286
+ content=f"Sheet: {stub_data.get('filename', '')}"
287
+ + (f" / sheet: {sheet_name}" if sheet_name else ""),
288
+ metadata={**rep.metadata, "data": stub_data, "chunk_level": "sheet"},
289
+ score=rrf_scores[key],
290
+ source_type="document",
291
+ ))
292
+ return results
293
+
294
+ def _rank_db_tables(
295
+ self,
296
+ tbl_results: list[RetrievalResult],
297
+ col_results: list[RetrievalResult],
298
+ fts_results: list[RetrievalResult],
299
+ top_k: int,
300
+ k_rrf: int = 60,
301
+ ) -> list[RetrievalResult]:
302
+ """Rank DB tables by RRF across three legs:
303
+ L1 (primary): table-summary chunk similarity
304
+ L2 (vote): best column-chunk position per table
305
+ L3 (vote): best FTS position per table
306
+
307
+ Returns top-k table-chunk RetrievalResults. For tables surfaced by
308
+ L2/L3 but missing a table chunk, a minimal stub is returned so that
309
+ db_executor._fetch_full_schema can seed off data.table_name.
310
+ """
311
+ # L1: tables ranked by table-chunk cosine score
312
+ tbl_index: dict[str, RetrievalResult] = {}
313
+ tbl_ranked: list[str] = []
314
+ for r in tbl_results:
315
+ tname = r.metadata.get("data", {}).get("table_name")
316
+ if tname and tname not in tbl_index:
317
+ tbl_index[tname] = r
318
+ tbl_ranked.append(tname)
319
+
320
+ # L2: tables ranked by first-appearance in column-chunk list (best col score)
321
+ col_table_ranked: list[str] = []
322
+ seen: set[str] = set()
323
+ for r in col_results:
324
+ tname = r.metadata.get("data", {}).get("table_name")
325
+ if tname and tname not in seen:
326
+ col_table_ranked.append(tname)
327
+ seen.add(tname)
328
+
329
+ # L3: tables ranked by first-appearance in FTS list
330
+ fts_table_ranked: list[str] = []
331
+ seen = set()
332
+ for r in fts_results:
333
+ tname = r.metadata.get("data", {}).get("table_name")
334
+ if tname and tname not in seen:
335
+ fts_table_ranked.append(tname)
336
+ seen.add(tname)
337
+
338
+ # RRF over table names across the three legs
339
+ rrf_scores: dict[str, float] = {}
340
+ for ranked_list in [tbl_ranked, col_table_ranked, fts_table_ranked]:
341
+ for rank, tname in enumerate(ranked_list):
342
+ rrf_scores[tname] = rrf_scores.get(tname, 0.0) + 1.0 / (k_rrf + rank + 1)
343
+
344
+ top_tables = sorted(rrf_scores, key=lambda t: rrf_scores[t], reverse=True)[:top_k]
345
+
346
+ results: list[RetrievalResult] = []
347
+ for tname in top_tables:
348
+ if tname in tbl_index:
349
+ r = tbl_index[tname]
350
+ r.score = rrf_scores[tname]
351
+ results.append(r)
352
+ else:
353
+ # Surfaced by column/FTS votes with no table chunk — minimal stub
354
+ results.append(RetrievalResult(
355
+ content=f"Table: {tname}",
356
+ metadata={"data": {"table_name": tname}, "source_type": "database"},
357
+ score=rrf_scores[tname],
358
+ source_type="database",
359
+ ))
360
+ return results
361
+
362
+ # ------------------------------------------------------------------
363
+ # Public interface — called by the router
364
+ # ------------------------------------------------------------------
365
+
366
+ async def retrieve(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
367
+ """Table-first retrieval for DB sources; chunk-level for tabular.
368
+
369
+ DB tables are ranked via RRF across three legs:
370
+ L1 (primary): table-summary chunk similarity
371
+ L2 (vote): top-K column-chunk cosine, grouped by table
372
+ L3 (vote): top-K FTS column hits, grouped by table
373
+
374
+ db_executor downstream fetches the full per-column schema for the
375
+ ranked table set via _fetch_full_schema — the column chunks returned
376
+ here are intentionally NOT used as the schema source, only for voting.
377
+
378
+ Tabular (CSV/XLSX) sheets are ranked via RRF across two legs:
379
+ L1: sheet-chunk cosine
380
+ L2: column-chunk votes (best position per sheet)
381
+ Returns sheet-level RetrievalResults so tabular_executor receives
382
+ full sheet context (all columns) rather than fragmented column hits.
383
+ """
384
+ embedding = await self._embed_query(query)
385
+ db_col_results, db_tbl_results, tabular_results, fts_results, sheet_results = await asyncio.gather(
386
+ self._search_db(embedding, user_id, k),
387
+ self._search_db_tables(embedding, user_id, k),
388
+ self._search_tabular(embedding, user_id, k),
389
+ self._search_fts_db(query, user_id, k * 4),
390
+ self._search_tabular_sheets(embedding, user_id, k),
391
+ )
392
+
393
+ db_ranked = self._rank_db_tables(db_tbl_results, db_col_results, fts_results, top_k=k)
394
+ tabular_ranked = self._rank_tabular_sheets(sheet_results, tabular_results, top_k=k)
395
+
396
+ results = sorted(db_ranked + tabular_ranked, key=lambda r: r.score, reverse=True)
397
+ logger.info(
398
+ "schema retrieval",
399
+ count=len(results),
400
+ db_tables_ranked=len(db_ranked),
401
+ db_cols=len(db_col_results),
402
+ db_tables=len(db_tbl_results),
403
+ tabular_cols=len(tabular_results),
404
+ tabular_sheets=len(sheet_results),
405
+ tabular_ranked=len(tabular_ranked),
406
+ fts=len(fts_results),
407
+ )
408
+ return results
409
+
410
+
411
+ schema_retriever = SchemaRetriever()
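For readers following the three-leg ranking in `_rank_db_tables` and `retrieve` above, here is a minimal standalone sketch of the Reciprocal Rank Fusion arithmetic; the table names and leg orderings are illustrative only, not taken from this PR.

    # Minimal RRF sketch; mirrors rrf_scores[...] += 1.0 / (k_rrf + rank + 1)
    def rrf(ranked_lists: list[list[str]], k_rrf: int = 60) -> dict[str, float]:
        scores: dict[str, float] = {}
        for ranked in ranked_lists:
            for rank, name in enumerate(ranked):
                scores[name] = scores.get(name, 0.0) + 1.0 / (k_rrf + rank + 1)
        return scores

    table_leg  = ["orders", "customers"]   # L1: table-summary cosine order
    column_leg = ["orders", "payments"]    # L2: best column-chunk hit per table
    fts_leg    = ["payments", "orders"]    # L3: best FTS hit per table

    scores = rrf([table_leg, column_leg, fts_leg])
    # "orders" sits near the top of every leg, so it wins:
    # orders ~ 0.0489, payments ~ 0.0325, customers ~ 0.0161
    top_tables = sorted(scores, key=scores.get, reverse=True)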
src/rag/router.py ADDED
@@ -0,0 +1,179 @@
1
+ """Routes retrieval requests to the appropriate retriever based on source_hint.
2
+
3
+ Cross-retriever merging uses Reciprocal Rank Fusion (RRF) on per-retriever
4
+ ranked lists — score scales differ across retrievers (RRF, cosine, distance)
5
+ and aren't directly comparable, so we rank-merge instead of score-merge.
6
+ """
7
+
8
+ import asyncio
9
+ import hashlib
10
+ import json
11
+ from dataclasses import asdict
12
+ from typing import Literal
13
+
14
+ from src.db.redis.connection import get_redis
15
+ from src.middlewares.logging import get_logger
16
+ from src.rag.base import BaseRetriever, RetrievalResult
17
+
18
+ logger = get_logger("retrieval_router")
19
+
20
+ _CACHE_TTL = 3600 # 1 hour
21
+ _CACHE_KEY_PREFIX = "retrieval"
22
+ _RRF_K = 60 # standard RRF constant
23
+ SourceHint = Literal["document", "schema", "both"]
24
+
25
+
26
+ def _result_dedup_key(r: RetrievalResult) -> tuple:
27
+ """Cross-retriever dedup key — distinguishes DB columns vs DB tables vs
28
+ tabular columns vs prose chunks vs sheet-level chunks."""
29
+ data = r.metadata.get("data", {})
30
+ return (
31
+ r.source_type,
32
+ data.get("table_name"),
33
+ data.get("column_name"),
34
+ data.get("filename"),
35
+ data.get("sheet_name"),
36
+ data.get("chunk_index"), # disambiguates multiple prose chunks per doc
37
+ r.metadata.get("chunk_level"), # distinguishes sheet vs column chunks
38
+ )
39
+
40
+
41
+ def _rrf_merge(
42
+ ranked_lists: list[list[RetrievalResult]],
43
+ top_k: int,
44
+ k_rrf: int = _RRF_K,
45
+ ) -> list[RetrievalResult]:
46
+ """Reciprocal Rank Fusion across retriever batches.
47
+
48
+ Each input list is treated as already best-first ordered. Items are
49
+ deduped via _result_dedup_key and re-ranked by aggregated reciprocal
50
+ rank across all lists. Score on the returned RetrievalResult is the
51
+ aggregated RRF score (uniform scale across legs).
52
+ """
53
+ scores: dict[tuple, float] = {}
54
+ index: dict[tuple, RetrievalResult] = {}
55
+
56
+ for ranked in ranked_lists:
57
+ for rank, result in enumerate(ranked):
58
+ key = _result_dedup_key(result)
59
+ scores[key] = scores.get(key, 0.0) + 1.0 / (k_rrf + rank + 1)
60
+ # Keep the first occurrence; metadata is identical for the same
61
+ # key across lists, so any copy is fine.
62
+ if key not in index:
63
+ index[key] = result
64
+
65
+ merged = sorted(index.values(), key=lambda r: scores[_result_dedup_key(r)], reverse=True)
66
+ # Overwrite score with RRF score so downstream consumers see a uniform scale.
67
+ for r in merged:
68
+ r.score = scores[_result_dedup_key(r)]
69
+ return merged[:top_k]
70
+
71
+
72
+ async def invalidate_retrieval_cache(user_id: str) -> int:
73
+ """Delete every cached retrieval entry for `user_id`.
74
+
75
+ Called by ingest/upload/delete API handlers after a successful write so
76
+ the next retrieval picks up the new data instead of stale cached top-k.
77
+ Returns the number of keys removed.
78
+ """
79
+ redis = await get_redis()
80
+ pattern = f"{_CACHE_KEY_PREFIX}:{user_id}:*"
81
+ keys = [key async for key in redis.scan_iter(match=pattern)]
82
+ if not keys:
83
+ return 0
84
+ deleted = await redis.delete(*keys)
85
+ logger.info("retrieval cache invalidated", user_id=user_id, deleted=deleted)
86
+ return int(deleted)
87
+
88
+
89
+ class RetrievalRouter:
90
+ def __init__(
91
+ self,
92
+ schema_retriever: BaseRetriever,
93
+ document_retriever: BaseRetriever,
94
+ ):
95
+ self._retrievers: dict[str, BaseRetriever] = {
96
+ "schema": schema_retriever,
97
+ "document": document_retriever,
98
+ }
99
+
100
+ def _route(self, source_hint: SourceHint) -> list[tuple[str, BaseRetriever]]:
101
+ if source_hint == "schema":
102
+ return [("schema", self._retrievers["schema"])]
103
+ if source_hint == "document":
104
+ return [("document", self._retrievers["document"])]
105
+ return list(self._retrievers.items())
106
+
107
+ async def retrieve(
108
+ self,
109
+ query: str,
110
+ user_id: str,
111
+ source_hint: SourceHint = "both",
112
+ k: int = 10,
113
+ ) -> list[RetrievalResult]:
114
+ redis = await get_redis()
115
+ query_hash = hashlib.md5(query.encode()).hexdigest()
116
+ cache_key = f"{_CACHE_KEY_PREFIX}:{user_id}:{source_hint}:{query_hash}:{k}"
117
+
118
+ cached = await redis.get(cache_key)
119
+ if cached:
120
+ try:
121
+ raw = json.loads(cached)
122
+ logger.info("returning cached retrieval results", source_hint=source_hint)
123
+ return [RetrievalResult(**r) for r in raw]
124
+ except Exception:
125
+ logger.warning("corrupted retrieval cache, fetching fresh", cache_key=cache_key)
126
+
127
+ results = await self._retrieve_uncached(query, user_id, source_hint, k)
128
+
129
+ # Empty-result fallback: orchestrator may have misclassified intent.
130
+ # Retry once with "both" before giving up. No-op when source_hint is
131
+ # already "both".
132
+ if not results and source_hint != "both":
133
+ logger.warning(
134
+ "empty retrieval, falling back to source_hint='both'",
135
+ original_source_hint=source_hint,
136
+ )
137
+ results = await self._retrieve_uncached(query, user_id, "both", k)
138
+
139
+ await redis.setex(
140
+ cache_key,
141
+ _CACHE_TTL,
142
+ json.dumps([asdict(r) for r in results]),
143
+ )
144
+ return results
145
+
146
+ async def _retrieve_uncached(
147
+ self,
148
+ query: str,
149
+ user_id: str,
150
+ source_hint: SourceHint,
151
+ k: int,
152
+ ) -> list[RetrievalResult]:
153
+ routed = self._route(source_hint)
154
+ batches = await asyncio.gather(
155
+ *[r.retrieve(query, user_id, k) for _, r in routed],
156
+ return_exceptions=True,
157
+ )
158
+
159
+ valid_lists: list[list[RetrievalResult]] = []
160
+ per_retriever: dict[str, int | str] = {}
161
+ for (name, _), batch in zip(routed, batches):
162
+ if isinstance(batch, Exception):
163
+ logger.error("retriever failed", retriever=name, error=str(batch))
164
+ per_retriever[name] = "error"
165
+ continue
166
+ valid_lists.append(batch)
167
+ per_retriever[name] = len(batch)
168
+
169
+ results = _rrf_merge(valid_lists, top_k=k)
170
+
171
+ logger.info(
172
+ "router result",
173
+ source_hint=source_hint,
174
+ per_retriever=per_retriever,
175
+ final_count=len(results),
176
+ top_score=results[0].score if results else None,
177
+ bottom_score=results[-1].score if results else None,
178
+ )
179
+ return results
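To see how the Redis cache key built in `RetrievalRouter.retrieve` lines up with the pattern scanned by `invalidate_retrieval_cache`, here is a small sketch; the user id, query, and k value are made up for illustration.

    import hashlib

    _CACHE_KEY_PREFIX = "retrieval"

    def cache_key(user_id: str, source_hint: str, query: str, k: int) -> str:
        # Same layout as the router: prefix:user:hint:md5(query):k
        query_hash = hashlib.md5(query.encode()).hexdigest()
        return f"{_CACHE_KEY_PREFIX}:{user_id}:{source_hint}:{query_hash}:{k}"

    key = cache_key("user-42", "both", "total revenue by region", 10)
    # -> "retrieval:user-42:both:<md5 hex>:10"
    # invalidate_retrieval_cache("user-42") scans "retrieval:user-42:*",
    # so one upload or delete wipes every hint/query/k combination for that user.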
src/storage/az_blob/az_blob.py CHANGED
@@ -57,6 +57,22 @@ class AzureBlobStorage:
57
  logger.error(f"Failed to download blob {blob_name}", error=str(e))
58
  raise
59

60
  async def delete_file(self, blob_name: str) -> bool:
61
  """Delete file from Azure Blob Storage."""
62
  try:
@@ -71,6 +87,24 @@ class AzureBlobStorage:
71
  logger.error(f"Failed to delete blob {blob_name}", error=str(e))
72
  return False
73

74
 
75
  # Singleton instance
76
  blob_storage = AzureBlobStorage()
 
57
  logger.error(f"Failed to download blob {blob_name}", error=str(e))
58
  raise
59
 
60
+ async def upload_bytes(self, content: bytes, blob_name: str) -> str:
61
+ """Upload bytes to Azure Blob Storage using a specific blob name.
62
+
63
+ Unlike upload_file(), this does not generate a UUID name — caller controls the blob_name.
64
+ Used for Parquet files where the name must be deterministic (derived from document_id).
65
+ """
66
+ try:
67
+ async with self._get_blob_client(blob_name) as blob_client:
68
+ logger.info(f"Uploading bytes to blob {blob_name}")
69
+ await blob_client.upload_blob(content, overwrite=True)
70
+ logger.info(f"Successfully uploaded {blob_name}")
71
+ return blob_name
72
+ except Exception as e:
73
+ logger.error(f"Failed to upload bytes to {blob_name}", error=str(e))
74
+ raise
75
+
76
  async def delete_file(self, blob_name: str) -> bool:
77
  """Delete file from Azure Blob Storage."""
78
  try:
 
87
  logger.error(f"Failed to delete blob {blob_name}", error=str(e))
88
  return False
89
 
90
+ async def delete_blobs_with_prefix(self, prefix: str) -> int:
91
+ """Delete all blobs whose name starts with prefix. Returns count deleted.
92
+
93
+ Used to delete all Parquet files for a document in one call.
94
+ """
95
+ from azure.storage.blob.aio import ContainerClient
96
+ container_url = f"{self.account_url}/{self.container_name}?{self.sas_token}"
97
+ deleted = 0
98
+ try:
99
+ async with ContainerClient.from_container_url(container_url) as container:
100
+ async for blob in container.list_blobs(name_starts_with=prefix):
101
+ await container.delete_blob(blob.name)
102
+ deleted += 1
103
+ logger.info(f"Deleted {deleted} blobs with prefix {prefix}")
104
+ except Exception as e:
105
+ logger.error(f"Failed to delete blobs with prefix {prefix}", error=str(e))
106
+ return deleted
107
+
108
 
109
  # Singleton instance
110
  blob_storage = AzureBlobStorage()
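A short sketch of how the two new blob helpers are meant to pair up over a document's Parquet files; the `parquet/<document_id>/...` naming is an assumption for illustration, the real names come from the parquet service.

    async def replace_parquet(blob_storage, document_id: str, sheets: dict[str, bytes]) -> None:
        prefix = f"parquet/{document_id}/"  # hypothetical deterministic prefix
        for sheet_name, parquet_bytes in sheets.items():
            # upload_bytes keeps the caller-chosen name instead of a random UUID
            await blob_storage.upload_bytes(parquet_bytes, f"{prefix}{sheet_name}.parquet")

    async def drop_parquet(blob_storage, document_id: str) -> int:
        # one call removes every sheet's Parquet file for the document
        return await blob_storage.delete_blobs_with_prefix(f"parquet/{document_id}/")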
src/tools/search.py CHANGED
@@ -34,10 +34,10 @@ async def search_documents(
34
 
35
  formatted_results = []
36
  for result in results:
37
- filename = result["metadata"].get("filename", "Unknown")
38
- page = result["metadata"].get("page_label")
39
  source_label = f"{filename}, p.{page}" if page else filename
40
- formatted_results.append(f"[Source: {source_label}]\n{result['content']}\n")
41
 
42
  return "\n".join(formatted_results)
43
 
 
34
 
35
  formatted_results = []
36
  for result in results:
37
+ filename = result.metadata.get("filename", "Unknown")
38
+ page = result.metadata.get("page_label")
39
  source_label = f"{filename}, p.{page}" if page else filename
40
+ formatted_results.append(f"[Source: {source_label}]\n{result.content}\n")
41
 
42
  return "\n".join(formatted_results)
43
 
src/utils/db_credential_encryption.py ADDED
@@ -0,0 +1,70 @@
1
+ """Fernet encryption utilities for user-registered database credentials.
2
+
3
+ The encryption key is sourced from the `dataeyond__db__credential__key` env variable,
4
+ intentionally separate from the user-auth bcrypt salt (`emarcal__bcrypt__salt`).
5
+
6
+ Usage:
7
+ from src.utils.db_credential_encryption import encrypt_credentials_dict, decrypt_credentials_dict
8
+
9
+ # Before INSERT:
10
+ safe_creds = encrypt_credentials_dict(raw_credentials)
11
+
12
+ # After SELECT:
13
+ plain_creds = decrypt_credentials_dict(row.credentials)
14
+ """
15
+
16
+ from cryptography.fernet import Fernet
17
+ from src.config.settings import settings
18
+
19
+ # Sensitive credential field names that must be encrypted at rest.
20
+ # Covers all supported DB types:
21
+ # - password : postgres, mysql, sqlserver, supabase, snowflake
22
+ # - service_account_json : bigquery
23
+ SENSITIVE_FIELDS: frozenset[str] = frozenset({"password", "service_account_json"})
24
+
25
+
26
+ def _get_cipher() -> Fernet:
27
+ key = settings.dataeyond_db_credential_key
28
+ if not key:
29
+ raise ValueError(
30
+ "dataeyond__db__credential__key is not set. "
31
+ "Generate one with: Fernet.generate_key().decode()"
32
+ )
33
+ return Fernet(key.encode())
34
+
35
+
36
+ def encrypt_credential(value: str) -> str:
37
+ """Encrypt a single credential string value."""
38
+ return _get_cipher().encrypt(value.encode()).decode()
39
+
40
+
41
+ def decrypt_credential(value: str) -> str:
42
+ """Decrypt a single Fernet-encrypted credential string."""
43
+ return _get_cipher().decrypt(value.encode()).decode()
44
+
45
+
46
+ def encrypt_credentials_dict(creds: dict) -> dict:
47
+ """Return a copy of the credentials dict with sensitive fields encrypted.
48
+
49
+ Call this before inserting a new DatabaseClient record.
50
+ """
51
+ cipher = _get_cipher()
52
+ result = dict(creds)
53
+ for field in SENSITIVE_FIELDS:
54
+ if result.get(field):
55
+ result[field] = cipher.encrypt(result[field].encode()).decode()
56
+ return result
57
+
58
+
59
+ def decrypt_credentials_dict(creds: dict) -> dict:
60
+ """Return a copy of the credentials dict with sensitive fields decrypted.
61
+
62
+ Call this after fetching a DatabaseClient record from DB.
63
+ """
64
+ cipher = _get_cipher()
65
+ result = dict(creds)
66
+ for field in SENSITIVE_FIELDS:
67
+ if result.get(field):
68
+ result[field] = cipher.decrypt(result[field].encode()).decode()
69
+ return result
70
+
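A round-trip sketch of the helpers above; the key is generated inline here for illustration, whereas the service reads it from settings.dataeyond_db_credential_key.

    from cryptography.fernet import Fernet

    key = Fernet.generate_key().decode()      # what the error message asks you to generate
    cipher = Fernet(key.encode())

    creds = {"host": "db.internal", "username": "readonly", "password": "s3cret"}

    stored = dict(creds)                       # mirrors encrypt_credentials_dict
    stored["password"] = cipher.encrypt(stored["password"].encode()).decode()
    # the row now carries an opaque Fernet token instead of the plain password

    restored = dict(stored)                    # mirrors decrypt_credentials_dict
    restored["password"] = cipher.decrypt(restored["password"].encode()).decode()
    assert restored == creds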
uv.lock CHANGED
@@ -1,5 +1,5 @@
1
  version = 1
2
- revision = 2
3
  requires-python = "==3.12.*"
4
  resolution-markers = [
5
  "python_full_version >= '3.12.4'",
@@ -39,6 +39,7 @@ dependencies = [
39
  { name = "orjson" },
40
  { name = "pandas" },
41
  { name = "passlib", extra = ["bcrypt"] },
 
42
  { name = "pgvector" },
43
  { name = "plotly" },
44
  { name = "presidio-analyzer" },
@@ -46,10 +47,15 @@ dependencies = [
46
  { name = "prometheus-client" },
47
  { name = "psycopg", extra = ["binary", "pool"] },
48
  { name = "psycopg2" },
 
49
  { name = "pydantic" },
50
  { name = "pydantic-settings" },
51
  { name = "pymongo" },
 
 
52
  { name = "pypdf" },
 
 
53
  { name = "python-docx" },
54
  { name = "python-dotenv" },
55
  { name = "python-multipart" },
@@ -57,8 +63,11 @@ dependencies = [
57
  { name = "redis" },
58
  { name = "sentence-transformers" },
59
  { name = "slowapi" },
 
60
  { name = "spacy" },
61
  { name = "sqlalchemy", extra = ["asyncio"] },
 
 
62
  { name = "sse-starlette" },
63
  { name = "starlette" },
64
  { name = "structlog" },
@@ -80,11 +89,8 @@ dev = [
80
 
81
  [package.dev-dependencies]
82
  dev = [
83
- { name = "mypy" },
84
- { name = "pre-commit" },
85
  { name = "pytest" },
86
  { name = "pytest-asyncio" },
87
- { name = "pytest-cov" },
88
  { name = "ruff" },
89
  ]
90
 
@@ -120,6 +126,7 @@ requires-dist = [
120
  { name = "orjson", specifier = "==3.10.12" },
121
  { name = "pandas", specifier = "==2.2.3" },
122
  { name = "passlib", extras = ["bcrypt"], specifier = "==1.7.4" },
 
123
  { name = "pgvector", specifier = "==0.3.6" },
124
  { name = "plotly", specifier = "==5.24.1" },
125
  { name = "pre-commit", marker = "extra == 'dev'", specifier = "==4.0.1" },
@@ -128,10 +135,15 @@ requires-dist = [
128
  { name = "prometheus-client", specifier = "==0.21.1" },
129
  { name = "psycopg", extras = ["binary", "pool"], specifier = "==3.2.3" },
130
  { name = "psycopg2", specifier = ">=2.9.11" },
 
131
  { name = "pydantic", specifier = "==2.10.3" },
132
  { name = "pydantic-settings", specifier = "==2.7.0" },
133
  { name = "pymongo", specifier = ">=4.14.0" },
 
 
134
  { name = "pypdf", specifier = "==5.1.0" },
 
 
135
  { name = "pytest", marker = "extra == 'dev'", specifier = "==8.3.4" },
136
  { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = "==0.24.0" },
137
  { name = "pytest-cov", marker = "extra == 'dev'", specifier = "==6.0.0" },
@@ -143,8 +155,11 @@ requires-dist = [
143
  { name = "ruff", marker = "extra == 'dev'", specifier = "==0.8.4" },
144
  { name = "sentence-transformers", specifier = "==3.3.1" },
145
  { name = "slowapi", specifier = "==0.1.9" },
 
146
  { name = "spacy", specifier = "==3.8.3" },
147
  { name = "sqlalchemy", extras = ["asyncio"], specifier = "==2.0.36" },
 
 
148
  { name = "sse-starlette", specifier = "==2.1.3" },
149
  { name = "starlette", specifier = "==0.41.3" },
150
  { name = "structlog", specifier = "==24.4.0" },
@@ -156,12 +171,9 @@ provides-extras = ["dev"]
156
 
157
  [package.metadata.requires-dev]
158
  dev = [
159
- { name = "mypy", specifier = "==1.13.0" },
160
- { name = "pre-commit", specifier = "==4.0.1" },
161
- { name = "pytest", specifier = "==8.3.4" },
162
- { name = "pytest-asyncio", specifier = "==0.24.0" },
163
- { name = "pytest-cov", specifier = "==6.0.0" },
164
- { name = "ruff", specifier = "==0.8.4" },
165
  ]
166
 
167
  [[package]]
@@ -280,6 +292,15 @@ wheels = [
280
  { url = "https://files.pythonhosted.org/packages/13/b5/7af0cb920a476dccd612fbc9a21a3745fb29b1fcd74636078db8f7ba294c/APScheduler-3.10.4-py3-none-any.whl", hash = "sha256:fb91e8a768632a4756a585f79ec834e0e27aad5860bac7eaa523d9ccefd87661", size = 59303, upload-time = "2023-08-19T16:44:56.814Z" },
281
  ]
282

283
  [[package]]
284
  name = "asyncpg"
285
  version = "0.30.0"
@@ -428,6 +449,34 @@ wheels = [
428
  { url = "https://files.pythonhosted.org/packages/20/07/fb43edc2ff0a6a367e4a94fc39eb3b85aa1e55e24cc857af2db145ce9f0d/blis-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:f20f7ad69aaffd1ce14fe77de557b6df9b61e0c9e582f75a843715d836b5c8af", size = 6192759, upload-time = "2025-11-17T12:27:56.176Z" },
429
  ]
430

431
  [[package]]
432
  name = "cachetools"
433
  version = "5.5.0"
@@ -941,6 +990,109 @@ wheels = [
941
  { url = "https://files.pythonhosted.org/packages/d5/1f/5f4a3cd9e4440e9d9bc78ad0a91a1c8d46b4d429d5239ebe6793c9fe5c41/fsspec-2026.3.0-py3-none-any.whl", hash = "sha256:d2ceafaad1b3457968ed14efa28798162f1638dbb5d2a6868a2db002a5ee39a4", size = 202595, upload-time = "2026-03-27T19:11:13.595Z" },
942
  ]
943

944
  [[package]]
945
  name = "greenlet"
946
  version = "3.3.2"
@@ -958,6 +1110,41 @@ wheels = [
958
  { url = "https://files.pythonhosted.org/packages/58/2e/fe7f36ff1982d6b10a60d5e0740c759259a7d6d2e1dc41da6d96de32fff6/greenlet-3.3.2-cp312-cp312-win_arm64.whl", hash = "sha256:d3a62fa76a32b462a97198e4c9e99afb9ab375115e74e9a83ce180e7a496f643", size = 230331, upload-time = "2026-02-20T20:17:23.34Z" },
959
  ]
960

961
  [[package]]
962
  name = "h11"
963
  version = "0.16.0"
@@ -1127,6 +1314,15 @@ wheels = [
1127
  { url = "https://files.pythonhosted.org/packages/67/8a/a342b2f0251f3dac4ca17618265d93bf244a2a4d089126e81e4c1056ac50/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bb00b6d26db67a05fe3e12c76edc75f32077fb51deed13822dc648fa373bc19", size = 343768, upload-time = "2026-02-02T12:37:55.055Z" },
1128
  ]
1129

1130
  [[package]]
1131
  name = "joblib"
1132
  version = "1.5.3"
@@ -1954,6 +2150,18 @@ bcrypt = [
1954
  { name = "bcrypt" },
1955
  ]
1956

1957
  [[package]]
1958
  name = "pgvector"
1959
  version = "0.3.6"
@@ -2121,6 +2329,33 @@ wheels = [
2121
  { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" },
2122
  ]
2123

2124
  [[package]]
2125
  name = "psycopg"
2126
  version = "3.2.3"
@@ -2181,6 +2416,42 @@ wheels = [
2181
  { url = "https://files.pythonhosted.org/packages/b5/bf/635fbe5dd10ed200afbbfbe98f8602829252ca1cce81cc48fb25ed8dadc0/psycopg2-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:e03e4a6dbe87ff81540b434f2e5dc2bddad10296db5eea7bdc995bf5f4162938", size = 2713969, upload-time = "2025-10-10T11:10:15.946Z" },
2182
  ]
2183

2184
  [[package]]
2185
  name = "pycparser"
2186
  version = "3.0"
@@ -2310,6 +2581,43 @@ wheels = [
2310
  { url = "https://files.pythonhosted.org/packages/60/4c/33f75713d50d5247f2258405142c0318ff32c6f8976171c4fcae87a9dbdf/pymongo-4.16.0-cp312-cp312-win_arm64.whl", hash = "sha256:dfc320f08ea9a7ec5b2403dc4e8150636f0d6150f4b9792faaae539c88e7db3b", size = 892971, upload-time = "2026-01-07T18:04:35.594Z" },
2311
  ]
2312

2313
  [[package]]
2314
  name = "pyparsing"
2315
  version = "3.3.2"
@@ -2328,6 +2636,28 @@ wheels = [
2328
  { url = "https://files.pythonhosted.org/packages/04/fc/6f52588ac1cb4400a7804ef88d0d4e00cfe57a7ac6793ec3b00de5a8758b/pypdf-5.1.0-py3-none-any.whl", hash = "sha256:3bd4f503f4ebc58bae40d81e81a9176c400cbbac2ba2d877367595fb524dfdfc", size = 297976, upload-time = "2024-10-27T19:46:44.439Z" },
2329
  ]
2330

2331
  [[package]]
2332
  name = "pytest"
2333
  version = "8.3.4"
@@ -2610,6 +2940,18 @@ wheels = [
2610
  { url = "https://files.pythonhosted.org/packages/13/9f/026e18ca7d7766783d779dae5e9c656746c6ede36ef73c6d934aaf4a6dec/ruff-0.8.4-py3-none-win_arm64.whl", hash = "sha256:9183dd615d8df50defa8b1d9a074053891ba39025cf5ae88e8bcb52edcc4bf08", size = 9074500, upload-time = "2024-12-19T13:36:23.92Z" },
2611
  ]
2612

2613
  [[package]]
2614
  name = "safetensors"
2615
  version = "0.7.0"
@@ -2764,6 +3106,60 @@ wheels = [
2764
  { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
2765
  ]
2766

2767
  [[package]]
2768
  name = "spacy"
2769
  version = "3.8.3"
@@ -2842,6 +3238,31 @@ asyncio = [
2842
  { name = "greenlet" },
2843
  ]
2844

2845
  [[package]]
2846
  name = "srsly"
2847
  version = "2.5.3"
@@ -3015,6 +3436,15 @@ wheels = [
3015
  { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
3016
  ]
3017

3018
  [[package]]
3019
  name = "torch"
3020
  version = "2.11.0"
 
1
  version = 1
2
+ revision = 3
3
  requires-python = "==3.12.*"
4
  resolution-markers = [
5
  "python_full_version >= '3.12.4'",
 
39
  { name = "orjson" },
40
  { name = "pandas" },
41
  { name = "passlib", extra = ["bcrypt"] },
42
+ { name = "pdf2image" },
43
  { name = "pgvector" },
44
  { name = "plotly" },
45
  { name = "presidio-analyzer" },
 
47
  { name = "prometheus-client" },
48
  { name = "psycopg", extra = ["binary", "pool"] },
49
  { name = "psycopg2" },
50
+ { name = "pyarrow" },
51
  { name = "pydantic" },
52
  { name = "pydantic-settings" },
53
  { name = "pymongo" },
54
+ { name = "pymssql" },
55
+ { name = "pymysql" },
56
  { name = "pypdf" },
57
+ { name = "pypdf2" },
58
+ { name = "pytesseract" },
59
  { name = "python-docx" },
60
  { name = "python-dotenv" },
61
  { name = "python-multipart" },
 
63
  { name = "redis" },
64
  { name = "sentence-transformers" },
65
  { name = "slowapi" },
66
+ { name = "snowflake-sqlalchemy" },
67
  { name = "spacy" },
68
  { name = "sqlalchemy", extra = ["asyncio"] },
69
+ { name = "sqlalchemy-bigquery" },
70
+ { name = "sqlglot" },
71
  { name = "sse-starlette" },
72
  { name = "starlette" },
73
  { name = "structlog" },
 
89
 
90
  [package.dev-dependencies]
91
  dev = [
 
 
92
  { name = "pytest" },
93
  { name = "pytest-asyncio" },
 
94
  { name = "ruff" },
95
  ]
96
 
 
126
  { name = "orjson", specifier = "==3.10.12" },
127
  { name = "pandas", specifier = "==2.2.3" },
128
  { name = "passlib", extras = ["bcrypt"], specifier = "==1.7.4" },
129
+ { name = "pdf2image", specifier = ">=1.17.0" },
130
  { name = "pgvector", specifier = "==0.3.6" },
131
  { name = "plotly", specifier = "==5.24.1" },
132
  { name = "pre-commit", marker = "extra == 'dev'", specifier = "==4.0.1" },
 
135
  { name = "prometheus-client", specifier = "==0.21.1" },
136
  { name = "psycopg", extras = ["binary", "pool"], specifier = "==3.2.3" },
137
  { name = "psycopg2", specifier = ">=2.9.11" },
138
+ { name = "pyarrow", specifier = ">=24.0.0" },
139
  { name = "pydantic", specifier = "==2.10.3" },
140
  { name = "pydantic-settings", specifier = "==2.7.0" },
141
  { name = "pymongo", specifier = ">=4.14.0" },
142
+ { name = "pymssql", specifier = ">=2.3.0" },
143
+ { name = "pymysql", specifier = ">=1.1.1" },
144
  { name = "pypdf", specifier = "==5.1.0" },
145
+ { name = "pypdf2", specifier = ">=3.0.1" },
146
+ { name = "pytesseract", specifier = ">=0.3.13" },
147
  { name = "pytest", marker = "extra == 'dev'", specifier = "==8.3.4" },
148
  { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = "==0.24.0" },
149
  { name = "pytest-cov", marker = "extra == 'dev'", specifier = "==6.0.0" },
 
155
  { name = "ruff", marker = "extra == 'dev'", specifier = "==0.8.4" },
156
  { name = "sentence-transformers", specifier = "==3.3.1" },
157
  { name = "slowapi", specifier = "==0.1.9" },
158
+ { name = "snowflake-sqlalchemy", specifier = ">=1.7.0" },
159
  { name = "spacy", specifier = "==3.8.3" },
160
  { name = "sqlalchemy", extras = ["asyncio"], specifier = "==2.0.36" },
161
+ { name = "sqlalchemy-bigquery", specifier = ">=1.11.0" },
162
+ { name = "sqlglot", specifier = ">=25.0.0" },
163
  { name = "sse-starlette", specifier = "==2.1.3" },
164
  { name = "starlette", specifier = "==0.41.3" },
165
  { name = "structlog", specifier = "==24.4.0" },
 
171
 
172
  [package.metadata.requires-dev]
173
  dev = [
174
+ { name = "pytest", specifier = ">=8.3.4" },
175
+ { name = "pytest-asyncio", specifier = ">=0.24.0" },
176
+ { name = "ruff", specifier = ">=0.8.4" },
 
 
 
177
  ]
178
 
179
  [[package]]
 
292
  { url = "https://files.pythonhosted.org/packages/13/b5/7af0cb920a476dccd612fbc9a21a3745fb29b1fcd74636078db8f7ba294c/APScheduler-3.10.4-py3-none-any.whl", hash = "sha256:fb91e8a768632a4756a585f79ec834e0e27aad5860bac7eaa523d9ccefd87661", size = 59303, upload-time = "2023-08-19T16:44:56.814Z" },
293
  ]
294
 
295
+ [[package]]
296
+ name = "asn1crypto"
297
+ version = "1.5.1"
298
+ source = { registry = "https://pypi.org/simple" }
299
+ sdist = { url = "https://files.pythonhosted.org/packages/de/cf/d547feed25b5244fcb9392e288ff9fdc3280b10260362fc45d37a798a6ee/asn1crypto-1.5.1.tar.gz", hash = "sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c", size = 121080, upload-time = "2022-03-15T14:46:52.889Z" }
300
+ wheels = [
301
+ { url = "https://files.pythonhosted.org/packages/c9/7f/09065fd9e27da0eda08b4d6897f1c13535066174cc023af248fc2a8d5e5a/asn1crypto-1.5.1-py2.py3-none-any.whl", hash = "sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67", size = 105045, upload-time = "2022-03-15T14:46:51.055Z" },
302
+ ]
303
+
304
  [[package]]
305
  name = "asyncpg"
306
  version = "0.30.0"
 
449
  { url = "https://files.pythonhosted.org/packages/20/07/fb43edc2ff0a6a367e4a94fc39eb3b85aa1e55e24cc857af2db145ce9f0d/blis-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:f20f7ad69aaffd1ce14fe77de557b6df9b61e0c9e582f75a843715d836b5c8af", size = 6192759, upload-time = "2025-11-17T12:27:56.176Z" },
450
  ]
451
 
452
+ [[package]]
453
+ name = "boto3"
454
+ version = "1.42.89"
455
+ source = { registry = "https://pypi.org/simple" }
456
+ dependencies = [
457
+ { name = "botocore" },
458
+ { name = "jmespath" },
459
+ { name = "s3transfer" },
460
+ ]
461
+ sdist = { url = "https://files.pythonhosted.org/packages/bb/0c/f7bccb22b245cabf392816baba20f9e95f78ace7dbc580fd40136e80e732/boto3-1.42.89.tar.gz", hash = "sha256:3e43aacc0801bba9bcd23a8c271c089af297a69565f783fcdd357ae0e330bf1e", size = 113165, upload-time = "2026-04-13T19:36:17.516Z" }
462
+ wheels = [
463
+ { url = "https://files.pythonhosted.org/packages/b9/33/55103ba5ef9975ea54b8d39e69b76eb6e9fded3beae5f01065e26951a3a1/boto3-1.42.89-py3-none-any.whl", hash = "sha256:6204b189f4d0c655535f43d7eaa57ff4e8d965b8463c97e45952291211162932", size = 140556, upload-time = "2026-04-13T19:36:13.894Z" },
464
+ ]
465
+
466
+ [[package]]
467
+ name = "botocore"
468
+ version = "1.42.89"
469
+ source = { registry = "https://pypi.org/simple" }
470
+ dependencies = [
471
+ { name = "jmespath" },
472
+ { name = "python-dateutil" },
473
+ { name = "urllib3" },
474
+ ]
475
+ sdist = { url = "https://files.pythonhosted.org/packages/0f/cc/e6be943efa9051bd15c2ee14077c2b10d6e27c9e9385fc43a03a5c4ed8b5/botocore-1.42.89.tar.gz", hash = "sha256:95ac52f472dad29942f3088b278ab493044516c16dbf9133c975af16527baa99", size = 15206290, upload-time = "2026-04-13T19:36:02.321Z" }
476
+ wheels = [
477
+ { url = "https://files.pythonhosted.org/packages/91/f1/90a7b8eda38b7c3a65ca7ee0075bdf310b6b471cb1b95fab6e8994323a50/botocore-1.42.89-py3-none-any.whl", hash = "sha256:d9b786c8d9db6473063b4cc5be0ba7e6a381082307bd6afb69d4216f9fa95f35", size = 14887287, upload-time = "2026-04-13T19:35:56.677Z" },
478
+ ]
479
+
480
  [[package]]
481
  name = "cachetools"
482
  version = "5.5.0"
 
990
  { url = "https://files.pythonhosted.org/packages/d5/1f/5f4a3cd9e4440e9d9bc78ad0a91a1c8d46b4d429d5239ebe6793c9fe5c41/fsspec-2026.3.0-py3-none-any.whl", hash = "sha256:d2ceafaad1b3457968ed14efa28798162f1638dbb5d2a6868a2db002a5ee39a4", size = 202595, upload-time = "2026-03-27T19:11:13.595Z" },
991
  ]
992
 
993
+ [[package]]
994
+ name = "google-api-core"
995
+ version = "2.30.3"
996
+ source = { registry = "https://pypi.org/simple" }
997
+ dependencies = [
998
+ { name = "google-auth" },
999
+ { name = "googleapis-common-protos" },
1000
+ { name = "proto-plus" },
1001
+ { name = "protobuf" },
1002
+ { name = "requests" },
1003
+ ]
1004
+ sdist = { url = "https://files.pythonhosted.org/packages/16/ce/502a57fb0ec752026d24df1280b162294b22a0afb98a326084f9a979138b/google_api_core-2.30.3.tar.gz", hash = "sha256:e601a37f148585319b26db36e219df68c5d07b6382cff2d580e83404e44d641b", size = 177001, upload-time = "2026-04-10T00:41:28.035Z" }
1005
+ wheels = [
1006
+ { url = "https://files.pythonhosted.org/packages/03/15/e56f351cf6ef1cfea58e6ac226a7318ed1deb2218c4b3cc9bd9e4b786c5a/google_api_core-2.30.3-py3-none-any.whl", hash = "sha256:a85761ba72c444dad5d611c2220633480b2b6be2521eca69cca2dbb3ffd6bfe8", size = 173274, upload-time = "2026-04-09T22:57:16.198Z" },
1007
+ ]
1008
+
1009
+ [package.optional-dependencies]
1010
+ grpc = [
1011
+ { name = "grpcio" },
1012
+ { name = "grpcio-status" },
1013
+ ]
1014
+
1015
+ [[package]]
1016
+ name = "google-auth"
1017
+ version = "2.49.2"
1018
+ source = { registry = "https://pypi.org/simple" }
1019
+ dependencies = [
1020
+ { name = "cryptography" },
1021
+ { name = "pyasn1-modules" },
1022
+ ]
1023
+ sdist = { url = "https://files.pythonhosted.org/packages/c6/fc/e925290a1ad95c975c459e2df070fac2b90954e13a0370ac505dff78cb99/google_auth-2.49.2.tar.gz", hash = "sha256:c1ae38500e73065dcae57355adb6278cf8b5c8e391994ae9cbadbcb9631ab409", size = 333958, upload-time = "2026-04-10T00:41:21.888Z" }
1024
+ wheels = [
1025
+ { url = "https://files.pythonhosted.org/packages/73/76/d241a5c927433420507215df6cac1b1fa4ac0ba7a794df42a84326c68da8/google_auth-2.49.2-py3-none-any.whl", hash = "sha256:c2720924dfc82dedb962c9f52cabb2ab16714fd0a6a707e40561d217574ed6d5", size = 240638, upload-time = "2026-04-10T00:41:14.501Z" },
1026
+ ]
1027
+
1028
+ [[package]]
1029
+ name = "google-cloud-bigquery"
1030
+ version = "3.41.0"
1031
+ source = { registry = "https://pypi.org/simple" }
1032
+ dependencies = [
1033
+ { name = "google-api-core", extra = ["grpc"] },
1034
+ { name = "google-auth" },
1035
+ { name = "google-cloud-core" },
1036
+ { name = "google-resumable-media" },
1037
+ { name = "packaging" },
1038
+ { name = "python-dateutil" },
1039
+ { name = "requests" },
1040
+ ]
1041
+ sdist = { url = "https://files.pythonhosted.org/packages/ce/13/6515c7aab55a4a0cf708ffd309fb9af5bab54c13e32dc22c5acd6497193c/google_cloud_bigquery-3.41.0.tar.gz", hash = "sha256:2217e488b47ed576360c9b2cc07d59d883a54b83167c0ef37f915c26b01a06fe", size = 513434, upload-time = "2026-03-30T22:50:55.347Z" }
1042
+ wheels = [
1043
+ { url = "https://files.pythonhosted.org/packages/40/33/1d3902efadef9194566d499d61507e1f038454e0b55499d2d7f8ab2a4fee/google_cloud_bigquery-3.41.0-py3-none-any.whl", hash = "sha256:2a5b5a737b401cbd824a6e5eac7554100b878668d908e6548836b5d8aaa4dcaa", size = 262343, upload-time = "2026-03-30T22:48:45.444Z" },
1044
+ ]
1045
+
1046
+ [[package]]
1047
+ name = "google-cloud-core"
1048
+ version = "2.5.1"
1049
+ source = { registry = "https://pypi.org/simple" }
1050
+ dependencies = [
1051
+ { name = "google-api-core" },
1052
+ { name = "google-auth" },
1053
+ ]
1054
+ sdist = { url = "https://files.pythonhosted.org/packages/dc/24/6ca08b0a03c7b0c620427503ab00353a4ae806b848b93bcea18b6b76fde6/google_cloud_core-2.5.1.tar.gz", hash = "sha256:3dc94bdec9d05a31d9f355045ed0f369fbc0d8c665076c734f065d729800f811", size = 36078, upload-time = "2026-03-30T22:50:08.057Z" }
1055
+ wheels = [
1056
+ { url = "https://files.pythonhosted.org/packages/73/d9/5bb050cb32826466aa9b25f79e2ca2879fe66cb76782d4ed798dd7506151/google_cloud_core-2.5.1-py3-none-any.whl", hash = "sha256:ea62cdf502c20e3e14be8a32c05ed02113d7bef454e40ff3fab6fe1ec9f1f4e7", size = 29452, upload-time = "2026-03-30T22:48:31.567Z" },
1057
+ ]
1058
+
1059
+ [[package]]
1060
+ name = "google-crc32c"
1061
+ version = "1.8.0"
1062
+ source = { registry = "https://pypi.org/simple" }
1063
+ sdist = { url = "https://files.pythonhosted.org/packages/03/41/4b9c02f99e4c5fb477122cd5437403b552873f014616ac1d19ac8221a58d/google_crc32c-1.8.0.tar.gz", hash = "sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79", size = 14192, upload-time = "2025-12-16T00:35:25.142Z" }
1064
+ wheels = [
1065
+ { url = "https://files.pythonhosted.org/packages/e9/5f/7307325b1198b59324c0fa9807cafb551afb65e831699f2ce211ad5c8240/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:4b8286b659c1335172e39563ab0a768b8015e88e08329fa5321f774275fc3113", size = 31300, upload-time = "2025-12-16T00:21:56.723Z" },
1066
+ { url = "https://files.pythonhosted.org/packages/21/8e/58c0d5d86e2220e6a37befe7e6a94dd2f6006044b1a33edf1ff6d9f7e319/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:2a3dc3318507de089c5384cc74d54318401410f82aa65b2d9cdde9d297aca7cb", size = 30867, upload-time = "2025-12-16T00:38:31.302Z" },
1067
+ { url = "https://files.pythonhosted.org/packages/ce/a9/a780cc66f86335a6019f557a8aaca8fbb970728f0efd2430d15ff1beae0e/google_crc32c-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:14f87e04d613dfa218d6135e81b78272c3b904e2a7053b841481b38a7d901411", size = 33364, upload-time = "2025-12-16T00:40:22.96Z" },
1068
+ { url = "https://files.pythonhosted.org/packages/21/3f/3457ea803db0198c9aaca2dd373750972ce28a26f00544b6b85088811939/google_crc32c-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cb5c869c2923d56cb0c8e6bcdd73c009c36ae39b652dbe46a05eb4ef0ad01454", size = 33740, upload-time = "2025-12-16T00:40:23.96Z" },
1069
+ { url = "https://files.pythonhosted.org/packages/df/c0/87c2073e0c72515bb8733d4eef7b21548e8d189f094b5dad20b0ecaf64f6/google_crc32c-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:3cc0c8912038065eafa603b238abf252e204accab2a704c63b9e14837a854962", size = 34437, upload-time = "2025-12-16T00:35:21.395Z" },
1070
+ ]
1071
+
1072
+ [[package]]
1073
+ name = "google-resumable-media"
1074
+ version = "2.8.2"
1075
+ source = { registry = "https://pypi.org/simple" }
1076
+ dependencies = [
1077
+ { name = "google-crc32c" },
1078
+ ]
1079
+ sdist = { url = "https://files.pythonhosted.org/packages/3f/d1/b1ea14b93b6b78f57fc580125de44e9f593ab88dd2460f1a8a8d18f74754/google_resumable_media-2.8.2.tar.gz", hash = "sha256:f3354a182ebd193ae3f42e3ef95e6c9b10f128320de23ac7637236713b1acd70", size = 2164510, upload-time = "2026-03-30T23:34:25.369Z" }
1080
+ wheels = [
1081
+ { url = "https://files.pythonhosted.org/packages/5e/f8/50bfaf4658431ff9de45c5c3935af7ab01157a4903c603cd0eee6e78e087/google_resumable_media-2.8.2-py3-none-any.whl", hash = "sha256:82b6d8ccd11765268cdd2a2123f417ec806b8eef3000a9a38dfe3033da5fb220", size = 81511, upload-time = "2026-03-30T23:34:09.671Z" },
1082
+ ]
1083
+
1084
+ [[package]]
1085
+ name = "googleapis-common-protos"
1086
+ version = "1.74.0"
1087
+ source = { registry = "https://pypi.org/simple" }
1088
+ dependencies = [
1089
+ { name = "protobuf" },
1090
+ ]
1091
+ sdist = { url = "https://files.pythonhosted.org/packages/20/18/a746c8344152d368a5aac738d4c857012f2c5d1fd2eac7e17b647a7861bd/googleapis_common_protos-1.74.0.tar.gz", hash = "sha256:57971e4eeeba6aad1163c1f0fc88543f965bb49129b8bb55b2b7b26ecab084f1", size = 151254, upload-time = "2026-04-02T21:23:26.679Z" }
1092
+ wheels = [
1093
+ { url = "https://files.pythonhosted.org/packages/b6/b0/be5d3329badb9230b765de6eea66b73abd5944bdeb5afb3562ddcd80ae84/googleapis_common_protos-1.74.0-py3-none-any.whl", hash = "sha256:702216f78610bb510e3f12ac3cafd281b7ac45cc5d86e90ad87e4d301a3426b5", size = 300743, upload-time = "2026-04-02T21:22:49.108Z" },
1094
+ ]
1095
+
1096
  [[package]]
1097
  name = "greenlet"
1098
  version = "3.3.2"
 
1110
  { url = "https://files.pythonhosted.org/packages/58/2e/fe7f36ff1982d6b10a60d5e0740c759259a7d6d2e1dc41da6d96de32fff6/greenlet-3.3.2-cp312-cp312-win_arm64.whl", hash = "sha256:d3a62fa76a32b462a97198e4c9e99afb9ab375115e74e9a83ce180e7a496f643", size = 230331, upload-time = "2026-02-20T20:17:23.34Z" },
1111
  ]
1112
 
1113
+ [[package]]
1114
+ name = "grpcio"
1115
+ version = "1.80.0"
1116
+ source = { registry = "https://pypi.org/simple" }
1117
+ dependencies = [
1118
+ { name = "typing-extensions" },
1119
+ ]
1120
+ sdist = { url = "https://files.pythonhosted.org/packages/b7/48/af6173dbca4454f4637a4678b67f52ca7e0c1ed7d5894d89d434fecede05/grpcio-1.80.0.tar.gz", hash = "sha256:29aca15edd0688c22ba01d7cc01cb000d72b2033f4a3c72a81a19b56fd143257", size = 12978905, upload-time = "2026-03-30T08:49:10.502Z" }
1121
+ wheels = [
1122
+ { url = "https://files.pythonhosted.org/packages/5c/e8/a2b749265eb3415abc94f2e619bbd9e9707bebdda787e61c593004ec927a/grpcio-1.80.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:c624cc9f1008361014378c9d776de7182b11fe8b2e5a81bc69f23a295f2a1ad0", size = 6015616, upload-time = "2026-03-30T08:47:13.428Z" },
1123
+ { url = "https://files.pythonhosted.org/packages/3e/97/b1282161a15d699d1e90c360df18d19165a045ce1c343c7f313f5e8a0b77/grpcio-1.80.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:f49eddcac43c3bf350c0385366a58f36bed8cc2c0ec35ef7b74b49e56552c0c2", size = 12014204, upload-time = "2026-03-30T08:47:15.873Z" },
1124
+ { url = "https://files.pythonhosted.org/packages/6e/5e/d319c6e997b50c155ac5a8cb12f5173d5b42677510e886d250d50264949d/grpcio-1.80.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d334591df610ab94714048e0d5b4f3dd5ad1bee74dfec11eee344220077a79de", size = 6563866, upload-time = "2026-03-30T08:47:18.588Z" },
1125
+ { url = "https://files.pythonhosted.org/packages/ae/f6/fdd975a2cb4d78eb67769a7b3b3830970bfa2e919f1decf724ae4445f42c/grpcio-1.80.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:0cb517eb1d0d0aaf1d87af7cc5b801d686557c1d88b2619f5e31fab3c2315921", size = 7273060, upload-time = "2026-03-30T08:47:21.113Z" },
1126
+ { url = "https://files.pythonhosted.org/packages/db/f0/a3deb5feba60d9538a962913e37bd2e69a195f1c3376a3dd44fe0427e996/grpcio-1.80.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4e78c4ac0d97dc2e569b2f4bcbbb447491167cb358d1a389fc4af71ab6f70411", size = 6782121, upload-time = "2026-03-30T08:47:23.827Z" },
1127
+ { url = "https://files.pythonhosted.org/packages/ca/84/36c6dcfddc093e108141f757c407902a05085e0c328007cb090d56646cdf/grpcio-1.80.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2ed770b4c06984f3b47eb0517b1c69ad0b84ef3f40128f51448433be904634cd", size = 7383811, upload-time = "2026-03-30T08:47:26.517Z" },
1128
+ { url = "https://files.pythonhosted.org/packages/7c/ef/f3a77e3dc5b471a0ec86c564c98d6adfa3510d38f8ee99010410858d591e/grpcio-1.80.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:256507e2f524092f1473071a05e65a5b10d84b82e3ff24c5b571513cfaa61e2f", size = 8393860, upload-time = "2026-03-30T08:47:29.439Z" },
1129
+ { url = "https://files.pythonhosted.org/packages/9b/8d/9d4d27ed7f33d109c50d6b5ce578a9914aa68edab75d65869a17e630a8d1/grpcio-1.80.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9a6284a5d907c37db53350645567c522be314bac859a64a7a5ca63b77bb7958f", size = 7830132, upload-time = "2026-03-30T08:47:33.254Z" },
1130
+ { url = "https://files.pythonhosted.org/packages/14/e4/9990b41c6d7a44e1e9dee8ac11d7a9802ba1378b40d77468a7761d1ad288/grpcio-1.80.0-cp312-cp312-win32.whl", hash = "sha256:c71309cfce2f22be26aa4a847357c502db6c621f1a49825ae98aa0907595b193", size = 4140904, upload-time = "2026-03-30T08:47:35.319Z" },
1131
+ { url = "https://files.pythonhosted.org/packages/2f/2c/296f6138caca1f4b92a31ace4ae1b87dab692fc16a7a3417af3bb3c805bf/grpcio-1.80.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe648599c0e37594c4809d81a9e77bd138cc82eb8baa71b6a86af65426723ff", size = 4880944, upload-time = "2026-03-30T08:47:37.831Z" },
1132
+ ]
1133
+
1134
+ [[package]]
1135
+ name = "grpcio-status"
1136
+ version = "1.80.0"
1137
+ source = { registry = "https://pypi.org/simple" }
1138
+ dependencies = [
1139
+ { name = "googleapis-common-protos" },
1140
+ { name = "grpcio" },
1141
+ { name = "protobuf" },
1142
+ ]
1143
+ sdist = { url = "https://files.pythonhosted.org/packages/b1/ed/105f619bdd00cb47a49aa2feea6232ea2bbb04199d52a22cc6a7d603b5cb/grpcio_status-1.80.0.tar.gz", hash = "sha256:df73802a4c89a3ea88aa2aff971e886fccce162bc2e6511408b3d67a144381cd", size = 13901, upload-time = "2026-03-30T08:54:34.784Z" }
1144
+ wheels = [
1145
+ { url = "https://files.pythonhosted.org/packages/76/80/58cd2dfc19a07d022abe44bde7c365627f6c7cb6f692ada6c65ca437d09a/grpcio_status-1.80.0-py3-none-any.whl", hash = "sha256:4b56990363af50dbf2c2ebb80f1967185c07d87aa25aa2bea45ddb75fc181dbe", size = 14638, upload-time = "2026-03-30T08:54:01.569Z" },
1146
+ ]
1147
+
1148
  [[package]]
1149
  name = "h11"
1150
  version = "0.16.0"
 
1314
  { url = "https://files.pythonhosted.org/packages/67/8a/a342b2f0251f3dac4ca17618265d93bf244a2a4d089126e81e4c1056ac50/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bb00b6d26db67a05fe3e12c76edc75f32077fb51deed13822dc648fa373bc19", size = 343768, upload-time = "2026-02-02T12:37:55.055Z" },
1315
  ]
1316
 
1317
+ [[package]]
1318
+ name = "jmespath"
1319
+ version = "1.1.0"
1320
+ source = { registry = "https://pypi.org/simple" }
1321
+ sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" }
1322
+ wheels = [
1323
+ { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" },
1324
+ ]
1325
+
1326
  [[package]]
1327
  name = "joblib"
1328
  version = "1.5.3"
 
2150
  { name = "bcrypt" },
2151
  ]
2152
 
2153
+ [[package]]
2154
+ name = "pdf2image"
2155
+ version = "1.17.0"
2156
+ source = { registry = "https://pypi.org/simple" }
2157
+ dependencies = [
2158
+ { name = "pillow" },
2159
+ ]
2160
+ sdist = { url = "https://files.pythonhosted.org/packages/00/d8/b280f01045555dc257b8153c00dee3bc75830f91a744cd5f84ef3a0a64b1/pdf2image-1.17.0.tar.gz", hash = "sha256:eaa959bc116b420dd7ec415fcae49b98100dda3dd18cd2fdfa86d09f112f6d57", size = 12811, upload-time = "2024-01-07T20:33:01.965Z" }
2161
+ wheels = [
2162
+ { url = "https://files.pythonhosted.org/packages/62/33/61766ae033518957f877ab246f87ca30a85b778ebaad65b7f74fa7e52988/pdf2image-1.17.0-py3-none-any.whl", hash = "sha256:ecdd58d7afb810dffe21ef2b1bbc057ef434dabbac6c33778a38a3f7744a27e2", size = 11618, upload-time = "2024-01-07T20:32:59.957Z" },
2163
+ ]
2164
+
2165
  [[package]]
2166
  name = "pgvector"
2167
  version = "0.3.6"
 
2329
  { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" },
2330
  ]
2331
 
2332
+ [[package]]
2333
+ name = "proto-plus"
2334
+ version = "1.27.2"
2335
+ source = { registry = "https://pypi.org/simple" }
2336
+ dependencies = [
2337
+ { name = "protobuf" },
2338
+ ]
2339
+ sdist = { url = "https://files.pythonhosted.org/packages/81/0d/94dfe80193e79d55258345901acd2917523d56e8381bc4dee7fd38e3868a/proto_plus-1.27.2.tar.gz", hash = "sha256:b2adde53adadf75737c44d3dcb0104fde65250dfc83ad59168b4aa3e574b6a24", size = 57204, upload-time = "2026-03-26T22:18:57.174Z" }
2340
+ wheels = [
2341
+ { url = "https://files.pythonhosted.org/packages/84/f3/1fba73eeffafc998a25d59703b63f8be4fe8a5cb12eaff7386a0ba0f7125/proto_plus-1.27.2-py3-none-any.whl", hash = "sha256:6432f75893d3b9e70b9c412f1d2f03f65b11fb164b793d14ae2ca01821d22718", size = 50450, upload-time = "2026-03-26T22:13:42.927Z" },
2342
+ ]
2343
+
2344
+ [[package]]
2345
+ name = "protobuf"
2346
+ version = "6.33.6"
2347
+ source = { registry = "https://pypi.org/simple" }
2348
+ sdist = { url = "https://files.pythonhosted.org/packages/66/70/e908e9c5e52ef7c3a6c7902c9dfbb34c7e29c25d2f81ade3856445fd5c94/protobuf-6.33.6.tar.gz", hash = "sha256:a6768d25248312c297558af96a9f9c929e8c4cee0659cb07e780731095f38135", size = 444531, upload-time = "2026-03-18T19:05:00.988Z" }
2349
+ wheels = [
2350
+ { url = "https://files.pythonhosted.org/packages/fc/9f/2f509339e89cfa6f6a4c4ff50438db9ca488dec341f7e454adad60150b00/protobuf-6.33.6-cp310-abi3-win32.whl", hash = "sha256:7d29d9b65f8afef196f8334e80d6bc1d5d4adedb449971fefd3723824e6e77d3", size = 425739, upload-time = "2026-03-18T19:04:48.373Z" },
2351
+ { url = "https://files.pythonhosted.org/packages/76/5d/683efcd4798e0030c1bab27374fd13a89f7c2515fb1f3123efdfaa5eab57/protobuf-6.33.6-cp310-abi3-win_amd64.whl", hash = "sha256:0cd27b587afca21b7cfa59a74dcbd48a50f0a6400cfb59391340ad729d91d326", size = 437089, upload-time = "2026-03-18T19:04:50.381Z" },
2352
+ { url = "https://files.pythonhosted.org/packages/5c/01/a3c3ed5cd186f39e7880f8303cc51385a198a81469d53d0fdecf1f64d929/protobuf-6.33.6-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:9720e6961b251bde64edfdab7d500725a2af5280f3f4c87e57c0208376aa8c3a", size = 427737, upload-time = "2026-03-18T19:04:51.866Z" },
+ { url = "https://files.pythonhosted.org/packages/ee/90/b3c01fdec7d2f627b3a6884243ba328c1217ed2d978def5c12dc50d328a3/protobuf-6.33.6-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:e2afbae9b8e1825e3529f88d514754e094278bb95eadc0e199751cdd9a2e82a2", size = 324610, upload-time = "2026-03-18T19:04:53.096Z" },
+ { url = "https://files.pythonhosted.org/packages/9b/ca/25afc144934014700c52e05103c2421997482d561f3101ff352e1292fb81/protobuf-6.33.6-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:c96c37eec15086b79762ed265d59ab204dabc53056e3443e702d2681f4b39ce3", size = 339381, upload-time = "2026-03-18T19:04:54.616Z" },
+ { url = "https://files.pythonhosted.org/packages/16/92/d1e32e3e0d894fe00b15ce28ad4944ab692713f2e7f0a99787405e43533a/protobuf-6.33.6-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:e9db7e292e0ab79dd108d7f1a94fe31601ce1ee3f7b79e0692043423020b0593", size = 323436, upload-time = "2026-03-18T19:04:55.768Z" },
+ { url = "https://files.pythonhosted.org/packages/c4/72/02445137af02769918a93807b2b7890047c32bfb9f90371cbc12688819eb/protobuf-6.33.6-py3-none-any.whl", hash = "sha256:77179e006c476e69bf8e8ce866640091ec42e1beb80b213c3900006ecfba6901", size = 170656, upload-time = "2026-03-18T19:04:59.826Z" },
+ ]
+
  [[package]]
  name = "psycopg"
  version = "3.2.3"
 
  { url = "https://files.pythonhosted.org/packages/b5/bf/635fbe5dd10ed200afbbfbe98f8602829252ca1cce81cc48fb25ed8dadc0/psycopg2-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:e03e4a6dbe87ff81540b434f2e5dc2bddad10296db5eea7bdc995bf5f4162938", size = 2713969, upload-time = "2025-10-10T11:10:15.946Z" },
  ]
 
+ [[package]]
+ name = "pyarrow"
+ version = "24.0.0"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559, upload-time = "2026-04-21T10:47:22.17Z" },
+ { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654, upload-time = "2026-04-21T10:47:28.315Z" },
+ { url = "https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394, upload-time = "2026-04-21T10:47:34.821Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122, upload-time = "2026-04-21T10:47:42.056Z" },
+ { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032, upload-time = "2026-04-21T10:47:48.967Z" },
+ { url = "https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490, upload-time = "2026-04-21T10:47:55.981Z" },
+ { url = "https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660, upload-time = "2026-04-21T10:48:01.718Z" },
+ ]
+
+ [[package]]
+ name = "pyasn1"
+ version = "0.6.3"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/5c/5f/6583902b6f79b399c9c40674ac384fd9cd77805f9e6205075f828ef11fb2/pyasn1-0.6.3.tar.gz", hash = "sha256:697a8ecd6d98891189184ca1fa05d1bb00e2f84b5977c481452050549c8a72cf", size = 148685, upload-time = "2026-03-17T01:06:53.382Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/5d/a0/7d793dce3fa811fe047d6ae2431c672364b462850c6235ae306c0efd025f/pyasn1-0.6.3-py3-none-any.whl", hash = "sha256:a80184d120f0864a52a073acc6fc642847d0be408e7c7252f31390c0f4eadcde", size = 83997, upload-time = "2026-03-17T01:06:52.036Z" },
+ ]
+
+ [[package]]
+ name = "pyasn1-modules"
+ version = "0.4.2"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "pyasn1" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" },
+ ]
+
  [[package]]
  name = "pycparser"
  version = "3.0"
 
  { url = "https://files.pythonhosted.org/packages/60/4c/33f75713d50d5247f2258405142c0318ff32c6f8976171c4fcae87a9dbdf/pymongo-4.16.0-cp312-cp312-win_arm64.whl", hash = "sha256:dfc320f08ea9a7ec5b2403dc4e8150636f0d6150f4b9792faaae539c88e7db3b", size = 892971, upload-time = "2026-01-07T18:04:35.594Z" },
  ]
 
+ [[package]]
+ name = "pymssql"
+ version = "2.3.13"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/7a/cc/843c044b7f71ee329436b7327c578383e2f2499313899f88ad267cdf1f33/pymssql-2.3.13.tar.gz", hash = "sha256:2137e904b1a65546be4ccb96730a391fcd5a85aab8a0632721feb5d7e39cfbce", size = 203153, upload-time = "2026-02-14T05:00:36.865Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/ba/60/a2e8a8a38f7be21d54402e2b3365cd56f1761ce9f2706c97f864e8aa8300/pymssql-2.3.13-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cf4f32b4a05b66f02cb7d55a0f3bcb0574a6f8cf0bee4bea6f7b104038364733", size = 3158689, upload-time = "2026-02-14T04:59:46.982Z" },
+ { url = "https://files.pythonhosted.org/packages/43/9e/0cf0ffb9e2f73238baf766d8e31d7237b5bee3cc1bb29a376b404610994a/pymssql-2.3.13-cp312-cp312-macosx_15_0_x86_64.whl", hash = "sha256:2b056eb175955f7fb715b60dc1c0c624969f4d24dbdcf804b41ab1e640a2b131", size = 2960018, upload-time = "2026-02-14T04:59:48.668Z" },
+ { url = "https://files.pythonhosted.org/packages/93/ea/bc27354feaca717faa4626911f6b19bb62985c87dda28957c63de4de5895/pymssql-2.3.13-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:319810b89aa64b99d9c5c01518752c813938df230496fa2c4c6dda0603f04c4c", size = 3065719, upload-time = "2026-02-14T04:59:50.369Z" },
+ { url = "https://files.pythonhosted.org/packages/1e/7a/8028681c96241fb5fc850b87c8959402c353e4b83c6e049a99ffa67ded54/pymssql-2.3.13-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0ea72641cb0f8bce7ad8565dbdbda4a7437aa58bce045f2a3a788d71af2e4be", size = 3190567, upload-time = "2026-02-14T04:59:52.202Z" },
+ { url = "https://files.pythonhosted.org/packages/aa/f1/ab5b76adbbd6db9ce746d448db34b044683522e7e7b95053f9dd0165297b/pymssql-2.3.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1493f63d213607f708a5722aa230776ada726ccdb94097fab090a1717a2534e0", size = 3710481, upload-time = "2026-02-14T04:59:54.01Z" },
+ { url = "https://files.pythonhosted.org/packages/59/aa/2fa0951475cd0a1829e0b8bfbe334d04ece4bce11546a556b005c4100689/pymssql-2.3.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eb3275985c23479e952d6462ae6c8b2b6993ab6b99a92805a9c17942cf3d5b3d", size = 3453789, upload-time = "2026-02-14T04:59:56.841Z" },
+ { url = "https://files.pythonhosted.org/packages/78/08/8cd2af9003f9fc03912b658a64f5a4919dcd68f0dd3bbc822b49a3d14fd9/pymssql-2.3.13-cp312-cp312-win_amd64.whl", hash = "sha256:a930adda87bdd8351a5637cf73d6491936f34e525a5e513068a6eac742f69cdb", size = 1994709, upload-time = "2026-02-14T04:59:58.972Z" },
+ ]
+
+ [[package]]
+ name = "pymysql"
+ version = "1.1.2"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/f5/ae/1fe3fcd9f959efa0ebe200b8de88b5a5ce3e767e38c7ac32fb179f16a388/pymysql-1.1.2.tar.gz", hash = "sha256:4961d3e165614ae65014e361811a724e2044ad3ea3739de9903ae7c21f539f03", size = 48258, upload-time = "2025-08-24T12:55:55.146Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/7c/4c/ad33b92b9864cbde84f259d5df035a6447f91891f5be77788e2a3892bce3/pymysql-1.1.2-py3-none-any.whl", hash = "sha256:e6b1d89711dd51f8f74b1631fe08f039e7d76cf67a42a323d3178f0f25762ed9", size = 45300, upload-time = "2025-08-24T12:55:53.394Z" },
+ ]
+
+ [[package]]
+ name = "pyopenssl"
+ version = "25.1.0"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "cryptography" },
+ { name = "typing-extensions" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/04/8c/cd89ad05804f8e3c17dea8f178c3f40eeab5694c30e0c9f5bcd49f576fc3/pyopenssl-25.1.0.tar.gz", hash = "sha256:8d031884482e0c67ee92bf9a4d8cceb08d92aba7136432ffb0703c5280fc205b", size = 179937, upload-time = "2025-05-17T16:28:31.31Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771, upload-time = "2025-05-17T16:28:29.197Z" },
+ ]
+
  [[package]]
  name = "pyparsing"
  version = "3.3.2"
 
  { url = "https://files.pythonhosted.org/packages/04/fc/6f52588ac1cb4400a7804ef88d0d4e00cfe57a7ac6793ec3b00de5a8758b/pypdf-5.1.0-py3-none-any.whl", hash = "sha256:3bd4f503f4ebc58bae40d81e81a9176c400cbbac2ba2d877367595fb524dfdfc", size = 297976, upload-time = "2024-10-27T19:46:44.439Z" },
  ]
 
+ [[package]]
+ name = "pypdf2"
+ version = "3.0.1"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" },
+ ]
+
+ [[package]]
+ name = "pytesseract"
+ version = "0.3.13"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "packaging" },
+ { name = "pillow" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/9f/a6/7d679b83c285974a7cb94d739b461fa7e7a9b17a3abfd7bf6cbc5c2394b0/pytesseract-0.3.13.tar.gz", hash = "sha256:4bf5f880c99406f52a3cfc2633e42d9dc67615e69d8a509d74867d3baddb5db9", size = 17689, upload-time = "2024-08-16T02:33:56.762Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/7a/33/8312d7ce74670c9d39a532b2c246a853861120486be9443eebf048043637/pytesseract-0.3.13-py3-none-any.whl", hash = "sha256:7a99c6c2ac598360693d83a416e36e0b33a67638bb9d77fdcac094a3589d4b34", size = 14705, upload-time = "2024-08-16T02:36:10.09Z" },
+ ]
+
  [[package]]
  name = "pytest"
  version = "8.3.4"
 
  { url = "https://files.pythonhosted.org/packages/13/9f/026e18ca7d7766783d779dae5e9c656746c6ede36ef73c6d934aaf4a6dec/ruff-0.8.4-py3-none-win_arm64.whl", hash = "sha256:9183dd615d8df50defa8b1d9a074053891ba39025cf5ae88e8bcb52edcc4bf08", size = 9074500, upload-time = "2024-12-19T13:36:23.92Z" },
  ]
 
+ [[package]]
+ name = "s3transfer"
+ version = "0.16.0"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "botocore" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/05/04/74127fc843314818edfa81b5540e26dd537353b123a4edc563109d8f17dd/s3transfer-0.16.0.tar.gz", hash = "sha256:8e990f13268025792229cd52fa10cb7163744bf56e719e0b9cb925ab79abf920", size = 153827, upload-time = "2025-12-01T02:30:59.114Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl", hash = "sha256:18e25d66fed509e3868dc1572b3f427ff947dd2c56f844a5bf09481ad3f3b2fe", size = 86830, upload-time = "2025-12-01T02:30:57.729Z" },
+ ]
+
  [[package]]
  name = "safetensors"
  version = "0.7.0"
 
  { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
  ]
 
+ [[package]]
+ name = "snowflake-connector-python"
+ version = "4.0.0"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "asn1crypto" },
+ { name = "boto3" },
+ { name = "botocore" },
+ { name = "certifi" },
+ { name = "charset-normalizer" },
+ { name = "cryptography" },
+ { name = "filelock" },
+ { name = "idna" },
+ { name = "packaging" },
+ { name = "platformdirs" },
+ { name = "pyjwt" },
+ { name = "pyopenssl" },
+ { name = "pytz" },
+ { name = "requests" },
+ { name = "sortedcontainers" },
+ { name = "tomlkit" },
+ { name = "typing-extensions" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/1d/f1/4aff125021a9c5e0183f2f55dd7d04b7256a0e1e10db50d537a7415d9c55/snowflake_connector_python-4.0.0.tar.gz", hash = "sha256:4b10a865c4a5e1fa60c365c7fe41e0433605e6e5edc824e8730a9038f330b3a6", size = 813937, upload-time = "2025-10-09T10:11:34.631Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/ea/b0/462c0deee35d6d03d3d729b3f923615bae665beb7f9a94673a23a52080fe/snowflake_connector_python-4.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bfd3b8523d7adc830f99c5c4c635689ceca61700a05368d5bbb34c6811f2ec54", size = 1029568, upload-time = "2025-10-09T10:11:42.125Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/4b/bb3ae3f07e7927c8f16c4c0f1283d3c721978d16e8bf4193fc8e41025c1e/snowflake_connector_python-4.0.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:835161dd46ef8f5fc9d2f135ca654c2f3fbdf57b035d3e1980506aa8eac671dc", size = 1041337, upload-time = "2025-10-09T10:11:43.692Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/75/4bfac89f10c6dbb75e97adf1e217737fc599ebf964031c9298b6cbd807d0/snowflake_connector_python-4.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65e4e36dd1b0c7235d84cddef8a3c97c5ea0dc8fea85e31e45fc485000b77a83", size = 2699730, upload-time = "2025-10-09T10:11:25.295Z" },
+ { url = "https://files.pythonhosted.org/packages/cd/78/0e916416c50909dbae511fe38b1e671a9efa62decdce51b174a0396804e4/snowflake_connector_python-4.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6132986d6965e4005b0167270612fbc7fa4bc4ef42726a40b85a8f57475a78d", size = 2731336, upload-time = "2025-10-09T10:11:27.028Z" },
+ { url = "https://files.pythonhosted.org/packages/83/f0/3db8a2f3f5ee724d309c661af739a70d0643070b9b4597728151ef900f9b/snowflake_connector_python-4.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a790f06808e4481c23cfed1396d2c9a786060ddd62408b1fda1a63e1e6bc4b07", size = 1176292, upload-time = "2025-10-09T10:11:54.956Z" },
+ ]
+
+ [[package]]
+ name = "snowflake-sqlalchemy"
+ version = "1.9.0"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "snowflake-connector-python" },
+ { name = "sqlalchemy" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/ff/6a/fcc5c00c3a253029a7b7b293a3958ba07d5e97623b643de47be0cc9e5530/snowflake_sqlalchemy-1.9.0.tar.gz", hash = "sha256:fb32baf559f7f933ae8fde2ec535bcea5381bb15188777cd8c006b3226efa3b1", size = 141707, upload-time = "2026-03-04T13:48:17.905Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/88/28/b7ae8df80847e8157b74669ad7e1b0180e82ac0e3daf950612effd232fea/snowflake_sqlalchemy-1.9.0-py3-none-any.whl", hash = "sha256:f0b1528173e93c8c80bd9ca510985054667e0e514dd90b890271ac1cfae261c1", size = 78953, upload-time = "2026-03-04T13:48:16.393Z" },
+ ]
+
+ [[package]]
+ name = "sortedcontainers"
+ version = "2.4.0"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" },
+ ]
+
  [[package]]
  name = "spacy"
  version = "3.8.3"
 
  { name = "greenlet" },
  ]
 
+ [[package]]
+ name = "sqlalchemy-bigquery"
+ version = "1.16.0"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "google-api-core" },
+ { name = "google-auth" },
+ { name = "google-cloud-bigquery" },
+ { name = "packaging" },
+ { name = "sqlalchemy" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/7e/6a/c49932b3d9c44cab9202b1866c5b36b7f0d0455d4653fbc0af4466aeaa76/sqlalchemy_bigquery-1.16.0.tar.gz", hash = "sha256:fe937a0d1f4cf7219fcf5d4995c6718805b38d4df43e29398dec5dc7b6d1987e", size = 119632, upload-time = "2025-11-06T01:35:40.373Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/c0/87/11e6de00ef7949bb8ea06b55304a1a4911c329fdf0d9882b464db240c2c5/sqlalchemy_bigquery-1.16.0-py3-none-any.whl", hash = "sha256:0fe7634cd954f3e74f5e2db6d159f9e5ee87a47fbe8d52eac3cd3bb3dadb3a77", size = 40615, upload-time = "2025-11-06T01:35:39.358Z" },
+ ]
+
+ [[package]]
+ name = "sqlglot"
+ version = "30.6.0"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/3c/66/6ece15f197874e56c76e1d0269cebf284ba992a80dfadca9d1972fdf7edf/sqlglot-30.6.0.tar.gz", hash = "sha256:246d34d39927422a50a3fa155f37b2f6346fba85f1a755b13c941eb32ef93361", size = 5835307, upload-time = "2026-04-20T20:11:08.164Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/dc/e7/64fe971cbca33a0446b06f4a5ff8e3fa4a1dbd0a039ceabcc3e6cf4087a9/sqlglot-30.6.0-py3-none-any.whl", hash = "sha256:e005fc2f47994f90d7d8df341f1cbe937518497b0b7b1507d4c03c4c9dfd2778", size = 673920, upload-time = "2026-04-20T20:11:05.758Z" },
+ ]
+
  [[package]]
  name = "srsly"
  version = "2.5.3"
 
  { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
  ]
 
+ [[package]]
+ name = "tomlkit"
+ version = "0.14.0"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/c3/af/14b24e41977adb296d6bd1fb59402cf7d60ce364f90c890bd2ec65c43b5a/tomlkit-0.14.0.tar.gz", hash = "sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064", size = 187167, upload-time = "2026-01-13T01:14:53.304Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/b5/11/87d6d29fb5d237229d67973a6c9e06e048f01cf4994dee194ab0ea841814/tomlkit-0.14.0-py3-none-any.whl", hash = "sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680", size = 39310, upload-time = "2026-01-13T01:14:51.965Z" },
+ ]
+
  [[package]]
  name = "torch"
  version = "2.11.0"