[KM-438][KM-439] Improve Retrieval and Querying feature
#15
by rhbt6767
- .dockerignore +6 -0
- .gitattributes +1 -0
- .gitignore +13 -1
- .vscode/launch.json +25 -0
- Dockerfile +2 -0
- README.md +2 -0
- main.py +2 -0
- pyproject.toml +19 -10
- src/agents/chatbot.py +11 -1
- src/agents/orchestration.py +5 -0
- src/api/v1/chat.py +80 -17
- src/api/v1/db_client.py +471 -3
- src/api/v1/document.py +43 -128
- src/config/agents/system_prompt.md +1 -2
- src/config/settings.py +5 -0
- src/database_client/database_client_service.py +164 -0
- src/db/postgres/init_db.py +43 -1
- src/db/postgres/models.py +16 -0
- src/document/document_service.py +17 -1
- src/knowledge/parquet_service.py +77 -0
- src/knowledge/processing_service.py +145 -56
- src/models/credentials.py +164 -0
- src/models/sql_query.py +8 -0
- src/models/structured_output.py +4 -0
- src/pipeline/db_pipeline/__init__.py +3 -0
- src/pipeline/db_pipeline/db_pipeline_service.py +302 -0
- src/pipeline/db_pipeline/extractor.py +283 -0
- src/pipeline/document_pipeline/__init__.py +0 -0
- src/pipeline/document_pipeline/document_pipeline.py +94 -0
- src/query/__init__.py +0 -0
- src/query/base.py +32 -0
- src/query/executors/__init__.py +0 -0
- src/query/executors/db_executor.py +648 -0
- src/query/executors/tabular.py +287 -0
- src/query/query_executor.py +42 -0
- src/rag/base.py +20 -0
- src/rag/retriever.py +24 -48
- src/rag/retrievers/__init__.py +0 -0
- src/rag/retrievers/baseline.py +76 -0
- src/rag/retrievers/document.py +158 -0
- src/rag/retrievers/schema.py +411 -0
- src/rag/router.py +179 -0
- src/storage/az_blob/az_blob.py +34 -0
- src/tools/search.py +3 -3
- src/utils/db_credential_encryption.py +70 -0
- uv.lock +440 -10
.dockerignore
ADDED
@@ -0,0 +1,6 @@
+.venv
+software/
+__pycache__
+*.py[oc]
+.env
+.env.*
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+software/** filter=lfs diff=lfs merge=lfs -text
.gitignore
CHANGED
@@ -26,6 +26,10 @@ test/users/user_accounts.csv
 .env.prd
 .env.example
 
+CLAUDE.md
+
+/experiments
+src/rag/experiments/
 erd/
 playground/
 playground_retriever.py
@@ -33,4 +37,12 @@ playground_chat.py
 playground_flush_cache.py
 playground_create_user.py
 API_CONTRACT.md
-context_engineering/
+context_engineering/
+sample_file/
+test_tesseract.py
+
+# Windows binaries — installed via apt in Docker instead
+software/
+
+tests/
+.claude/
.vscode/launch.json
ADDED
@@ -0,0 +1,25 @@
+{
+    // Use IntelliSense to learn about possible attributes.
+    // Hover to view descriptions of existing attributes.
+    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+    "version": "0.2.0",
+    "configurations": [
+        {
+            "name": "DataEyond: FastAPI (debug)",
+            "type": "debugpy",
+            "request": "launch",
+            "module": "uvicorn",
+            "args": [
+                "main:app",
+                "--host", "0.0.0.0",
+                "--port", "7860",
+                "--reload"
+            ],
+            "jinja": true,
+            "justMyCode": true,
+            "envFile": "${workspaceFolder}/.env",
+            "console": "integratedTerminal",
+            "cwd": "${workspaceFolder}"
+        }
+    ]
+}
Dockerfile
CHANGED
@@ -12,6 +12,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     libpq-dev \
     gcc \
     libgomp1 \
+    tesseract-ocr \
+    poppler-utils \
     && rm -rf /var/lib/apt/lists/*
 
 RUN addgroup --system app && \
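Note: the two new apt packages back the OCR dependencies added in pyproject.toml below. poppler-utils supplies the pdftoppm binary that pdf2image shells out to, and tesseract-ocr supplies the engine that pytesseract wraps. A minimal sketch of that pairing (this is not the PR's processing_service code, which this view does not show in full):

```python
# Sketch only: assumes the tesseract-ocr and poppler-utils packages
# installed by the Dockerfile change above are on PATH.
from pdf2image import convert_from_bytes  # rasterizes PDF pages via poppler
import pytesseract                        # wraps the tesseract CLI

def ocr_pdf(pdf_bytes: bytes) -> str:
    """Rasterize each page, OCR it, and join the page texts."""
    pages = convert_from_bytes(pdf_bytes, dpi=300)
    return "\n".join(pytesseract.image_to_string(page) for page in pages)
```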
README.md
CHANGED
@@ -11,6 +11,8 @@ short_description: AI Agent core service
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
+# Agentic Service Data Eyond
+
 How to run:
 `uv run --no-sync uvicorn main:app --host 0.0.0.0 --port 7860`
 
main.py
CHANGED
@@ -10,6 +10,7 @@ from src.api.v1.chat import router as chat_router
 from src.api.v1.room import router as room_router
 from src.api.v1.users import router as users_router
 from src.api.v1.knowledge import router as knowledge_router
+from src.api.v1.db_client import router as db_client_router
 from src.db.postgres.init_db import init_db
 import uvicorn
 
@@ -35,6 +36,7 @@ app.include_router(document_router)
 app.include_router(knowledge_router)
 app.include_router(room_router)
 app.include_router(chat_router)
+app.include_router(db_client_router)
 
 
 @app.on_event("startup")
pyproject.toml
CHANGED
@@ -79,6 +79,18 @@ dependencies = [
     "jsonpatch>=1.33",
     "pymongo>=4.14.0",
     "psycopg2>=2.9.11",
+    # --- SQL parsing / guardrails ---
+    "sqlglot>=25.0.0",
+    # --- User-DB connectors (db_pipeline) ---
+    "pymysql>=1.1.1",
+    "pymssql>=2.3.0",
+    "sqlalchemy-bigquery>=1.11.0",
+    "snowflake-sqlalchemy>=1.7.0",
+    # --- OCR (pdf processing) ---
+    "pdf2image>=1.17.0",
+    "pytesseract>=0.3.13",
+    "pypdf2>=3.0.1",
+    "pyarrow>=24.0.0",
 ]
 
 [project.optional-dependencies]
@@ -92,16 +104,6 @@ dev = [
     "pre-commit==4.0.1",
 ]
 
-[tool.uv]
-dev-dependencies = [
-    "pytest==8.3.4",
-    "pytest-asyncio==0.24.0",
-    "pytest-cov==6.0.0",
-    "ruff==0.8.4",
-    "mypy==1.13.0",
-    "pre-commit==4.0.1",
-]
-
 [tool.hatch.build.targets.wheel]
 packages = ["src/agent_service"]
 
@@ -133,3 +135,10 @@ testpaths = ["tests"]
 filterwarnings = [
     "ignore::DeprecationWarning",
 ]
+
+[dependency-groups]
+dev = [
+    "pytest>=8.3.4",
+    "pytest-asyncio>=0.24.0",
+    "ruff>=0.8.4",
+]
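Note: the "SQL parsing / guardrails" comment explains why sqlglot enters the dependency list; the actual checks live in src/query/executors/db_executor.py (+648 lines, not reproduced in this view). A minimal sketch of the kind of read-only guard sqlglot makes possible (the SELECT-only policy here is an assumption, not a statement of what the PR enforces):

```python
# Sketch only, not the PR's db_executor logic.
import sqlglot
from sqlglot import exp

def assert_read_only(sql: str, dialect: str = "postgres") -> None:
    """Raise if the statement parses as anything other than a SELECT."""
    tree = sqlglot.parse_one(sql, read=dialect)
    if not isinstance(tree, exp.Select):
        raise ValueError(f"only SELECT is allowed, got {type(tree).__name__}")
```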
src/agents/chatbot.py
CHANGED
@@ -1,5 +1,6 @@
 """Chatbot agent with RAG capabilities."""
 
+import tiktoken
 from langchain_openai import AzureChatOpenAI
 from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
 from langchain_core.output_parsers import StrOutputParser
@@ -9,6 +10,14 @@ from langchain_core.messages import HumanMessage, AIMessage
 
 logger = get_logger("chatbot")
 
+_enc = tiktoken.get_encoding("cl100k_base")
+
+
+def _count_tokens(messages: list, context: str) -> dict:
+    msg_tokens = sum(len(_enc.encode(m.content)) for m in messages)
+    ctx_tokens = len(_enc.encode(context))
+    return {"messages_tokens": msg_tokens, "context_tokens": ctx_tokens, "total": msg_tokens + ctx_tokens}
+
 
 class ChatbotAgent:
     """Chatbot agent with RAG capabilities."""
@@ -64,7 +73,8 @@ class ChatbotAgent:
     async def astream_response(self, messages: list, context: str = ""):
         """Stream response tokens as they are generated."""
        try:
-
+            token_counts = _count_tokens(messages, context)
+            logger.info("LLM input tokens", **token_counts)
             async for token in self.chain.astream({"messages": messages, "context": context}):
                 yield token
         except Exception as e:
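Note: the _count_tokens helper above is self-contained and can be exercised directly; the counts below are illustrative, since exact values depend on the cl100k_base encoding:

```python
# Standalone check of the _count_tokens helper added in this diff.
from langchain_core.messages import HumanMessage

counts = _count_tokens([HumanMessage(content="Summarize my Q3 report")], context="Revenue grew 12%.")
print(counts)  # e.g. {'messages_tokens': 5, 'context_tokens': 5, 'total': 10}
```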
src/agents/orchestration.py
CHANGED
@@ -35,6 +35,11 @@ Intent Routing:
 - greeting -> needs_search=False, direct_response="Hello! How can I assist you today?"
 - goodbye -> needs_search=False, direct_response="Goodbye! Have a great day!"
 - other -> needs_search=True, search_query=<standalone rewritten query>
+
+Source Routing (set source_hint):
+- Columns, tables, sheets, data types, schema, row counts, statistics -> source_hint=schema
+- Document content, paragraphs, reports, articles, text -> source_hint=document
+- Unclear or spans both -> source_hint=both
 """),
     MessagesPlaceholder(variable_name="history"),
     ("user", "{message}")
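Note: the prompt now asks the orchestrator for a source_hint, and chat.py below reads it with intent_result.get("source_hint", "both"). The matching model change sits in src/models/structured_output.py (+4 lines, not shown in this view); judging from the prompt and that usage, the field is roughly:

```python
# Hypothetical sketch; the real structured_output.py diff is not visible here.
from typing import Literal, Optional
from pydantic import BaseModel

class IntentResult(BaseModel):
    needs_search: bool
    direct_response: Optional[str] = None
    search_query: Optional[str] = None
    source_hint: Literal["schema", "document", "both"] = "both"
```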
src/api/v1/chat.py
CHANGED
@@ -9,6 +9,9 @@ from src.db.postgres.models import ChatMessage, MessageSource
 from src.agents.orchestration import orchestrator
 from src.agents.chatbot import chatbot
 from src.rag.retriever import retriever
+from src.rag.base import RetrievalResult
+from src.query.query_executor import query_executor
+from src.query.base import QueryResult
 from src.db.redis.connection import get_redis
 from src.config.settings import settings
 from src.middlewares.logging import get_logger, log_execution
@@ -45,34 +48,66 @@ class ChatRequest(BaseModel):
     message: str
 
 
-def _format_context(results: List[
+def _format_context(results: List[RetrievalResult]) -> str:
     """Format retrieval results as context string for the LLM."""
     lines = []
     for result in results:
-
-
+        data = result.metadata.get("data", {})
+        filename = data.get("filename", "Unknown")
+        page = data.get("page_label")
         source_label = f"{filename}, p.{page}" if page else filename
-        lines.append(f"[Source: {source_label}]\n{result
+        lines.append(f"[Source: {source_label}]\n{result.content}\n")
     return "\n".join(lines)
 
 
-def _extract_sources(results: List[
+def _extract_sources(results: List[RetrievalResult]) -> List[Dict[str, Any]]:
     """Extract deduplicated source references from retrieval results."""
     seen = set()
     sources = []
     for result in results:
-        meta = result
-
-        if
-
-
-
-
-
+        meta = result.metadata
+        data = meta.get("data", {})
+        if "document_id" in data:
+            key = (data.get("document_id"), data.get("page_label"))
+            if key not in seen:
+                seen.add(key)
+                sources.append({
+                    "document_id": data.get("document_id"),
+                    "filename": data.get("filename", "Unknown"),
+                    "page_label": data.get("page_label", "Unknown"),
+                })
+        else:
+            key = (data.get("table_name"), data.get("column_name"))
+            if key not in seen:
+                seen.add(key)
+                table_name = data.get("table_name")
+                user_id = meta.get("user_id")
+                sources.append({
+                    "document_id": f"{user_id}_{table_name}",
+                    "filename": data.get("table_name", "Unknown"),
+                    "page_label": data.get("column_name", "Unknown"),
+                })
+
+    logger.debug(f"Extracted sources: {sources}")
     return sources
 
 
+def _format_query_results(results: list[QueryResult]) -> str:
+    if not results:
+        return ""
+    lines = []
+    for r in results:
+        name = r.metadata.get("client_name", r.source_id)
+        lines.append(f"[Query result — {name}, tables: {r.table_or_file}]")
+        lines.append(f"SQL: {r.metadata.get('sql', '')}")
+        if r.columns and r.rows:
+            lines.append(" | ".join(r.columns))
+            for row in r.rows[:20]:
+                lines.append(" | ".join(str(row.get(c, "")) for c in r.columns))
+        lines.append(f"({r.row_count} rows total)\n")
+    return "\n".join(lines)
+
+
 async def get_cached_response(redis, cache_key: str) -> Optional[str]:
     cached = await redis.get(cache_key)
     if cached:
@@ -155,9 +190,12 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
     sources: List[Dict[str, Any]] = []
 
     if intent_result is None:
-        # Step 2: Launch retrieval and history loading in parallel, then run orchestrator
+        # Step 2: Launch retrieval and history loading in parallel, then run orchestrator.
+        # k=5
+        # tables — db_executor's FK expansion is one-hop and cannot bridge
+        # 2-hop gaps (e.g. customers -> order_items -> products) on its own.
         retrieval_task = asyncio.create_task(
-            retriever.retrieve(request.message, request.user_id, db)
+            retriever.retrieve(request.message, request.user_id, db, k=5)
         )
         history_task = asyncio.create_task(
             load_history(db, request.room_id, limit=6)  # 6 msgs (3 pairs) for orchestrator
@@ -165,18 +203,28 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
         history = await history_task  # fast DB query (<100ms), done before orchestrator finishes
         intent_result = await orchestrator.analyze_message(request.message, history)
 
+        search_query = intent_result.get("search_query", request.message) or request.message
         if not intent_result.get("needs_search"):
             retrieval_task.cancel()
+            try:
+                await retrieval_task
+            except asyncio.CancelledError:
+                pass
             raw_results = []
         else:
-            search_query = intent_result.get("search_query", request.message)
             logger.info(f"Searching for: {search_query}")
             if search_query != request.message:
                 retrieval_task.cancel()
+                try:
+                    await retrieval_task
+                except asyncio.CancelledError:
+                    pass
                 raw_results = await retriever.retrieve(
                     query=search_query,
                     user_id=request.user_id,
                     db=db,
+                    k=5,
+                    source_hint=intent_result.get("source_hint", "both"),
                 )
             else:
                 raw_results = await retrieval_task
@@ -184,6 +232,21 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
         context = _format_context(raw_results)
         sources = _extract_sources(raw_results)
 
+        source_hint = intent_result.get("source_hint", "both")
+        if source_hint in ("schema", "both"):
+            # Use search_query (orchestrator's standalone rewrite) so follow-up
+            # messages like "dive deeper" or "show me last year" resolve correctly.
+            # For first-turn questions search_query == request.message, so no change.
+            query_results = await query_executor.execute(
+                results=raw_results,
+                user_id=request.user_id,
+                db=db,
+                question=search_query,
+            )
+            query_context = _format_query_results(query_results)
+            if query_context:
+                context = query_context + "\n\n" + context
+
     # Step 3: Direct response for greetings / non-document intents
     if intent_result.get("direct_response"):
         response = intent_result["direct_response"]
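Note: the new try/await/except blocks around retrieval_task.cancel() follow the standard asyncio idiom: cancel() only requests cancellation, and awaiting the task afterwards lets it actually unwind instead of being abandoned mid-flight. A minimal illustration of the pattern (generic Python, not PR code):

```python
import asyncio

async def main():
    task = asyncio.create_task(asyncio.sleep(60))
    task.cancel()          # only *requests* cancellation
    try:
        await task         # lets the task unwind; raises CancelledError here
    except asyncio.CancelledError:
        pass
    assert task.cancelled()

asyncio.run(main())
```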
src/api/v1/db_client.py
CHANGED
@@ -1,5 +1,473 @@
-
 
 
-
-
+"""API endpoints for user-registered database connections.
+
+Credential schemas (DbType, PostgresCredentials, etc.) live in
+`src/models/credentials.py` — they are imported below (with noqa: F401) so
+FastAPI/Swagger picks them up for OpenAPI schema generation even though they
+are not referenced by name in this file.
+"""
+
+from typing import Any, Dict, List, Literal, Optional
+from datetime import datetime
+
+from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
+from pydantic import BaseModel, Field
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from src.database_client.database_client_service import database_client_service
+from src.db.postgres.connection import get_db
+from src.middlewares.logging import get_logger, log_execution
+from src.middlewares.rate_limit import limiter
+from src.models.credentials import (  # noqa: F401 — re-exported for Swagger schema discovery
+    BigQueryCredentials,
+    CredentialSchemas,
+    DbType,
+    MysqlCredentials,
+    PostgresCredentials,
+    SnowflakeCredentials,
+    SqlServerCredentials,
+    SupabaseCredentials,
+)
+from src.pipeline.db_pipeline import db_pipeline_service
+from src.utils.db_credential_encryption import decrypt_credentials_dict
+
+logger = get_logger("database_client_api")
+
+router = APIRouter(prefix="/api/v1", tags=["Database Clients"])
+
+
+# ---------------------------------------------------------------------------
+# Request / Response schemas
+# ---------------------------------------------------------------------------
+
+
+class DatabaseClientCreate(BaseModel):
+    """
+    Payload to register a new external database connection.
+
+    The `credentials` object shape depends on `db_type`:
+
+    | db_type     | Required fields                                          |
+    |-------------|----------------------------------------------------------|
+    | postgres    | host, port, database, username, password, ssl_mode       |
+    | mysql       | host, port, database, username, password, ssl            |
+    | sqlserver   | host, port, database, username, password, driver?        |
+    | supabase    | host, port, database, username, password, ssl_mode       |
+    | bigquery    | project_id, dataset_id, location?, service_account_json  |
+    | snowflake   | account, warehouse, database, schema?, username, password, role? |
+
+    Sensitive fields (`password`, `service_account_json`) are encrypted
+    at rest using Fernet symmetric encryption.
+    """
+
+    name: str = Field(..., description="Display name for this connection.", examples=["Production DB"])
+    db_type: DbType = Field(..., description="Type of the database engine.", examples=["postgres"])
+    credentials: Dict[str, Any] = Field(
+        ...,
+        description="Connection credentials. Shape depends on db_type. See schema descriptions above.",
+        examples=[
+            {
+                "host": "db.example.com",
+                "port": 5432,
+                "database": "mydb",
+                "username": "admin",
+                "password": "s3cr3t!",
+                "ssl_mode": "require",
+            }
+        ],
+    )
+
+
+class DatabaseClientUpdate(BaseModel):
+    """
+    Payload to update an existing database connection.
+
+    All fields are optional — only provided fields will be updated.
+    If `credentials` is provided, it replaces the entire credentials object
+    and sensitive fields are re-encrypted.
+    """
+
+    name: Optional[str] = Field(None, description="New display name for this connection.", examples=["Staging DB"])
+    credentials: Optional[Dict[str, Any]] = Field(
+        None,
+        description="Updated credentials object. Replaces existing credentials entirely if provided.",
+        examples=[{"host": "new-host.example.com", "port": 5432, "database": "mydb", "username": "admin", "password": "n3wP@ss!", "ssl_mode": "require"}],
+    )
+    status: Optional[Literal["active", "inactive"]] = Field(
+        None,
+        description="Set to 'inactive' to soft-disable the connection without deleting it.",
+        examples=["inactive"],
+    )
+
+
+class DatabaseClientResponse(BaseModel):
+    """
+    Database connection record returned by the API.
+
+    Credentials are **never** included in the response for security reasons.
+    """
+
+    id: str = Field(..., description="Unique identifier of the database connection.")
+    user_id: str = Field(..., description="ID of the user who owns this connection.")
+    name: str = Field(..., description="Display name of the connection.")
+    db_type: str = Field(..., description="Database engine type.")
+    status: str = Field(..., description="Connection status: 'active' or 'inactive'.")
+    created_at: datetime = Field(..., description="Timestamp when the connection was registered.")
+    updated_at: Optional[datetime] = Field(None, description="Timestamp of the last update, if any.")
+
+    model_config = {"from_attributes": True}
+
+
+# ---------------------------------------------------------------------------
+# Supported DB types registry
+# ---------------------------------------------------------------------------
+
+_DB_TYPES: List[Dict[str, Any]] = [
+    {
+        "db_type": "postgres",
+        "display_name": "PostgreSQL",
+        "logo": "postgres",
+        "status": "active",
+        "message": None,
+        "fields": [
+            {"name": "host", "type": "string", "required": True, "default": None, "description": "Hostname or IP address"},
+            {"name": "port", "type": "integer", "required": False, "default": 5432, "description": "Port number"},
+            {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
+            {"name": "username", "type": "string", "required": True, "default": None, "description": "Database username"},
+            {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
+            {"name": "ssl_mode", "type": "select", "required": False, "default": "require", "description": "SSL mode", "options": ["disable", "require", "verify-ca", "verify-full"]},
+        ],
+    },
+    {
+        "db_type": "mysql",
+        "display_name": "MySQL",
+        "logo": "mysql",
+        "status": "active",
+        "message": None,
+        "fields": [
+            {"name": "host", "type": "string", "required": True, "default": None, "description": "Hostname or IP address"},
+            {"name": "port", "type": "integer", "required": False, "default": 3306, "description": "Port number"},
+            {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
+            {"name": "username", "type": "string", "required": True, "default": None, "description": "Database username"},
+            {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
+            {"name": "ssl", "type": "boolean", "required": False, "default": True, "description": "Enable SSL"},
+        ],
+    },
+    {
+        "db_type": "supabase",
+        "display_name": "Supabase",
+        "logo": "supabase",
+        "status": "active",
+        "message": None,
+        "fields": [
+            {"name": "host", "type": "string", "required": True, "default": None, "description": "Supabase database host"},
+            {"name": "port", "type": "integer", "required": False, "default": 5432, "description": "Port number (5432 direct, 6543 pooler)"},
+            {"name": "database", "type": "string", "required": False, "default": "postgres", "description": "Database name"},
+            {"name": "username", "type": "string", "required": True, "default": None, "description": "Database user"},
+            {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
+            {"name": "ssl_mode", "type": "select", "required": False, "default": "require", "description": "SSL mode", "options": ["require", "verify-ca", "verify-full"]},
+        ],
+    },
+    {
+        "db_type": "sqlserver",
+        "display_name": "SQL Server",
+        "logo": "sqlserver",
+        "status": "inactive",
+        "message": "Coming soon",
+        "fields": [
+            {"name": "host", "type": "string", "required": True, "default": None, "description": "Hostname or IP address"},
+            {"name": "port", "type": "integer", "required": False, "default": 1433, "description": "Port number"},
+            {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
+            {"name": "username", "type": "string", "required": True, "default": None, "description": "Database username"},
+            {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
+            {"name": "driver", "type": "string", "required": False, "default": None, "description": "ODBC driver name"},
+        ],
+    },
+    {
+        "db_type": "bigquery",
+        "display_name": "BigQuery",
+        "logo": "bigquery",
+        "status": "inactive",
+        "message": "Coming soon",
+        "fields": [
+            {"name": "project_id", "type": "string", "required": True, "default": None, "description": "GCP project ID"},
+            {"name": "dataset_id", "type": "string", "required": True, "default": None, "description": "BigQuery dataset name"},
+            {"name": "location", "type": "string", "required": False, "default": "US", "description": "Dataset location/region"},
+            {"name": "service_account_json", "type": "string", "required": True, "default": None, "description": "GCP Service Account key JSON", "sensitive": True},
+        ],
+    },
+    {
+        "db_type": "snowflake",
+        "display_name": "Snowflake",
+        "logo": "snowflake",
+        "status": "inactive",
+        "message": "Coming soon",
+        "fields": [
+            {"name": "account", "type": "string", "required": True, "default": None, "description": "Snowflake account identifier"},
+            {"name": "warehouse", "type": "string", "required": True, "default": None, "description": "Virtual warehouse name"},
+            {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
+            {"name": "schema", "type": "string", "required": False, "default": "PUBLIC", "description": "Schema name"},
+            {"name": "username", "type": "string", "required": True, "default": None, "description": "Snowflake username"},
+            {"name": "password", "type": "string", "required": True, "default": None, "description": "Snowflake password", "sensitive": True},
+            {"name": "role", "type": "string", "required": False, "default": None, "description": "Snowflake role"},
+        ],
+    },
+]
+
+
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+
+
+@router.get(
+    "/database-clients/dbtypes",
+    summary="List supported database types",
+    response_description="All database types supported by DataEyond with their connection parameters.",
+)
+async def list_db_types():
+    """
+    Return every database type DataEyond can connect to, along with the
+    credential fields the frontend should render, a logo filename, and
+    an active/inactive status with an optional message.
+    """
+    return _DB_TYPES
+
+
+@router.post(
+    "/database-clients",
+    response_model=DatabaseClientResponse,
+    status_code=status.HTTP_201_CREATED,
+    summary="Register a new database connection",
+    response_description="The newly created database connection record (credentials excluded).",
+    responses={
+        201: {"description": "Connection registered successfully."},
+        422: {"description": "Validation error — check the credentials shape for the given db_type."},
+        500: {"description": "Internal server error."},
+    },
+)
+@limiter.limit("10/minute")
+@log_execution(logger)
+async def create_database_client(
+    request: Request,
+    payload: DatabaseClientCreate,
+    user_id: str = Query(..., description="ID of the user registering the connection."),
+    db: AsyncSession = Depends(get_db),
+):
+    """
+    Register a new external database connection for a user.
+
+    The `credentials` object must match the shape for the chosen `db_type`
+    (see **CredentialSchemas** in the schema section below for exact fields).
+    Sensitive fields (`password`, `service_account_json`) are encrypted
+    before being persisted — they are never returned in any response.
+    """
+    try:
+        client = await database_client_service.create(
+            db=db,
+            user_id=user_id,
+            name=payload.name,
+            db_type=payload.db_type,
+            credentials=payload.credentials,
+        )
+        return DatabaseClientResponse.model_validate(client)
+    except Exception as e:
+        logger.error(f"Failed to create database client for user {user_id}", error=str(e))
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Failed to create database client: {str(e)}",
+        )
+
+
+@router.get(
+    "/database-clients/{user_id}",
+    response_model=List[DatabaseClientResponse],
+    summary="List all database connections for a user",
+    response_description="List of database connections (credentials excluded).",
+    responses={
+        200: {"description": "Returns an empty list if the user has no connections."},
+    },
+)
+@log_execution(logger)
+async def list_database_clients(
+    user_id: str,
+    db: AsyncSession = Depends(get_db),
+):
+    """
+    Return all database connections registered by the specified user,
+    ordered by creation date (newest first).
+
+    Credentials are never included in the response.
+    """
+    clients = await database_client_service.get_user_clients(db, user_id)
+    return [DatabaseClientResponse.model_validate(c) for c in clients]
+
+
+@router.get(
+    "/database-clients/{user_id}/{client_id}",
+    response_model=DatabaseClientResponse,
+    summary="Get a single database connection",
+    response_description="Database connection detail (credentials excluded).",
+    responses={
+        404: {"description": "Connection not found."},
+        403: {"description": "Access denied — user_id does not own this connection."},
+    },
+)
+@log_execution(logger)
+async def get_database_client(
+    user_id: str,
+    client_id: str,
+    db: AsyncSession = Depends(get_db),
+):
+    """
+    Return the detail of a single database connection.
+
+    Returns **403** if the `user_id` in the path does not match the owner
+    of the requested connection.
+    """
+    client = await database_client_service.get(db, client_id)
+
+    if not client:
+        raise HTTPException(status_code=404, detail="Database client not found")
+
+    if client.user_id != user_id:
+        raise HTTPException(status_code=403, detail="Access denied")
+
+    return DatabaseClientResponse.model_validate(client)
+
+
+@router.put(
+    "/database-clients/{client_id}",
+    response_model=DatabaseClientResponse,
+    summary="Update a database connection",
+    response_description="Updated database connection record (credentials excluded).",
+    responses={
+        404: {"description": "Connection not found."},
+        403: {"description": "Access denied — user_id does not own this connection."},
+    },
+)
+@log_execution(logger)
+async def update_database_client(
+    client_id: str,
+    payload: DatabaseClientUpdate,
+    user_id: str = Query(..., description="ID of the user who owns the connection."),
+    db: AsyncSession = Depends(get_db),
+):
+    """
+    Update an existing database connection.
+
+    Only fields present in the request body are updated.
+    If `credentials` is provided it **replaces** the entire credentials object
+    and sensitive fields are re-encrypted automatically.
+    """
+    client = await database_client_service.get(db, client_id)
+
+    if not client:
+        raise HTTPException(status_code=404, detail="Database client not found")
+
+    if client.user_id != user_id:
+        raise HTTPException(status_code=403, detail="Access denied")
+
+    updated = await database_client_service.update(
+        db=db,
+        client_id=client_id,
+        name=payload.name,
+        credentials=payload.credentials,
+        status=payload.status,
+    )
+    return DatabaseClientResponse.model_validate(updated)
+
+
+@router.delete(
+    "/database-clients/{client_id}",
+    status_code=status.HTTP_200_OK,
+    summary="Delete a database connection",
+    responses={
+        200: {"description": "Connection deleted successfully."},
+        404: {"description": "Connection not found."},
+        403: {"description": "Access denied — user_id does not own this connection."},
+    },
+)
+@log_execution(logger)
+async def delete_database_client(
+    client_id: str,
+    user_id: str = Query(..., description="ID of the user who owns the connection."),
+    db: AsyncSession = Depends(get_db),
+):
+    """
+    Permanently delete a database connection.
+
+    This action is irreversible. The stored credentials are also removed.
+    """
+    client = await database_client_service.get(db, client_id)
+
+    if not client:
+        raise HTTPException(status_code=404, detail="Database client not found")
+
+    if client.user_id != user_id:
+        raise HTTPException(status_code=403, detail="Access denied")
+
+    await database_client_service.delete(db, client_id)
+    return {"status": "success", "message": "Database client deleted successfully"}
+
+
+@router.post(
+    "/database-clients/{client_id}/ingest",
+    status_code=status.HTTP_200_OK,
+    summary="Ingest schema from a registered database into the vector store",
+    response_description="Count of chunks ingested.",
+    responses={
+        200: {"description": "Ingestion completed successfully."},
+        403: {"description": "Access denied — user_id does not own this connection."},
+        404: {"description": "Connection not found."},
+        501: {"description": "The connection's db_type is not yet supported by the pipeline."},
+        500: {"description": "Ingestion failed (connection error, profiling error, etc.)."},
+    },
+)
+@limiter.limit("5/minute")
+@log_execution(logger)
+async def ingest_database_client(
+    request: Request,
+    client_id: str,
+    user_id: str = Query(..., description="ID of the user who owns the connection."),
+    db: AsyncSession = Depends(get_db),
+):
+    """
+    Decrypt the stored credentials, connect to the user's database, introspect
+    its schema, profile each column, embed the descriptions, and store them in
+    the shared PGVector collection tagged with `source_type="database"`.
+
+    Chunks become retrievable via the same retriever used for document chunks.
+    """
+    client = await database_client_service.get(db, client_id)
+
+    if not client:
+        raise HTTPException(status_code=404, detail="Database client not found")
+
+    if client.user_id != user_id:
+        raise HTTPException(status_code=403, detail="Access denied")
+
+    if client.status != "active":
+        raise HTTPException(
+            status_code=status.HTTP_409_CONFLICT,
+            detail="Cannot ingest from an inactive database connection.",
+        )
+
+    try:
+        creds = decrypt_credentials_dict(client.credentials)
+        with db_pipeline_service.engine_scope(
+            db_type=client.db_type,
+            credentials=creds,
+        ) as engine:
+            total = await db_pipeline_service.run(user_id=user_id, client_id=client_id, engine=engine)
+    except NotImplementedError as e:
+        raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail=str(e))
+    except Exception as e:
+        logger.error(
+            f"Ingestion failed for client {client_id}", user_id=user_id, error=str(e)
+        )
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail=f"Ingestion failed: {e}",
+        )
+
+    return {"status": "success", "client_id": client_id, "chunks_ingested": total}
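Note: the docstrings above state that password and service_account_json are Fernet-encrypted at rest; the helpers themselves live in src/utils/db_credential_encryption.py (+70 lines, not shown in this view). A minimal sketch of a Fernet round trip (the helper names here are illustrative, not the PR's):

```python
# Sketch only: the real module presumably loads its key from settings;
# generate_key() keeps this example self-contained and runnable.
from cryptography.fernet import Fernet

_fernet = Fernet(Fernet.generate_key())

def encrypt_value(plaintext: str) -> str:
    return _fernet.encrypt(plaintext.encode()).decode()

def decrypt_value(token: str) -> str:
    return _fernet.decrypt(token.encode()).decode()

assert decrypt_value(encrypt_value("s3cr3t!")) == "s3cr3t!"
```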
src/api/v1/document.py
CHANGED
|
@@ -1,21 +1,20 @@
|
|
| 1 |
"""Document management API endpoints."""
|
| 2 |
-
|
| 3 |
-
from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File
|
| 4 |
from sqlalchemy.ext.asyncio import AsyncSession
|
| 5 |
from src.db.postgres.connection import get_db
|
| 6 |
from src.document.document_service import document_service
|
| 7 |
-
from src.knowledge.processing_service import knowledge_processor
|
| 8 |
-
from src.storage.az_blob.az_blob import blob_storage
|
| 9 |
from src.middlewares.logging import get_logger, log_execution
|
| 10 |
from src.middlewares.rate_limit import limiter
|
|
|
|
| 11 |
from pydantic import BaseModel
|
| 12 |
from typing import List
|
| 13 |
-
|
| 14 |
logger = get_logger("document_api")
|
| 15 |
-
|
| 16 |
router = APIRouter(prefix="/api/v1", tags=["Documents"])
|
| 17 |
-
|
| 18 |
-
|
| 19 |
class DocumentResponse(BaseModel):
|
| 20 |
id: str
|
| 21 |
filename: str
|
|
@@ -23,6 +22,27 @@ class DocumentResponse(BaseModel):
|
|
| 23 |
file_size: int
|
| 24 |
file_type: str
|
| 25 |
created_at: str
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
@router.get("/documents/{user_id}", response_model=List[DocumentResponse])
|
|
@@ -44,8 +64,8 @@ async def list_documents(
|
|
| 44 |
)
|
| 45 |
for doc in documents
|
| 46 |
]
|
| 47 |
-
|
| 48 |
-
|
| 49 |
@router.post("/document/upload")
|
| 50 |
@limiter.limit("10/minute")
|
| 51 |
@log_execution(logger)
|
|
@@ -57,57 +77,12 @@ async def upload_document(
|
|
| 57 |
):
|
| 58 |
"""Upload a document."""
|
| 59 |
if not user_id:
|
| 60 |
-
raise HTTPException(
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
# Read file content
|
| 67 |
-
content = await file.read()
|
| 68 |
-
file_size = len(content)
|
| 69 |
-
|
| 70 |
-
# Get file type
|
| 71 |
-
filename = file.filename
|
| 72 |
-
file_type = filename.split('.')[-1].lower() if '.' in filename else 'txt'
|
| 73 |
-
|
| 74 |
-
if file_type not in ['pdf', 'docx', 'txt']:
|
| 75 |
-
raise HTTPException(
|
| 76 |
-
status_code=400,
|
| 77 |
-
detail="Unsupported file type. Supported: pdf, docx, txt"
|
| 78 |
-
)
|
| 79 |
-
|
| 80 |
-
# Upload to blob storage
|
| 81 |
-
blob_name = await blob_storage.upload_file(content, filename, user_id)
|
| 82 |
-
|
| 83 |
-
# Create document record
|
| 84 |
-
document = await document_service.create_document(
|
| 85 |
-
db=db,
|
| 86 |
-
user_id=user_id,
|
| 87 |
-
filename=filename,
|
| 88 |
-
blob_name=blob_name,
|
| 89 |
-
file_size=file_size,
|
| 90 |
-
file_type=file_type
|
| 91 |
-
)
|
| 92 |
-
|
| 93 |
-
return {
|
| 94 |
-
"status": "success",
|
| 95 |
-
"message": "Document uploaded successfully",
|
| 96 |
-
"data": {
|
| 97 |
-
"id": document.id,
|
| 98 |
-
"filename": document.filename,
|
| 99 |
-
"status": document.status
|
| 100 |
-
}
|
| 101 |
-
}
|
| 102 |
-
|
| 103 |
-
except Exception as e:
|
| 104 |
-
logger.error(f"Upload failed for user {user_id}", error=str(e))
|
| 105 |
-
raise HTTPException(
|
| 106 |
-
status_code=500,
|
| 107 |
-
detail=f"Upload failed: {str(e)}"
|
| 108 |
-
)
|
| 109 |
-
|
| 110 |
-
|
| 111 |
@router.delete("/document/delete")
|
| 112 |
@log_execution(logger)
|
| 113 |
async def delete_document(
|
|
@@ -116,31 +91,10 @@ async def delete_document(
|
|
| 116 |
db: AsyncSession = Depends(get_db)
|
| 117 |
):
|
| 118 |
"""Delete a document."""
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
status_code=404,
|
| 124 |
-
detail="Document not found"
|
| 125 |
-
)
|
| 126 |
-
|
| 127 |
-
if document.user_id != user_id:
|
| 128 |
-
raise HTTPException(
|
| 129 |
-
status_code=403,
|
| 130 |
-
detail="Access denied"
|
| 131 |
-
)
|
| 132 |
-
|
| 133 |
-
success = await document_service.delete_document(db, document_id)
|
| 134 |
-
|
| 135 |
-
if success:
|
| 136 |
-
return {"status": "success", "message": "Document deleted successfully"}
|
| 137 |
-
else:
|
| 138 |
-
raise HTTPException(
|
| 139 |
-
status_code=500,
|
| 140 |
-
detail="Failed to delete document"
|
| 141 |
-
)
|
| 142 |
-
|
| 143 |
-
|
| 144 |
@router.post("/document/process")
|
| 145 |
@log_execution(logger)
|
| 146 |
async def process_document(
|
|
@@ -149,45 +103,6 @@ async def process_document(
|
|
| 149 |
db: AsyncSession = Depends(get_db)
|
| 150 |
):
|
| 151 |
"""Process document and ingest to vector index."""
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
raise HTTPException(
|
| 156 |
-
status_code=404,
|
| 157 |
-
detail="Document not found"
|
| 158 |
-
)
|
| 159 |
-
|
| 160 |
-
if document.user_id != user_id:
|
| 161 |
-
raise HTTPException(
|
| 162 |
-
status_code=403,
|
| 163 |
-
detail="Access denied"
|
| 164 |
-
)
|
| 165 |
-
|
| 166 |
-
try:
|
| 167 |
-
# Update status to processing
|
| 168 |
-
await document_service.update_document_status(db, document_id, "processing")
|
| 169 |
-
|
| 170 |
-
# Process document
|
| 171 |
-
chunks_count = await knowledge_processor.process_document(document, db)
|
| 172 |
-
|
| 173 |
-
# Update status to completed
|
| 174 |
-
await document_service.update_document_status(db, document_id, "completed")
|
| 175 |
-
|
| 176 |
-
return {
|
| 177 |
-
"status": "success",
|
| 178 |
-
"message": "Document processed successfully",
|
| 179 |
-
"data": {
|
| 180 |
-
"document_id": document_id,
|
| 181 |
-
"chunks_processed": chunks_count
|
| 182 |
-
}
|
| 183 |
-
}
|
| 184 |
-
|
| 185 |
-
except Exception as e:
|
| 186 |
-
logger.error(f"Processing failed for document {document_id}", error=str(e))
|
| 187 |
-
await document_service.update_document_status(
|
| 188 |
-
-                db, document_id, "failed", str(e)
-            )
-            raise HTTPException(
-                status_code=500,
-                detail=f"Processing failed: {str(e)}"
-            )

 """Document management API endpoints."""
+
+from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File
 from sqlalchemy.ext.asyncio import AsyncSession
 from src.db.postgres.connection import get_db
 from src.document.document_service import document_service
 from src.middlewares.logging import get_logger, log_execution
 from src.middlewares.rate_limit import limiter
+from src.pipeline.document_pipeline.document_pipeline import document_pipeline
 from pydantic import BaseModel
 from typing import List
+
 logger = get_logger("document_api")
+
 router = APIRouter(prefix="/api/v1", tags=["Documents"])
+
+
 class DocumentResponse(BaseModel):
     id: str
     filename: str
 ...
     file_size: int
     file_type: str
     created_at: str
+
+
+# NOTE: Keep in sync with SUPPORTED_FILE_TYPES in src/pipeline/document_pipeline/document_pipeline.py
+_DOC_TYPES = [
+    {"doc_type": "pdf", "max_size": 10, "status": "active", "message": None},
+    {"doc_type": "docx", "max_size": 10, "status": "active", "message": None},
+    {"doc_type": "txt", "max_size": 10, "status": "active", "message": None},
+    {"doc_type": "csv", "max_size": 10, "status": "active", "message": None},
+    {"doc_type": "xlsx", "max_size": 10, "status": "active", "message": None},
+]
+
+
+@router.get(
+    "/documents/doctypes",
+    summary="List supported document types",
+    response_description="All document types supported by DataEyond with their size limits and status.",
+)
+@log_execution(logger)
+async def get_document_types():
+    """Return every document type DataEyond can process, with max file size and active/inactive status."""
+    return {"status": "success", "data": _DOC_TYPES}


 @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
 ...
         )
         for doc in documents
     ]
+
+
 @router.post("/document/upload")
 @limiter.limit("10/minute")
 @log_execution(logger)
 ...
 ):
     """Upload a document."""
     if not user_id:
+        raise HTTPException(status_code=400, detail="user_id is required")
+
+    data = await document_pipeline.upload(file, user_id, db)
+    return {"status": "success", "message": "Document uploaded successfully", "data": data}
+
+
 @router.delete("/document/delete")
 @log_execution(logger)
 async def delete_document(
 ...
     db: AsyncSession = Depends(get_db)
 ):
     """Delete a document."""
+    await document_pipeline.delete(document_id, user_id, db)
+    return {"status": "success", "message": "Document deleted successfully"}
+
+
 @router.post("/document/process")
 @log_execution(logger)
 async def process_document(
 ...
     db: AsyncSession = Depends(get_db)
 ):
     """Process document and ingest to vector index."""
+    data = await document_pipeline.process(document_id, user_id, db)
+    return {"status": "success", "message": "Document processed successfully", "data": data}
+
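The new endpoints split ingestion into an explicit upload step and a process step. A minimal client sketch of that two-step flow, assuming `httpx` and a local dev server — the base URL, the query-parameter passing, and the `data.id` field in the upload response are illustrative, since the handler signatures are collapsed in this diff:

```python
# Hypothetical two-step client flow: upload, then trigger processing.
import httpx

BASE = "http://localhost:8000/api/v1"  # assumption: local dev server

with httpx.Client() as client:
    with open("report.pdf", "rb") as f:
        resp = client.post(
            f"{BASE}/document/upload",
            params={"user_id": "u1"},  # assumption: user_id as query param
            files={"file": ("report.pdf", f, "application/pdf")},
        )
    document_id = resp.json()["data"]["id"]  # assumption: upload returns the new id

    # Second step: OCR/profile the document and ingest into the vector index.
    client.post(
        f"{BASE}/document/process",
        params={"document_id": document_id, "user_id": "u1"},
    )
```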
src/config/agents/system_prompt.md
CHANGED

@@ -3,8 +3,7 @@ You are a helpful AI assistant with access to user's uploaded documents. Your ro
 1. Answer questions based on provided document context
 2. If no relevant information is found in documents, acknowledge this honestly
 3. Be concise and direct in your responses
-4.
-5. If user's question is unclear, ask for clarification
+4. If user's question is unclear, ask for clarification

 When document context is provided:
 - Use information from documents to answer accurately
src/config/settings.py
CHANGED

@@ -61,6 +61,11 @@ class Settings(BaseSettings):
     # Bcrypt salt (for users - existing)
     emarcal_bcrypt_salt: str = Field(alias="emarcal__bcrypt__salt", default="")

+    # DB credential encryption (Fernet key for user-registered database creds)
+    dataeyond_db_credential_key: str = Field(
+        alias="dataeyond__db__credential__key"
+    )
+

 # Singleton instance
 settings = Settings()
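The new setting has no default, so deployments must provide a Fernet key via the `dataeyond__db__credential__key` environment variable. A one-off generation sketch using the `cryptography` package:

```python
# Generate a Fernet key once and store it in the environment (.env).
from cryptography.fernet import Fernet

key = Fernet.generate_key()  # URL-safe base64-encoded 32-byte key
print(f"dataeyond__db__credential__key={key.decode()}")
```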
src/database_client/database_client_service.py
ADDED

@@ -0,0 +1,164 @@
"""Service for managing user-registered external database connections."""

import uuid
from typing import List, Optional

from sqlalchemy import delete, select
from sqlalchemy.ext.asyncio import AsyncSession

from src.db.postgres.models import DatabaseClient
from src.middlewares.logging import get_logger
from src.utils.db_credential_encryption import (
    decrypt_credentials_dict,
    encrypt_credentials_dict,
)

logger = get_logger("database_client_service")


# Fields that identify the same physical database per db_type.
_CONNECTION_IDENTITY_KEYS: dict[str, tuple[str, ...]] = {
    "postgres": ("host", "port", "database"),
    "supabase": ("host", "port", "database"),
    "mysql": ("host", "port", "database"),
    "sqlserver": ("host", "port", "database"),
    "bigquery": ("project_id", "dataset_id"),
    "snowflake": ("account", "warehouse", "database"),
}


class DatabaseClientService:
    """Service for managing user-registered external database connections."""

    async def _find_duplicate(
        self,
        db: AsyncSession,
        user_id: str,
        db_type: str,
        credentials: dict,
    ) -> Optional[DatabaseClient]:
        """Return an existing client if it points to the same physical database."""
        identity_keys = _CONNECTION_IDENTITY_KEYS.get(db_type, ())
        if not identity_keys:
            return None

        result = await db.execute(
            select(DatabaseClient).where(
                DatabaseClient.user_id == user_id,
                DatabaseClient.db_type == db_type,
            )
        )
        for existing in result.scalars().all():
            decrypted = decrypt_credentials_dict(existing.credentials)
            if all(
                decrypted.get(k) == credentials.get(k) for k in identity_keys
            ):
                return existing
        return None

    async def create(
        self,
        db: AsyncSession,
        user_id: str,
        name: str,
        db_type: str,
        credentials: dict,
    ) -> DatabaseClient:
        """Register a new database client connection.

        If a connection to the same physical database already exists for this
        user, the existing record is returned instead of creating a duplicate.
        Credentials are encrypted before being stored.
        """
        existing = await self._find_duplicate(db, user_id, db_type, credentials)
        if existing:
            logger.info(
                f"Duplicate connection detected, returning existing client {existing.id}"
            )
            return existing

        client = DatabaseClient(
            id=str(uuid.uuid4()),
            user_id=user_id,
            name=name,
            db_type=db_type,
            credentials=encrypt_credentials_dict(credentials),
            status="active",
        )
        db.add(client)
        await db.commit()
        await db.refresh(client)
        logger.info(f"Created database client {client.id} for user {user_id}")
        return client

    async def get_user_clients(
        self,
        db: AsyncSession,
        user_id: str,
    ) -> List[DatabaseClient]:
        """Return all active and inactive database clients for a user."""
        result = await db.execute(
            select(DatabaseClient)
            .where(DatabaseClient.user_id == user_id)
            .order_by(DatabaseClient.created_at.desc())
        )
        return result.scalars().all()

    async def get(
        self,
        db: AsyncSession,
        client_id: str,
    ) -> Optional[DatabaseClient]:
        """Return a single database client by its ID."""
        result = await db.execute(
            select(DatabaseClient).where(DatabaseClient.id == client_id)
        )
        return result.scalars().first()

    async def update(
        self,
        db: AsyncSession,
        client_id: str,
        name: Optional[str] = None,
        credentials: Optional[dict] = None,
        status: Optional[str] = None,
    ) -> Optional[DatabaseClient]:
        """Update an existing database client connection.

        Only non-None fields are updated.
        Credentials are re-encrypted if provided.
        """
        client = await self.get(db, client_id)
        if not client:
            return None

        if name is not None:
            client.name = name
        if credentials is not None:
            client.credentials = encrypt_credentials_dict(credentials)
        if status is not None:
            client.status = status

        await db.commit()
        await db.refresh(client)
        logger.info(f"Updated database client {client_id}")
        return client

    async def delete(
        self,
        db: AsyncSession,
        client_id: str,
    ) -> bool:
        """Permanently delete a database client connection."""
        result = await db.execute(
            delete(DatabaseClient).where(DatabaseClient.id == client_id)
        )
        await db.commit()
        deleted = result.rowcount > 0
        if deleted:
            logger.info(f"Deleted database client {client_id}")
        return deleted


database_client_service = DatabaseClientService()
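A minimal usage sketch of the service above, assuming an `AsyncSession` obtained from `get_db` (the `session` variable and credential values are illustrative):

```python
# Hypothetical caller — e.g. inside an API handler that already has a session.
from src.database_client.database_client_service import database_client_service

async def register_prod_db(session, user_id: str) -> str:
    client = await database_client_service.create(
        db=session,
        user_id=user_id,
        name="Prod DB",
        db_type="postgres",
        credentials={
            "host": "db.example.com", "port": 5432, "database": "mydb",
            "username": "admin", "password": "s3cr3t!", "ssl_mode": "require",
        },
    )
    # Calling create() again with the same host/port/database returns the
    # same record — deduplicated via _CONNECTION_IDENTITY_KEYS.
    return client.id
```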
src/db/postgres/init_db.py
CHANGED

@@ -2,7 +2,14 @@
 from sqlalchemy import text
 from src.db.postgres.connection import engine, Base
-from src.db.postgres.models import
+from src.db.postgres.models import (
+    ChatMessage,
+    DatabaseClient,
+    Document,
+    MessageSource,
+    Room,
+    User,
+)


 async def init_db():

@@ -21,3 +28,38 @@ async def init_db():
     await conn.execute(text(
         "ALTER TABLE rooms ADD COLUMN IF NOT EXISTS status VARCHAR NOT NULL DEFAULT 'active'"
     ))
+
+    # HNSW index for fast approximate vector similarity search
+    # Only created when the embedding column has explicit dimensions (HNSW requirement).
+    # atttypmod > 0 means the vector column was created with a dimension (e.g. vector(1536));
+    # atttypmod = -1 means dimensionless — HNSW would fail with "column does not have dimensions".
+    await conn.execute(text("""
+        DO $$
+        BEGIN
+            IF EXISTS (
+                SELECT FROM pg_attribute a
+                JOIN pg_class c ON c.oid = a.attrelid
+                WHERE c.relname = 'langchain_pg_embedding'
+                  AND a.attname = 'embedding'
+                  AND a.atttypmod > 0
+            ) THEN
+                CREATE INDEX IF NOT EXISTS idx_langchain_pg_embedding_hnsw
+                ON langchain_pg_embedding USING hnsw (embedding vector_cosine_ops);
+            END IF;
+        END $$
+    """))
+
+    # GIN index for FTS on schema chunks — only created if table exists
+    # (langchain_pg_embedding is created by PGVector on first use, not by create_all)
+    await conn.execute(text("""
+        DO $$
+        BEGIN
+            IF EXISTS (
+                SELECT FROM information_schema.tables
+                WHERE table_name = 'langchain_pg_embedding'
+            ) THEN
+                CREATE INDEX IF NOT EXISTS idx_langchain_pg_embedding_fts
+                ON langchain_pg_embedding USING GIN (to_tsvector('english', document));
+            END IF;
+        END $$
+    """))
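For reference, the HNSW index above accelerates pgvector's cosine-distance operator `<=>`; a minimal query sketch, with an illustrative 3-element vector standing in for a real embedding of the column's dimension:

```python
# Illustrative nearest-neighbour query served by the hnsw index above.
from sqlalchemy import text

knn_query = text("""
    SELECT document, cmetadata
    FROM langchain_pg_embedding
    ORDER BY embedding <=> CAST(:query_embedding AS vector)  -- cosine distance
    LIMIT 5
""")
# e.g. await conn.execute(knn_query, {"query_embedding": "[0.1, 0.2, 0.3]"})
```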
src/db/postgres/models.py
CHANGED

@@ -4,6 +4,7 @@ from uuid import uuid4
 from sqlalchemy import Column, String, DateTime, Text, Integer, ForeignKey
 from sqlalchemy.orm import relationship
 from sqlalchemy.sql import func
+from sqlalchemy.dialects.postgresql import JSONB
 from src.db.postgres.connection import Base


@@ -81,3 +82,18 @@ class MessageSource(Base):
     created_at = Column(DateTime(timezone=True), server_default=func.now())

     message = relationship("ChatMessage", back_populates="sources")
+
+
+class DatabaseClient(Base):
+    """User-registered external database connections."""
+    __tablename__ = "databases"
+
+    id = Column(String, primary_key=True, default=lambda: str(uuid4()))
+    user_id = Column(String, nullable=False, index=True)
+    name = Column(String, nullable=False)  # display name, e.g. "Prod DB"
+    db_type = Column(String, nullable=False)  # postgres|mysql|sqlserver|supabase|bigquery|snowflake
+    credentials = Column(JSONB, nullable=False)  # per-type JSON; sensitive fields Fernet-encrypted
+    status = Column(String, nullable=False, default="active")  # active | inactive
+    created_at = Column(DateTime(timezone=True), server_default=func.now())
+    updated_at = Column(DateTime(timezone=True), onupdate=func.now())
src/document/document_service.py
CHANGED

@@ -1,8 +1,9 @@
 """Service for managing documents."""

 from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy import select, delete
+from sqlalchemy import select, delete, text
 from src.db.postgres.models import Document
+from src.db.postgres.connection import _pgvector_engine
 from src.storage.az_blob.az_blob import blob_storage
 from src.middlewares.logging import get_logger
 from typing import List, Optional

@@ -77,6 +78,21 @@ class DocumentService:
         # Delete from blob storage
         await blob_storage.delete_file(document.blob_name)

+        # Delete vector embeddings from pgvector (scoped to user + collection to avoid cross-user over-delete)
+        async with _pgvector_engine.begin() as conn:
+            await conn.execute(
+                text("""
+                    DELETE FROM langchain_pg_embedding
+                    WHERE cmetadata->>'user_id' = :user_id
+                      AND cmetadata->>'source_type' = 'document'
+                      AND cmetadata->'data'->>'document_id' = :doc_id
+                      AND collection_id = (
+                          SELECT uuid FROM langchain_pg_collection WHERE name = 'document_embeddings'
+                      )
+                """),
+                {"user_id": document.user_id, "doc_id": document_id},
+            )
+
         # Delete from database
         await db.execute(
             delete(Document).where(Document.id == document_id)
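The delete is scoped by three JSONB filters on `cmetadata`. For context, this is the metadata shape each document embedding row carries, matching what the knowledge processing service writes later in this diff (the ID values here are illustrative):

```python
# Shape of cmetadata on a document embedding row.
cmetadata = {
    "user_id": "user-123",          # scoping key: cmetadata->>'user_id'
    "source_type": "document",      # distinguishes docs from database schema chunks
    "updated_at": "2024-01-01T00:00:00+07:00",
    "data": {
        "document_id": "doc-456",   # matched by cmetadata->'data'->>'document_id'
        "filename": "report.pdf",
        "file_type": "pdf",
        "chunk_index": 0,
    },
}
```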
src/knowledge/parquet_service.py
ADDED

@@ -0,0 +1,77 @@
"""Parquet service — converts, uploads, downloads, and deletes Parquet files for CSV/XLSX.

Parquet files are stored in Azure Blob alongside the original document using
a deterministic naming convention based on document_id:

    CSV:        {user_id}/{document_id}.parquet
    XLSX sheet: {user_id}/{document_id}__{safe_sheet_name}.parquet

This allows tabular.py to construct the correct blob name at retrieval time
without needing to store it separately, and allows document_pipeline.py to
delete all Parquet files for a document using a prefix delete.
"""

import io

import pandas as pd

from src.middlewares.logging import get_logger
from src.storage.az_blob.az_blob import blob_storage

logger = get_logger("parquet_service")


def _safe_sheet_name(sheet_name: str) -> str:
    return sheet_name.replace("/", "_").replace(" ", "_").replace("\\", "_")


def parquet_blob_name(user_id: str, document_id: str, sheet_name: str | None = None) -> str:
    """Construct deterministic Parquet blob name."""
    if sheet_name:
        return f"{user_id}/{document_id}__{_safe_sheet_name(sheet_name)}.parquet"
    return f"{user_id}/{document_id}.parquet"


def _to_parquet_bytes(df: pd.DataFrame) -> bytes:
    buf = io.BytesIO()
    df.to_parquet(buf, index=False)
    return buf.getvalue()


async def upload_parquet(
    df: pd.DataFrame,
    user_id: str,
    document_id: str,
    sheet_name: str | None = None,
) -> str:
    """Convert DataFrame to Parquet and upload to Azure Blob. Returns blob_name."""
    blob_name = parquet_blob_name(user_id, document_id, sheet_name)
    parquet_bytes = _to_parquet_bytes(df)
    await blob_storage.upload_bytes(parquet_bytes, blob_name)
    logger.info(f"Uploaded Parquet {blob_name} ({len(parquet_bytes)} bytes)")
    return blob_name


async def download_parquet(
    user_id: str,
    document_id: str,
    sheet_name: str | None = None,
) -> pd.DataFrame:
    """Download Parquet from Azure Blob and return as DataFrame."""
    blob_name = parquet_blob_name(user_id, document_id, sheet_name)
    content = await blob_storage.download_file(blob_name)
    df = pd.read_parquet(io.BytesIO(content))
    logger.info(f"Downloaded Parquet {blob_name}: {len(df)} rows, {len(df.columns)} columns")
    return df


async def delete_document_parquets(user_id: str, document_id: str) -> int:
    """Delete all Parquet files for a document (CSV = 1 file, XLSX = one per sheet).

    Uses prefix delete: {user_id}/{document_id} matches all Parquet variants
    for this document without touching the original blob (which uses a random UUID name).
    """
    prefix = f"{user_id}/{document_id}"
    deleted = await blob_storage.delete_blobs_with_prefix(prefix)
    logger.info(f"Deleted {deleted} Parquet file(s) for document {document_id}")
    return deleted
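A round-trip sketch of the deterministic naming in action, using the functions defined above (the user/document IDs and frame contents are illustrative; running it requires configured Azure Blob credentials):

```python
# Persist a sheet at ingestion time, read it back at query time.
import asyncio
import pandas as pd

from src.knowledge.parquet_service import upload_parquet, download_parquet

async def demo():
    df = pd.DataFrame({"region": ["APAC", "EMEA"], "revenue": [120, 95]})
    # XLSX sheet "Q1 Sales" -> blob name u1/d1__Q1_Sales.parquet
    await upload_parquet(df, user_id="u1", document_id="d1", sheet_name="Q1 Sales")
    same_df = await download_parquet("u1", "d1", sheet_name="Q1 Sales")
    assert same_df.equals(df)

# asyncio.run(demo())
```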
src/knowledge/processing_service.py
CHANGED

@@ -5,16 +5,20 @@ from langchain_core.documents import Document as LangChainDocument
 from src.db.postgres.vector_store import get_vector_store
 from src.storage.az_blob.az_blob import blob_storage
 from src.db.postgres.models import Document as DBDocument
-from src.config.settings import settings
 from sqlalchemy.ext.asyncio import AsyncSession
 from src.middlewares.logging import get_logger
-from
-from azure.core.credentials import AzureKeyCredential
+from src.knowledge.parquet_service import upload_parquet
 from typing import List
-import
+from datetime import datetime, timezone, timedelta
+import sys
 import docx
+import pandas as pd
+import pytesseract
+from pdf2image import convert_from_bytes
 from io import BytesIO

+_JAKARTA_TZ = timezone(timedelta(hours=7))
+
 logger = get_logger("knowledge_processing")


@@ -40,6 +44,10 @@ class KnowledgeProcessingService:

         if db_doc.file_type == "pdf":
             documents = await self._build_pdf_documents(content, db_doc)
+        elif db_doc.file_type == "csv":
+            documents = await self._build_csv_documents(content, db_doc)
+        elif db_doc.file_type == "xlsx":
+            documents = await self._build_excel_documents(content, db_doc)
         else:
             text = self._extract_text(content, db_doc.file_type)
             if not text.strip():

@@ -49,10 +57,15 @@ class KnowledgeProcessingService:
                 LangChainDocument(
                     page_content=chunk,
                     metadata={
-                        "document_id": db_doc.id,
                         "user_id": db_doc.user_id,
-                        "
-                        "
+                        "source_type": "document",
+                        "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
+                        "data": {
+                            "document_id": db_doc.id,
+                            "filename": db_doc.filename,
+                            "file_type": db_doc.file_type,
+                            "chunk_index": i,
+                        },
                     }
                 )
                 for i, chunk in enumerate(chunks)

@@ -74,62 +87,138 @@ class KnowledgeProcessingService:
     async def _build_pdf_documents(
         self, content: bytes, db_doc: DBDocument
     ) -> List[LangChainDocument]:
-        """Build LangChain documents from PDF with page_label metadata.
-
-        Uses Azure Document Intelligence (per-page) when credentials are present,
-        falls back to pypdf (also per-page) otherwise.
-        """
+        """Build LangChain documents from PDF with page_label metadata using Tesseract OCR."""
         documents: List[LangChainDocument] = []

-        ...
-                documents.append(LangChainDocument(
-                    page_content=chunk,
-                    metadata={
-                        "document_id": db_doc.id,
-                        "user_id": db_doc.user_id,
-                        "filename": db_doc.filename,
-                        "chunk_index": len(documents),
-                        "page_label": page.page_number,
-                    }
-                ))
-        else:
-            logger.warning("Azure DI not configured, using pypdf")
-            pdf_reader = pypdf.PdfReader(BytesIO(content))
-            for page_num, page in enumerate(pdf_reader.pages, start=1):
-                page_text = page.extract_text() or ""
-                if not page_text.strip():
-                    continue
-                for chunk in self.text_splitter.split_text(page_text):
-                    documents.append(LangChainDocument(
-                        page_content=chunk,
-                        metadata={
-                            "document_id": db_doc.id,
-                            "user_id": db_doc.user_id,
-                            "filename": db_doc.filename,
-                            "chunk_index": len(documents),
-                            "page_label": page_num,
-                        }
-                    ))
+        poppler_path = None
+        if sys.platform == "win32":
+            pytesseract.pytesseract.tesseract_cmd = r"./software/Tesseract-OCR/tesseract.exe"
+            poppler_path = "./software/poppler-24.08.0/Library/bin"
+
+        images = convert_from_bytes(content, poppler_path=poppler_path)
+        logger.info(f"Tesseract OCR: converting {len(images)} pages")
+
+        for page_num, image in enumerate(images, start=1):
+            page_text = pytesseract.image_to_string(image)
+            if not page_text.strip():
+                continue
+            for chunk in self.text_splitter.split_text(page_text):
+                documents.append(LangChainDocument(
+                    page_content=chunk,
+                    metadata={
+                        "user_id": db_doc.user_id,
+                        "source_type": "document",
+                        "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
+                        "data": {
+                            "document_id": db_doc.id,
+                            "filename": db_doc.filename,
+                            "file_type": db_doc.file_type,
+                            "chunk_index": len(documents),
+                            "page_label": page_num,
+                        },
+                    }
+                ))
+
+        return documents
+
+    def _profile_dataframe(
+        self, df: pd.DataFrame, source_name: str, db_doc: DBDocument
+    ) -> List[LangChainDocument]:
+        """Profile each column of a dataframe → one chunk per column."""
+        documents = []
+        row_count = len(df)
+
+        for col_name in df.columns:
+            col = df[col_name]
+            is_numeric = pd.api.types.is_numeric_dtype(col)
+            null_count = int(col.isnull().sum())
+            distinct_count = int(col.nunique())
+            distinct_ratio = distinct_count / row_count if row_count > 0 else 0
+
+            text = f"Source: {source_name} ({row_count} rows)\n"
+            text += f"Column: {col_name} ({col.dtype})\n"
+            text += f"Null count: {null_count}\n"
+            text += f"Distinct count: {distinct_count} ({distinct_ratio:.1%})\n"
+
+            if is_numeric:
+                text += f"Min: {col.min()}, Max: {col.max()}\n"
+                text += f"Mean: {col.mean():.4f}, Median: {col.median():.4f}\n"
+
+            if 0 < distinct_ratio <= 0.05:
+                top_values = col.value_counts().head(10)
+                top_str = ", ".join(f"{v} ({c})" for v, c in top_values.items())
+                text += f"Top values: {top_str}\n"
+
+            text += f"Sample values: {col.dropna().head(5).tolist()}"
+
+            documents.append(LangChainDocument(
+                page_content=text,
+                metadata={
+                    "user_id": db_doc.user_id,
+                    "source_type": "document",
+                    "chunk_level": "column",
+                    "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
+                    "data": {
+                        "document_id": db_doc.id,
+                        "filename": db_doc.filename,
+                        "file_type": db_doc.file_type,
+                        "source": source_name,
+                        "column_name": col_name,
+                        "column_type": str(col.dtype),
+                    }
+                }
+            ))
+        return documents
+
+    def _to_sheet_document(
+        self, df: pd.DataFrame, db_doc: DBDocument, sheet_name: str | None, source_name: str
+    ) -> LangChainDocument:
+        col_summary = ", ".join(f"{c} ({df[c].dtype})" for c in df.columns)
+        text = (
+            f"Source: {source_name} ({len(df)} rows)\n"
+            f"Columns ({len(df.columns)}): {col_summary}"
+        )
+        return LangChainDocument(
+            page_content=text,
+            metadata={
+                "user_id": db_doc.user_id,
+                "source_type": "document",
+                "chunk_level": "sheet",
+                "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
+                "data": {
+                    "document_id": db_doc.id,
+                    "filename": db_doc.filename,
+                    "file_type": db_doc.file_type,
+                    "sheet_name": sheet_name,
+                    "column_names": list(df.columns),
+                    "row_count": len(df),
+                },
+            },
+        )

+    async def _build_csv_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
+        """Profile each column of a CSV file and upload Parquet to Azure Blob."""
+        df = pd.read_csv(BytesIO(content))
+        await upload_parquet(df, db_doc.user_id, db_doc.id)
+        logger.info(f"Uploaded Parquet for CSV {db_doc.id}")
+        docs = self._profile_dataframe(df, db_doc.filename, db_doc)
+        docs.append(self._to_sheet_document(df, db_doc, sheet_name=None, source_name=db_doc.filename))
+        return docs
+
+    async def _build_excel_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
+        """Profile each column of every sheet in an Excel file and upload one Parquet per sheet."""
+        sheets = pd.read_excel(BytesIO(content), sheet_name=None)
+        documents = []
+        for sheet_name, df in sheets.items():
+            source_name = f"{db_doc.filename} / sheet: {sheet_name}"
+            docs = self._profile_dataframe(df, source_name, db_doc)
+            for doc in docs:
+                doc.metadata["data"]["sheet_name"] = sheet_name
+                doc.metadata["chunk_level"] = "column"
+            documents.extend(docs)
+            documents.append(self._to_sheet_document(df, db_doc, sheet_name, source_name))
+            await upload_parquet(df, db_doc.user_id, db_doc.id, sheet_name)
+            logger.info(f"Uploaded Parquet for sheet '{sheet_name}' of {db_doc.id}")
         return documents

     def _extract_text(self, content: bytes, file_type: str) -> str:
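To make the column-profile chunks concrete, here is a toy input and the chunk text `_profile_dataframe` would build for its first column, traced through the same pandas calls (the frame itself is illustrative):

```python
# Toy input — each column becomes one text chunk like:
#
#   Source: sales.csv (4 rows)
#   Column: region (object)
#   Null count: 0
#   Distinct count: 2 (50.0%)
#   Sample values: ['APAC', 'APAC', 'EMEA', 'EMEA']
#
# No "Top values" line here: that only appears when distinct_ratio <= 5%,
# i.e. genuinely low-cardinality categorical columns.
import pandas as pd

df = pd.DataFrame({
    "region": ["APAC", "APAC", "EMEA", "EMEA"],
    "revenue": [120.0, 95.5, 80.0, 110.0],  # numeric -> also gets Min/Max/Mean/Median lines
})
```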
src/models/credentials.py
ADDED

@@ -0,0 +1,164 @@
"""Pydantic credential schemas for user-registered external databases.

Imported by the `/database-clients` API router (`src/api/v1/db_client.py`) and,
via `DbType`, by the db pipeline connector (`src/pipeline/db_pipeline/connector.py`).

Sensitive fields (`password`, `service_account_json`) are Fernet-encrypted by
the database_client service before being stored in the JSONB column; these
schemas describe the plaintext wire format, not the stored shape.
"""

from typing import Literal, Optional, Union

from pydantic import BaseModel, Field

# ---------------------------------------------------------------------------
# Supported DB types
# ---------------------------------------------------------------------------

DbType = Literal["postgres", "mysql", "sqlserver", "supabase", "bigquery", "snowflake"]


# ---------------------------------------------------------------------------
# Typed credential schemas per DB type
# ---------------------------------------------------------------------------


class PostgresCredentials(BaseModel):
    """Connection credentials for PostgreSQL."""

    host: str = Field(..., description="Hostname or IP address of the PostgreSQL server.", examples=["db.example.com"])
    port: int = Field(5432, description="Port number (default: 5432).", examples=[5432])
    database: str = Field(..., description="Name of the target database.", examples=["mydb"])
    username: str = Field(..., description="Database username.", examples=["admin"])
    password: str = Field(..., description="Database password. Will be encrypted at rest.", examples=["s3cr3t!"])
    ssl_mode: Literal["disable", "require", "verify-ca", "verify-full"] = Field(
        "require",
        description="SSL mode for the connection.",
        examples=["require"],
    )


class MysqlCredentials(BaseModel):
    """Connection credentials for MySQL."""

    host: str = Field(..., description="Hostname or IP address of the MySQL server.", examples=["db.example.com"])
    port: int = Field(3306, description="Port number (default: 3306).", examples=[3306])
    database: str = Field(..., description="Name of the target database.", examples=["mydb"])
    username: str = Field(..., description="Database username.", examples=["admin"])
    password: str = Field(..., description="Database password. Will be encrypted at rest.", examples=["s3cr3t!"])
    ssl: bool = Field(True, description="Enable SSL for the connection.", examples=[True])


class SqlServerCredentials(BaseModel):
    """Connection credentials for Microsoft SQL Server."""

    host: str = Field(..., description="Hostname or IP address of the SQL Server.", examples=["sqlserver.example.com"])
    port: int = Field(1433, description="Port number (default: 1433).", examples=[1433])
    database: str = Field(..., description="Name of the target database.", examples=["mydb"])
    username: str = Field(..., description="Database username.", examples=["sa"])
    password: str = Field(..., description="Database password. Will be encrypted at rest.", examples=["s3cr3t!"])
    driver: Optional[str] = Field(
        None,
        description="ODBC driver name. Leave empty to use the default driver.",
        examples=["ODBC Driver 17 for SQL Server"],
    )


class SupabaseCredentials(BaseModel):
    """Connection credentials for Supabase (PostgreSQL-based).

    Use the connection string details from your Supabase project dashboard
    under Settings > Database.
    """

    host: str = Field(
        ...,
        description="Supabase database host (e.g. db.<project-ref>.supabase.co, or the pooler host).",
        examples=["db.xxxx.supabase.co"],
    )
    port: int = Field(
        5432,
        description="Port number. Use 5432 for direct connection, 6543 for the connection pooler.",
        examples=[5432],
    )
    database: str = Field("postgres", description="Database name (always 'postgres' for Supabase).", examples=["postgres"])
    username: str = Field(
        ...,
        description="Database user. Use 'postgres' for direct connection, or 'postgres.<project-ref>' for the pooler.",
        examples=["postgres"],
    )
    password: str = Field(..., description="Database password (set in Supabase dashboard). Will be encrypted at rest.", examples=["s3cr3t!"])
    ssl_mode: Literal["require", "verify-ca", "verify-full"] = Field(
        "require",
        description="SSL mode. Supabase always requires SSL.",
        examples=["require"],
    )


class BigQueryCredentials(BaseModel):
    """Connection credentials for Google BigQuery.

    Requires a GCP Service Account with at least BigQuery Data Viewer
    and BigQuery Job User roles.
    """

    project_id: str = Field(..., description="GCP project ID where the BigQuery dataset resides.", examples=["my-gcp-project"])
    dataset_id: str = Field(..., description="BigQuery dataset name to connect to.", examples=["my_dataset"])
    location: Optional[str] = Field(
        "US",
        description="Dataset location/region (default: US).",
        examples=["US", "EU", "asia-southeast1"],
    )
    service_account_json: str = Field(
        ...,
        description=(
            "Full content of the GCP Service Account key JSON file as a string. "
            "Will be encrypted at rest."
        ),
        examples=['{"type":"service_account","project_id":"my-gcp-project","private_key_id":"..."}'],
    )


class SnowflakeCredentials(BaseModel):
    """Connection credentials for Snowflake."""

    account: str = Field(
        ...,
        description="Snowflake account identifier, including region if applicable (e.g. myaccount.us-east-1).",
        examples=["myaccount.us-east-1"],
    )
    warehouse: str = Field(..., description="Name of the virtual warehouse to use for queries.", examples=["COMPUTE_WH"])
    database: str = Field(..., description="Name of the target Snowflake database.", examples=["MY_DB"])
    db_schema: Optional[str] = Field("PUBLIC", alias="schema", description="Schema name (default: PUBLIC).", examples=["PUBLIC"])
    username: str = Field(..., description="Snowflake username.", examples=["admin"])
    password: str = Field(..., description="Snowflake password. Will be encrypted at rest.", examples=["s3cr3t!"])
    role: Optional[str] = Field(None, description="Snowflake role to assume for the session.", examples=["SYSADMIN"])


# Union of all credential shapes — reserved for future typed validation on
# DatabaseClientCreate.credentials (currently Dict[str, Any]). Kept exported
# so downstream code can reference it without re-declaring.
CredentialsUnion = Union[
    PostgresCredentials,
    MysqlCredentials,
    SqlServerCredentials,
    SupabaseCredentials,
    BigQueryCredentials,
    SnowflakeCredentials,
]


# Doc-only helper: surfaces per-type credential shapes in the Swagger "Schemas"
# panel so API consumers can discover the exact field set for each db_type.
# Not referenced by any endpoint — importing it in db_client.py is enough for
# FastAPI's OpenAPI generator to pick it up.
class CredentialSchemas(BaseModel):
    """Reference schemas for `credentials` per `db_type` (Swagger-only, not used by endpoints)."""

    postgres: PostgresCredentials
    mysql: MysqlCredentials
    sqlserver: SqlServerCredentials
    supabase: SupabaseCredentials
    bigquery: BigQueryCredentials
    snowflake: SnowflakeCredentials
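An illustrative registration payload for the `postgres` shape, validated against the typed schema above (the body wrapper fields `name`/`db_type`/`credentials` follow the service's `create` signature; the concrete endpoint path is not shown in this file):

```python
payload = {
    "name": "Prod DB",
    "db_type": "postgres",
    "credentials": {
        "host": "db.example.com",
        "port": 5432,
        "database": "mydb",
        "username": "admin",
        "password": "s3cr3t!",  # encrypted at rest by the database_client service
        "ssl_mode": "require",
    },
}

# Validating the credentials dict against the typed schema:
from src.models.credentials import PostgresCredentials

creds = PostgresCredentials(**payload["credentials"])
```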
src/models/sql_query.py
ADDED

@@ -0,0 +1,8 @@
"""Structured output model for LLM-generated SQL queries."""

from pydantic import BaseModel, Field


class SQLQuery(BaseModel):
    sql: str = Field(description="A single SQL SELECT statement. No markdown, no explanation inline.")
    reasoning: str = Field(description="One sentence: what this query answers.")
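A sketch of how a query executor might bind this model to an LLM via LangChain's structured-output support — the specific chat model and prompt text are assumptions, since the executor wiring lives elsewhere in this PR:

```python
# Assumed: any LangChain chat model supporting with_structured_output works here.
from langchain_openai import ChatOpenAI

from src.models.sql_query import SQLQuery

llm = ChatOpenAI(model="gpt-4o-mini")
sql_llm = llm.with_structured_output(SQLQuery)

result = sql_llm.invoke(
    "Schema: orders(id, total, created_at). Question: total revenue in 2024?"
)
# result.sql       -> a single SELECT statement
# result.reasoning -> one-sentence summary of what the query answers
```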
src/models/structured_output.py
CHANGED

@@ -19,3 +19,7 @@ class IntentClassification(BaseModel):
         default="",
         description="Direct response if no search needed (for greetings, etc.)"
     )
+    source_hint: str = Field(
+        default="both",
+        description="Which sources to search: 'document' (PDF/DOCX/TXT), 'schema' (DB/CSV/XLSX), or 'both'"
+    )
src/pipeline/db_pipeline/__init__.py
ADDED

@@ -0,0 +1,3 @@
from src.pipeline.db_pipeline.db_pipeline_service import DbPipelineService, db_pipeline_service

__all__ = ["DbPipelineService", "db_pipeline_service"]
src/pipeline/db_pipeline/db_pipeline_service.py
ADDED
|
@@ -0,0 +1,302 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Service for ingesting a user's external database into the vector store.
|
| 2 |
+
|
| 3 |
+
End-to-end flow: connect -> introspect schema -> profile columns -> build text
|
| 4 |
+
-> embed + store in the shared PGVector collection (tagged with
|
| 5 |
+
`source_type="database"`, retrievable via the same retriever used for docs).
|
| 6 |
+
|
| 7 |
+
Sync DB work (SQLAlchemy inspect, pandas read_sql) runs in a threadpool;
|
| 8 |
+
async vector writes stay on the event loop.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import asyncio
|
| 12 |
+
from contextlib import contextmanager
|
| 13 |
+
from datetime import datetime, timezone, timedelta
|
| 14 |
+
from typing import Any, Iterator, Optional
|
| 15 |
+
|
| 16 |
+
from langchain_core.documents import Document as LangChainDocument
|
| 17 |
+
from sqlalchemy import URL, create_engine, text
|
| 18 |
+
from sqlalchemy.engine import Engine
|
| 19 |
+
|
| 20 |
+
from src.db.postgres.connection import _pgvector_engine
|
| 21 |
+
from src.db.postgres.vector_store import get_vector_store
|
| 22 |
+
from src.middlewares.logging import get_logger
|
| 23 |
+
from src.models.credentials import DbType
|
| 24 |
+
from src.pipeline.db_pipeline.extractor import (
|
| 25 |
+
build_table_chunk,
|
| 26 |
+
fetch_sample_row,
|
| 27 |
+
get_row_count,
|
| 28 |
+
get_schema,
|
| 29 |
+
profile_table,
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
logger = get_logger("db_pipeline")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class DbPipelineService:
|
| 36 |
+
"""End-to-end DB ingestion: connect -> introspect -> profile -> embed -> store."""
|
| 37 |
+
|
| 38 |
+
def connect(self, db_type: DbType, credentials: dict[str, Any]) -> Engine:
|
| 39 |
+
"""Build a SQLAlchemy engine for the user's database.
|
| 40 |
+
|
| 41 |
+
`credentials` is the plaintext dict matching the per-type schema in
|
| 42 |
+
`src/models/credentials.py`. BigQuery/Snowflake auth models differ
|
| 43 |
+
from host/port/user/pass, so every shape flows through one dict.
|
| 44 |
+
|
| 45 |
+
Optional driver imports (snowflake-sqlalchemy, json for BigQuery) are
|
| 46 |
+
done lazily so an env missing one driver doesn't break module import.
|
| 47 |
+
"""
|
| 48 |
+
logger.info("connecting to user db", db_type=db_type)
|
| 49 |
+
|
| 50 |
+
if db_type in ("postgres", "supabase"):
|
| 51 |
+
query = (
|
| 52 |
+
{"sslmode": credentials["ssl_mode"]} if credentials.get("ssl_mode") else {}
|
| 53 |
+
)
|
| 54 |
+
url = URL.create(
|
| 55 |
+
drivername="postgresql+psycopg2",
|
| 56 |
+
username=credentials["username"],
|
| 57 |
+
password=credentials["password"],
|
| 58 |
+
host=credentials["host"],
|
| 59 |
+
port=credentials["port"],
|
| 60 |
+
database=credentials["database"],
|
| 61 |
+
query=query,
|
| 62 |
+
)
|
| 63 |
+
return create_engine(url)
|
| 64 |
+
|
| 65 |
+
if db_type == "mysql":
|
| 66 |
+
url = URL.create(
|
| 67 |
+
drivername="mysql+pymysql",
|
| 68 |
+
username=credentials["username"],
|
| 69 |
+
password=credentials["password"],
|
| 70 |
+
host=credentials["host"],
|
| 71 |
+
port=credentials["port"],
|
| 72 |
+
database=credentials["database"],
|
| 73 |
+
)
|
| 74 |
+
# pymysql only activates TLS when the `ssl` dict is truthy
|
| 75 |
+
# (empty dict is falsy and silently disables TLS). Use system-
|
| 76 |
+
# default CAs via certifi + hostname verification — required by
|
| 77 |
+
# managed MySQL providers like TiDB Cloud / PlanetScale / Aiven.
|
| 78 |
+
if credentials.get("ssl", True):
|
| 79 |
+
import certifi
|
| 80 |
+
|
| 81 |
+
connect_args = {
|
| 82 |
+
"ssl": {
|
| 83 |
+
"ca": certifi.where(),
|
| 84 |
+
"check_hostname": True,
|
| 85 |
+
}
|
| 86 |
+
}
|
| 87 |
+
else:
|
| 88 |
+
connect_args = {}
|
| 89 |
+
return create_engine(url, connect_args=connect_args)
|
| 90 |
+
|
| 91 |
+
if db_type == "sqlserver":
|
| 92 |
+
# `driver` applies to pyodbc only; we ship pymssql. Accept-and-ignore
|
| 93 |
+
# keeps the credential schema stable.
|
| 94 |
+
if credentials.get("driver"):
|
| 95 |
+
logger.info(
|
| 96 |
+
"sqlserver driver hint ignored (using pymssql)",
|
| 97 |
+
driver=credentials["driver"],
|
| 98 |
+
)
|
| 99 |
+
url = URL.create(
|
| 100 |
+
drivername="mssql+pymssql",
|
| 101 |
+
username=credentials["username"],
|
| 102 |
+
password=credentials["password"],
|
| 103 |
+
host=credentials["host"],
|
| 104 |
+
port=credentials["port"],
|
| 105 |
+
database=credentials["database"],
|
| 106 |
+
)
|
| 107 |
+
return create_engine(url)
|
| 108 |
+
|
| 109 |
+
if db_type == "bigquery":
|
| 110 |
+
import json
|
| 111 |
+
|
| 112 |
+
sa_info = json.loads(credentials["service_account_json"])
|
| 113 |
+
# sqlalchemy-bigquery URL shape: bigquery://<project>/<dataset>
|
| 114 |
+
url = f"bigquery://{credentials['project_id']}/{credentials['dataset_id']}"
|
| 115 |
+
return create_engine(
|
| 116 |
+
url,
|
| 117 |
+
credentials_info=sa_info,
|
| 118 |
+
location=credentials.get("location", "US"),
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
if db_type == "snowflake":
|
| 122 |
+
from snowflake.sqlalchemy import URL as SnowflakeURL
|
| 123 |
+
|
| 124 |
+
url = SnowflakeURL(
|
| 125 |
+
account=credentials["account"],
|
| 126 |
+
user=credentials["username"],
|
| 127 |
+
password=credentials["password"],
|
| 128 |
+
database=credentials["database"],
|
| 129 |
+
schema=(
|
| 130 |
+
credentials.get("db_schema")
|
| 131 |
+
or credentials.get("schema")
|
| 132 |
+
or "PUBLIC"
|
| 133 |
+
),
|
| 134 |
+
warehouse=credentials["warehouse"],
|
| 135 |
+
role=credentials.get("role") or "",
|
| 136 |
+
)
|
| 137 |
+
return create_engine(url)
|
| 138 |
+
|
| 139 |
+
raise NotImplementedError(f"Unsupported db_type: {db_type}")
|
| 140 |
+
|
| 141 |
+
@contextmanager
|
| 142 |
+
def engine_scope(
|
| 143 |
+
self, db_type: DbType, credentials: dict[str, Any]
|
| 144 |
+
) -> Iterator[Engine]:
|
| 145 |
+
"""Yield a connected Engine and dispose its pool on exit.
|
| 146 |
+
|
| 147 |
+
API callers should prefer this over raw `connect(...)` so user DB
|
| 148 |
+
connection pools do not leak between pipeline runs.
|
| 149 |
+
"""
|
| 150 |
+
engine = self.connect(db_type, credentials)
|
| 151 |
+
try:
|
| 152 |
+
yield engine
|
| 153 |
+
finally:
|
| 154 |
+
engine.dispose()
|
| 155 |
+
|
| 156 |
+
def _to_document(
|
| 157 |
+
self, user_id: str, client_id: str, table_name: str, entry: dict, updated_at: str
|
| 158 |
+
) -> LangChainDocument:
|
| 159 |
+
col = entry["col"]
|
| 160 |
+
return LangChainDocument(
|
| 161 |
+
page_content=entry["text"],
|
| 162 |
+
metadata={
|
| 163 |
+
"user_id": user_id,
|
| 164 |
+
"source_type": "database",
|
| 165 |
+
"chunk_level": "column",
|
| 166 |
+
"database_client_id": client_id,
|
| 167 |
+
"updated_at": updated_at,
|
| 168 |
+
"data": {
|
| 169 |
+
"table_name": table_name,
|
| 170 |
+
"column_name": col["name"],
|
| 171 |
+
"column_type": col["type"],
|
| 172 |
+
"is_primary_key": col.get("is_primary_key", False),
|
| 173 |
+
"foreign_key": col.get("foreign_key"),
|
| 174 |
+
},
|
| 175 |
+
},
|
| 176 |
+
)
|
| 177 |
+
|
| 178 |
+
def _to_table_document(
|
| 179 |
+
self,
|
| 180 |
+
user_id: str,
|
| 181 |
+
client_id: str,
|
| 182 |
+
table_name: str,
|
| 183 |
+
columns: list[dict],
|
| 184 |
+
row_count: int,
|
| 185 |
+
text: str,
|
| 186 |
+
updated_at: str,
|
| 187 |
+
) -> LangChainDocument:
|
| 188 |
+
foreign_keys = []
|
| 189 |
+
for c in columns:
|
| 190 |
+
fk = c.get("foreign_key")
|
| 191 |
+
if not fk:
|
| 192 |
+
continue
|
| 193 |
+
target_table, _, target_column = fk.partition(".")
|
| 194 |
+
foreign_keys.append({
|
| 195 |
+
"column": c["name"],
|
| 196 |
+
"target_table": target_table,
|
| 197 |
+
"target_column": target_column,
|
| 198 |
+
})
|
| 199 |
+
|
| 200 |
+
return LangChainDocument(
|
| 201 |
+
page_content=text,
|
| 202 |
+
metadata={
|
| 203 |
+
"user_id": user_id,
|
| 204 |
+
"source_type": "database",
|
| 205 |
+
"chunk_level": "table",
|
| 206 |
+
"database_client_id": client_id,
|
| 207 |
+
"updated_at": updated_at,
|
| 208 |
+
"data": {
|
| 209 |
+
"table_name": table_name,
|
| 210 |
+
"row_count": row_count,
|
| 211 |
+
"primary_key": [c["name"] for c in columns if c.get("is_primary_key")],
|
| 212 |
+
"foreign_keys": foreign_keys,
|
| 213 |
+
"column_names": [c["name"] for c in columns],
|
| 214 |
+
},
|
| 215 |
+
},
|
| 216 |
+
)
|
| 217 |
+
|
| 218 |
+
async def run(
|
| 219 |
+
self,
|
| 220 |
+
user_id: str,
|
| 221 |
+
client_id: str,
|
| 222 |
+
engine: Engine,
|
| 223 |
+
exclude_tables: Optional[frozenset[str]] = None,
|
| 224 |
+
) -> int:
|
| 225 |
+
"""Introspect the user's DB, profile columns, embed descriptions, store in PGVector.
|
| 226 |
+
|
| 227 |
+
Returns:
|
| 228 |
+
Total number of chunks ingested.
|
| 229 |
+
"""
|
| 230 |
+
vector_store = get_vector_store()
|
| 231 |
+
logger.info("db pipeline start", user_id=user_id)
|
| 232 |
+
|
| 233 |
+
# Profile first — if this fails, old embeddings are untouched
|
| 234 |
+
schema = await asyncio.to_thread(get_schema, engine, exclude_tables)
|
| 235 |
+
|
| 236 |
+
updated_at = datetime.now(timezone(timedelta(hours=7))).isoformat()
|
| 237 |
+
all_docs: list = []
|
| 238 |
+
for table_name, columns in schema.items():
|
| 239 |
+
logger.info("profiling table", table=table_name, columns=len(columns))
|
| 240 |
+
entries = await asyncio.to_thread(profile_table, engine, table_name, columns)
|
| 241 |
+
docs = [self._to_document(user_id, client_id, table_name, e, updated_at) for e in entries]
|
| 242 |
+
all_docs.extend(docs)
|
| 243 |
+
|
| 244 |
+
# Table-level chunk. Failures here are logged and skipped — column
|
| 245 |
+
# chunks above are already in all_docs and will still be written.
|
| 246 |
+
try:
|
| 247 |
+
row_count = await asyncio.to_thread(get_row_count, engine, table_name)
|
| 248 |
+
sample_row = (
|
| 249 |
+
await asyncio.to_thread(fetch_sample_row, engine, table_name)
|
| 250 |
+
if row_count > 0
|
| 251 |
+
else None
|
| 252 |
+
)
|
| 253 |
+
table_text = build_table_chunk(
|
| 254 |
+
table_name, row_count, columns, entries, sample_row
|
| 255 |
+
)
|
| 256 |
+
all_docs.append(
|
| 257 |
+
self._to_table_document(
|
| 258 |
+
user_id, client_id, table_name, columns, row_count, table_text, updated_at
|
| 259 |
+
)
|
| 260 |
+
)
|
| 261 |
+
except Exception as e:
|
| 262 |
+
logger.error(
|
| 263 |
+
"table chunk generation failed", table=table_name, error=str(e)
|
| 264 |
+
)
|
| 265 |
+
|
| 266 |
+
logger.info("profiled table", table=table_name, count=len(docs))
|
| 267 |
+
|
| 268 |
+
# Insert new chunks first; only delete stale chunks after the insert succeeds.
|
| 269 |
+
# Prevents data loss if aadd_documents fails — old embeddings stay queryable
|
| 270 |
+
# until they're proven replaceable. Stale rows are identified by an older
|
| 271 |
+
# updated_at than this run.
|
| 272 |
+
if not all_docs:
|
| 273 |
+
logger.warning(
|
| 274 |
+
"no docs produced from schema; skipping delete to preserve existing embeddings",
|
| 275 |
+
user_id=user_id,
|
| 276 |
+
client_id=client_id,
|
| 277 |
+
)
|
| 278 |
+
return 0
|
| 279 |
+
|
| 280 |
+
await vector_store.aadd_documents(all_docs)
|
| 281 |
+
|
| 282 |
+
async with _pgvector_engine.begin() as conn:
|
| 283 |
+
result = await conn.execute(
|
| 284 |
+
text(
|
| 285 |
+
"DELETE FROM langchain_pg_embedding "
|
| 286 |
+
"WHERE cmetadata->>'user_id' = :user_id "
|
| 287 |
+
" AND cmetadata->>'source_type' = 'database' "
|
| 288 |
+
" AND cmetadata->>'database_client_id' = :client_id "
|
| 289 |
+
" AND cmetadata->>'updated_at' < :updated_at "
|
| 290 |
+
" AND collection_id = ("
|
| 291 |
+
" SELECT uuid FROM langchain_pg_collection WHERE name = 'document_embeddings'"
|
| 292 |
+
" )"
|
| 293 |
+
),
|
| 294 |
+
{"user_id": user_id, "client_id": client_id, "updated_at": updated_at},
|
| 295 |
+
)
|
| 296 |
+
logger.info("cleared stale db embeddings", user_id=user_id, deleted=result.rowcount)
|
| 297 |
+
|
| 298 |
+
logger.info("db pipeline complete", user_id=user_id, total=len(all_docs))
|
| 299 |
+
return len(all_docs)
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
db_pipeline_service = DbPipelineService()
|
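For orientation, a minimal caller sketch for the service above. `client` stands in for a stored database-client row and is hypothetical here; `engine_scope` and `decrypt_credentials_dict` are helpers introduced elsewhere in this diff.

    from src.pipeline.db_pipeline import db_pipeline_service
    from src.utils.db_credential_encryption import decrypt_credentials_dict

    async def refresh_db_embeddings(client) -> int:
        # engine_scope yields a short-lived SQLAlchemy Engine for the user's DB
        creds = decrypt_credentials_dict(client.credentials)
        with db_pipeline_service.engine_scope(client.db_type, creds) as engine:
            return await db_pipeline_service.run(
                user_id=client.user_id,
                client_id=client.id,
                engine=engine,
            )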
src/pipeline/db_pipeline/extractor.py
ADDED
|
@@ -0,0 +1,283 @@
|
| 1 |
+
"""Schema introspection and per-column profiling for a user's database.
|
| 2 |
+
|
| 3 |
+
Identifiers (table/column names) are quoted via the engine's dialect preparer,
|
| 4 |
+
which handles reserved words, mixed case, and embedded quotes correctly across
|
| 5 |
+
dialects. Values used in SQL come from SQLAlchemy inspection of the DB itself,
|
| 6 |
+
not user input.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
import pandas as pd
|
| 12 |
+
from sqlalchemy import Float, Integer, Numeric, inspect
|
| 13 |
+
from sqlalchemy.engine import Engine
|
| 14 |
+
|
| 15 |
+
from src.middlewares.logging import get_logger
|
| 16 |
+
|
| 17 |
+
logger = get_logger("db_extractor")
|
| 18 |
+
|
| 19 |
+
TOP_VALUES_THRESHOLD = 0.05 # show top values if distinct_ratio <= 5%
|
| 20 |
+
|
| 21 |
+
# Dialects where PERCENTILE_CONT(...) WITHIN GROUP is supported as an aggregate.
|
| 22 |
+
# MySQL has no percentile aggregate; BigQuery has PERCENTILE_CONT only as an
|
| 23 |
+
# analytic (window) function — both drop median and keep min/max/mean.
|
| 24 |
+
_MEDIAN_DIALECTS = frozenset({"postgresql", "mssql", "snowflake"})
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _supports_median(engine: Engine) -> bool:
|
| 28 |
+
return engine.dialect.name in _MEDIAN_DIALECTS
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _head_query(
|
| 32 |
+
engine: Engine,
|
| 33 |
+
select_clause: str,
|
| 34 |
+
from_clause: str,
|
| 35 |
+
n: int,
|
| 36 |
+
order_by: str = "",
|
| 37 |
+
) -> str:
|
| 38 |
+
"""LIMIT/TOP-equivalent head query for the engine's dialect."""
|
| 39 |
+
if engine.dialect.name == "mssql":
|
| 40 |
+
return f"SELECT TOP {n} {select_clause} FROM {from_clause} {order_by}".strip()
|
| 41 |
+
return f"SELECT {select_clause} FROM {from_clause} {order_by} LIMIT {n}".strip()
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def _qi(engine: Engine, name: str) -> str:
|
| 45 |
+
"""Dialect-correct identifier quoting (schema.table also handled if dotted)."""
|
| 46 |
+
preparer = engine.dialect.identifier_preparer
|
| 47 |
+
if "." in name:
|
| 48 |
+
schema, _, table = name.partition(".")
|
| 49 |
+
return f"{preparer.quote(schema)}.{preparer.quote(table)}"
|
| 50 |
+
return preparer.quote(name)
|
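A quick sketch of what these two helpers emit, assuming one engine per dialect and the reserved word "order" as a table name (illustrative only; exact whitespace may differ):

    _qi(pg_engine, "order")     # '"order"'  (double quotes on postgresql)
    _qi(mysql_engine, "order")  # '`order`'  (backticks on mysql)

    _head_query(pg_engine, "*", '"order"', 5)     # SELECT * FROM "order" LIMIT 5
    _head_query(mssql_engine, "*", "[order]", 5)  # SELECT TOP 5 * FROM [order]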
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def get_schema(
|
| 54 |
+
engine: Engine, exclude_tables: Optional[frozenset[str]] = None
|
| 55 |
+
) -> dict[str, list[dict]]:
|
| 56 |
+
"""Returns {table_name: [{name, type, is_numeric, is_primary_key, foreign_key}, ...]}."""
|
| 57 |
+
exclude = exclude_tables or frozenset()
|
| 58 |
+
inspector = inspect(engine)
|
| 59 |
+
schema = {}
|
| 60 |
+
for table_name in inspector.get_table_names():
|
| 61 |
+
if table_name in exclude:
|
| 62 |
+
continue
|
| 63 |
+
|
| 64 |
+
pk = inspector.get_pk_constraint(table_name)
|
| 65 |
+
pk_cols = set(pk["constrained_columns"]) if pk else set()
|
| 66 |
+
|
| 67 |
+
fk_map = {}
|
| 68 |
+
for fk in inspector.get_foreign_keys(table_name):
|
| 69 |
+
for col, ref_col in zip(fk["constrained_columns"], fk["referred_columns"]):
|
| 70 |
+
fk_map[col] = f"{fk['referred_table']}.{ref_col}"
|
| 71 |
+
|
| 72 |
+
cols = inspector.get_columns(table_name)
|
| 73 |
+
schema[table_name] = [
|
| 74 |
+
{
|
| 75 |
+
"name": c["name"],
|
| 76 |
+
"type": str(c["type"]),
|
| 77 |
+
"is_numeric": isinstance(c["type"], (Integer, Numeric, Float)),
|
| 78 |
+
"is_primary_key": c["name"] in pk_cols,
|
| 79 |
+
"foreign_key": fk_map.get(c["name"]),
|
| 80 |
+
}
|
| 81 |
+
for c in cols
|
| 82 |
+
]
|
| 83 |
+
logger.info("extracted schema", table_count=len(schema))
|
| 84 |
+
return schema
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def get_row_count(engine: Engine, table_name: str) -> int:
|
| 88 |
+
# Cast to plain int — pandas returns numpy.int64 which fails JSONB serialization
|
| 89 |
+
# when the value lands in PGVector cmetadata via the table-level chunk.
|
| 90 |
+
return int(pd.read_sql(f"SELECT COUNT(*) FROM {_qi(engine, table_name)}", engine).iloc[0, 0])
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def profile_column(
|
| 94 |
+
engine: Engine,
|
| 95 |
+
table_name: str,
|
| 96 |
+
col_name: str,
|
| 97 |
+
is_numeric: bool,
|
| 98 |
+
row_count: int,
|
| 99 |
+
) -> dict:
|
| 100 |
+
"""Returns null_count, distinct_count, min/max, top values, and sample values."""
|
| 101 |
+
if row_count == 0:
|
| 102 |
+
return {
|
| 103 |
+
"null_count": 0,
|
| 104 |
+
"distinct_count": 0,
|
| 105 |
+
"distinct_ratio": 0.0,
|
| 106 |
+
"sample_values": [],
|
| 107 |
+
}
|
| 108 |
+
|
| 109 |
+
qt = _qi(engine, table_name)
|
| 110 |
+
qc = _qi(engine, col_name)
|
| 111 |
+
|
| 112 |
+
# Combined stats query: null_count, distinct_count, and min/max (if numeric).
|
| 113 |
+
# One round-trip instead of two.
|
| 114 |
+
select_cols = [
|
| 115 |
+
f"COUNT(*) - COUNT({qc}) AS nulls",
|
| 116 |
+
f"COUNT(DISTINCT {qc}) AS distincts",
|
| 117 |
+
]
|
| 118 |
+
if is_numeric:
|
| 119 |
+
select_cols.append(f"MIN({qc}) AS min_val")
|
| 120 |
+
select_cols.append(f"MAX({qc}) AS max_val")
|
| 121 |
+
select_cols.append(f"AVG({qc}) AS mean_val")
|
| 122 |
+
if _supports_median(engine):
|
| 123 |
+
select_cols.append(
|
| 124 |
+
f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {qc}) AS median_val"
|
| 125 |
+
)
|
| 126 |
+
stats = pd.read_sql(f"SELECT {', '.join(select_cols)} FROM {qt}", engine)
|
| 127 |
+
|
| 128 |
+
null_count = int(stats.iloc[0]["nulls"])
|
| 129 |
+
distinct_count = int(stats.iloc[0]["distincts"])
|
| 130 |
+
distinct_ratio = distinct_count / row_count if row_count > 0 else 0
|
| 131 |
+
|
| 132 |
+
profile = {
|
| 133 |
+
"null_count": null_count,
|
| 134 |
+
"distinct_count": distinct_count,
|
| 135 |
+
"distinct_ratio": round(distinct_ratio, 4),
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
if is_numeric:
|
| 139 |
+
profile["min"] = stats.iloc[0]["min_val"]
|
| 140 |
+
profile["max"] = stats.iloc[0]["max_val"]
|
| 141 |
+
profile["mean"] = stats.iloc[0]["mean_val"]
|
| 142 |
+
if _supports_median(engine):
|
| 143 |
+
profile["median"] = stats.iloc[0]["median_val"]
|
| 144 |
+
|
| 145 |
+
if 0 < distinct_ratio <= TOP_VALUES_THRESHOLD:
|
| 146 |
+
top_sql = _head_query(
|
| 147 |
+
engine,
|
| 148 |
+
select_clause=f"{qc}, COUNT(*) AS cnt",
|
| 149 |
+
from_clause=f"{qt} GROUP BY {qc}",
|
| 150 |
+
n=10,
|
| 151 |
+
order_by="ORDER BY cnt DESC",
|
| 152 |
+
)
|
| 153 |
+
top = pd.read_sql(top_sql, engine)
|
| 154 |
+
profile["top_values"] = list(zip(top.iloc[:, 0].tolist(), top["cnt"].tolist()))
|
| 155 |
+
|
| 156 |
+
sample = pd.read_sql(_head_query(engine, qc, qt, 5), engine)
|
| 157 |
+
profile["sample_values"] = sample.iloc[:, 0].tolist()
|
| 158 |
+
|
| 159 |
+
return profile
|
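As an example of the single round-trip, the statement profile_column issues for a hypothetical numeric column "amount" on table "payments" against a postgresql engine (shown wrapped for readability; the code emits it on one line):

    SELECT COUNT(*) - COUNT("amount") AS nulls,
           COUNT(DISTINCT "amount") AS distincts,
           MIN("amount") AS min_val,
           MAX("amount") AS max_val,
           AVG("amount") AS mean_val,
           PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY "amount") AS median_val
    FROM "payments"

On mysql the same call drops the PERCENTILE_CONT term, since _supports_median returns False there.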
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def profile_table(engine: Engine, table_name: str, columns: list[dict]) -> list[dict]:
|
| 163 |
+
"""Profile every column in a table. Returns [{col, profile, text}, ...].
|
| 164 |
+
|
| 165 |
+
Per-column errors are logged and skipped so one bad column doesn't abort
|
| 166 |
+
the whole table.
|
| 167 |
+
"""
|
| 168 |
+
row_count = get_row_count(engine, table_name)
|
| 169 |
+
if row_count == 0:
|
| 170 |
+
logger.info("skipping empty table", table=table_name)
|
| 171 |
+
return []
|
| 172 |
+
|
| 173 |
+
results = []
|
| 174 |
+
for col in columns:
|
| 175 |
+
try:
|
| 176 |
+
profile = profile_column(
|
| 177 |
+
engine, table_name, col["name"], col.get("is_numeric", False), row_count
|
| 178 |
+
)
|
| 179 |
+
text = build_text(table_name, row_count, col, profile)
|
| 180 |
+
results.append({"col": col, "profile": profile, "text": text})
|
| 181 |
+
except Exception as e:
|
| 182 |
+
logger.error(
|
| 183 |
+
"column profiling failed",
|
| 184 |
+
table=table_name,
|
| 185 |
+
column=col["name"],
|
| 186 |
+
error=str(e),
|
| 187 |
+
)
|
| 188 |
+
continue
|
| 189 |
+
return results
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def fetch_sample_row(engine: Engine, table_name: str) -> Optional[dict]:
|
| 193 |
+
"""First row of the table as a dict, or None if the table is empty.
|
| 194 |
+
|
| 195 |
+
Reuses _qi for dialect-correct quoting and _head_query for TOP/LIMIT.
|
| 196 |
+
"""
|
| 197 |
+
qt = _qi(engine, table_name)
|
| 198 |
+
sql = _head_query(engine, "*", qt, 1)
|
| 199 |
+
df = pd.read_sql(sql, engine)
|
| 200 |
+
if df.empty:
|
| 201 |
+
return None
|
| 202 |
+
return df.iloc[0].to_dict()
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def build_table_chunk(
|
| 206 |
+
table_name: str,
|
| 207 |
+
row_count: int,
|
| 208 |
+
columns: list[dict],
|
| 209 |
+
column_profiles: list[dict],
|
| 210 |
+
sample_row: Optional[dict],
|
| 211 |
+
) -> str:
|
| 212 |
+
"""Build the table-level chunk text.
|
| 213 |
+
|
| 214 |
+
Format (lines omitted when not applicable):
|
| 215 |
+
Table: {name} ({row_count} rows)
|
| 216 |
+
Primary key: {pk_cols}
|
| 217 |
+
Foreign keys: {col} -> {target_table}.{target_col}, ...
|
| 218 |
+
Columns ({n}): {col1}, {col2}, ...
|
| 219 |
+
Numeric ranges: {col} [{min}-{max}], ...
|
| 220 |
+
Sample row: {dict}
|
| 221 |
+
|
| 222 |
+
Pure formatter — no DB I/O. column_profiles is the output of profile_table
|
| 223 |
+
and is reused so we don't re-introspect.
|
| 224 |
+
"""
|
| 225 |
+
lines = [f"Table: {table_name} ({row_count} rows)"]
|
| 226 |
+
|
| 227 |
+
pk_cols = [c["name"] for c in columns if c.get("is_primary_key")]
|
| 228 |
+
if pk_cols:
|
| 229 |
+
lines.append(f"Primary key: {', '.join(pk_cols)}")
|
| 230 |
+
|
| 231 |
+
fk_parts = [
|
| 232 |
+
f"{c['name']} -> {c['foreign_key']}" for c in columns if c.get("foreign_key")
|
| 233 |
+
]
|
| 234 |
+
if fk_parts:
|
| 235 |
+
lines.append(f"Foreign keys: {', '.join(fk_parts)}")
|
| 236 |
+
|
| 237 |
+
col_names = [c["name"] for c in columns]
|
| 238 |
+
lines.append(f"Columns ({len(col_names)}): {', '.join(col_names)}")
|
| 239 |
+
|
| 240 |
+
range_parts = []
|
| 241 |
+
for entry in column_profiles:
|
| 242 |
+
col = entry["col"]
|
| 243 |
+
profile = entry["profile"]
|
| 244 |
+
if not col.get("is_numeric"):
|
| 245 |
+
continue
|
| 246 |
+
mn = profile.get("min")
|
| 247 |
+
mx = profile.get("max")
|
| 248 |
+
if mn is None or mx is None:
|
| 249 |
+
continue
|
| 250 |
+
range_parts.append(f"{col['name']} [{mn}-{mx}]")
|
| 251 |
+
if range_parts:
|
| 252 |
+
lines.append(f"Numeric ranges: {', '.join(range_parts)}")
|
| 253 |
+
|
| 254 |
+
if sample_row is not None:
|
| 255 |
+
lines.append(f"Sample row: {sample_row}")
|
| 256 |
+
|
| 257 |
+
return "\n".join(lines)
|
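For reference, the chunk text this formatter produces for a hypothetical "orders" table (all values invented):

    Table: orders (1240 rows)
    Primary key: id
    Foreign keys: customer_id -> customers.id
    Columns (4): id, customer_id, amount, status
    Numeric ranges: id [1-1240], amount [0.5-9800.0]
    Sample row: {'id': 1, 'customer_id': 7, 'amount': 120.0, 'status': 'PAID'}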
| 258 |
+
|
| 259 |
+
|
| 260 |
+
def build_text(table_name: str, row_count: int, col: dict, profile: dict) -> str:
|
| 261 |
+
col_name = col["name"]
|
| 262 |
+
col_type = col["type"]
|
| 263 |
+
|
| 264 |
+
key_label = ""
|
| 265 |
+
if col.get("is_primary_key"):
|
| 266 |
+
key_label = " [PRIMARY KEY]"
|
| 267 |
+
elif col.get("foreign_key"):
|
| 268 |
+
key_label = f" [FK -> {col['foreign_key']}]"
|
| 269 |
+
|
| 270 |
+
text = f"Table: {table_name} ({row_count} rows)\n"
|
| 271 |
+
text += f"Column: {col_name} ({col_type}){key_label}\n"
|
| 272 |
+
text += f"Null count: {profile['null_count']}\n"
|
| 273 |
+
text += f"Distinct count: {profile['distinct_count']} ({profile['distinct_ratio']:.1%})\n"
|
| 274 |
+
if "min" in profile:
|
| 275 |
+
text += f"Min: {profile['min']}, Max: {profile['max']}\n"
|
| 276 |
+
text += f"Mean: {profile['mean']}\n"
|
| 277 |
+
if profile.get("median") is not None:
|
| 278 |
+
text += f"Median: {profile['median']}\n"
|
| 279 |
+
if "top_values" in profile:
|
| 280 |
+
top_str = ", ".join(f"{v} ({c})" for v, c in profile["top_values"])
|
| 281 |
+
text += f"Top values: {top_str}\n"
|
| 282 |
+
text += f"Sample values: {profile['sample_values']}"
|
| 283 |
+
return text
|
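Putting build_text together, a column-level chunk for a hypothetical low-cardinality "status" column would read:

    Table: orders (1240 rows)
    Column: status (VARCHAR(20))
    Null count: 0
    Distinct count: 3 (0.2%)
    Top values: PAID (900), PENDING (260), FAILED (80)
    Sample values: ['PAID', 'PAID', 'PENDING', 'FAILED', 'PAID']

Numeric columns additionally get the Min/Max, Mean, and (on supported dialects) Median lines.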
src/pipeline/document_pipeline/__init__.py
ADDED
|
File without changes
|
src/pipeline/document_pipeline/document_pipeline.py
ADDED
|
@@ -0,0 +1,94 @@
|
| 1 |
+
"""Document upload and processing pipeline."""
|
| 2 |
+
|
| 3 |
+
from fastapi import HTTPException, UploadFile
|
| 4 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 5 |
+
|
| 6 |
+
from src.document.document_service import document_service
|
| 7 |
+
from src.knowledge.processing_service import knowledge_processor
|
| 8 |
+
from src.knowledge.parquet_service import delete_document_parquets
|
| 9 |
+
from src.middlewares.logging import get_logger
|
| 10 |
+
from src.storage.az_blob.az_blob import blob_storage
|
| 11 |
+
|
| 12 |
+
logger = get_logger("document_pipeline")
|
| 13 |
+
|
| 14 |
+
# NOTE: Keep in sync with _DOC_TYPES in src/api/v1/document.py
|
| 15 |
+
SUPPORTED_FILE_TYPES = ["pdf", "docx", "txt", "csv", "xlsx"]
|
| 16 |
+
MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024 # 10 MB
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class DocumentPipeline:
|
| 20 |
+
"""Orchestrates the full document upload, process, and delete flows."""
|
| 21 |
+
|
| 22 |
+
async def upload(self, file: UploadFile, user_id: str, db: AsyncSession) -> dict:
|
| 23 |
+
"""Validate → upload to blob → save to DB."""
|
| 24 |
+
content = await file.read()
|
| 25 |
+
if not file.filename:
|
| 26 |
+
raise HTTPException(status_code=400, detail="Filename is required.")
|
| 27 |
+
file_type = file.filename.split(".")[-1].lower() if "." in file.filename else "txt"  # extensionless uploads fall back to plain text
|
| 28 |
+
|
| 29 |
+
if len(content) > MAX_FILE_SIZE_BYTES:
|
| 30 |
+
raise HTTPException(
|
| 31 |
+
status_code=400,
|
| 32 |
+
detail="File size exceeds maximum allowed size of 10 MB.",
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
if file_type not in SUPPORTED_FILE_TYPES:
|
| 36 |
+
raise HTTPException(
|
| 37 |
+
status_code=400,
|
| 38 |
+
detail=f"Unsupported file type. Supported: {', '.join(SUPPORTED_FILE_TYPES)}",
|
| 39 |
+
)
|
| 40 |
+
|
| 41 |
+
blob_name = await blob_storage.upload_file(content, file.filename, user_id)
|
| 42 |
+
document = await document_service.create_document(
|
| 43 |
+
db=db,
|
| 44 |
+
user_id=user_id,
|
| 45 |
+
filename=file.filename,
|
| 46 |
+
blob_name=blob_name,
|
| 47 |
+
file_size=len(content),
|
| 48 |
+
file_type=file_type,
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
logger.info(f"Uploaded document {document.id} for user {user_id}")
|
| 52 |
+
return {"id": document.id, "filename": document.filename, "status": document.status}
|
| 53 |
+
|
| 54 |
+
async def process(self, document_id: str, user_id: str, db: AsyncSession) -> dict:
|
| 55 |
+
"""Validate ownership → extract text → chunk → ingest to vector store."""
|
| 56 |
+
document = await document_service.get_document(db, document_id)
|
| 57 |
+
|
| 58 |
+
if not document:
|
| 59 |
+
raise HTTPException(status_code=404, detail="Document not found")
|
| 60 |
+
if document.user_id != user_id:
|
| 61 |
+
raise HTTPException(status_code=403, detail="Access denied")
|
| 62 |
+
|
| 63 |
+
try:
|
| 64 |
+
await document_service.update_document_status(db, document_id, "processing")
|
| 65 |
+
chunks_count = await knowledge_processor.process_document(document, db)
|
| 66 |
+
await document_service.update_document_status(db, document_id, "completed")
|
| 67 |
+
|
| 68 |
+
logger.info(f"Processed document {document_id}: {chunks_count} chunks")
|
| 69 |
+
return {"document_id": document_id, "chunks_processed": chunks_count}
|
| 70 |
+
|
| 71 |
+
except Exception as e:
|
| 72 |
+
logger.error(f"Processing failed for document {document_id}", error=str(e))
|
| 73 |
+
await document_service.update_document_status(db, document_id, "failed", str(e))
|
| 74 |
+
raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
|
| 75 |
+
|
| 76 |
+
async def delete(self, document_id: str, user_id: str, db: AsyncSession) -> dict:
|
| 77 |
+
"""Validate ownership → delete from blob and DB."""
|
| 78 |
+
document = await document_service.get_document(db, document_id)
|
| 79 |
+
|
| 80 |
+
if not document:
|
| 81 |
+
raise HTTPException(status_code=404, detail="Document not found")
|
| 82 |
+
if document.user_id != user_id:
|
| 83 |
+
raise HTTPException(status_code=403, detail="Access denied")
|
| 84 |
+
|
| 85 |
+
await document_service.delete_document(db, document_id)
|
| 86 |
+
|
| 87 |
+
if document.file_type in ("csv", "xlsx"):
|
| 88 |
+
await delete_document_parquets(user_id, document_id)
|
| 89 |
+
|
| 90 |
+
logger.info(f"Deleted document {document_id} for user {user_id}")
|
| 91 |
+
return {"document_id": document_id}
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
document_pipeline = DocumentPipeline()
|
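A sketch of how a route could delegate to this pipeline. get_current_user_id and get_db are hypothetical dependency names; the real wiring lives in src/api/v1/document.py, which this diff also touches.

    from fastapi import APIRouter, Depends, UploadFile
    from sqlalchemy.ext.asyncio import AsyncSession

    from src.pipeline.document_pipeline.document_pipeline import document_pipeline

    router = APIRouter()

    @router.post("/documents")
    async def upload_document(
        file: UploadFile,
        user_id: str = Depends(get_current_user_id),
        db: AsyncSession = Depends(get_db),
    ):
        # HTTPExceptions raised inside the pipeline propagate as the HTTP response
        return await document_pipeline.upload(file, user_id, db)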
src/query/__init__.py
ADDED
|
File without changes
|
src/query/base.py
ADDED
|
@@ -0,0 +1,32 @@
|
| 1 |
+
"""Shared contract for query executors."""
|
| 2 |
+
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
from dataclasses import dataclass, field
|
| 5 |
+
|
| 6 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 7 |
+
|
| 8 |
+
from src.rag.base import RetrievalResult
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
|
| 12 |
+
class QueryResult:
|
| 13 |
+
source_type: str # "database" or "document"
|
| 14 |
+
source_id: str # database_client_id or document_id
|
| 15 |
+
table_or_file: str
|
| 16 |
+
columns: list[str]
|
| 17 |
+
rows: list[dict]
|
| 18 |
+
row_count: int
|
| 19 |
+
metadata: dict = field(default_factory=dict)
|
| 20 |
+
# metadata should include "column_types": {"col_name": "dtype"} when available
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class BaseExecutor(ABC):
|
| 24 |
+
@abstractmethod
|
| 25 |
+
async def execute(
|
| 26 |
+
self,
|
| 27 |
+
results: list[RetrievalResult],
|
| 28 |
+
user_id: str,
|
| 29 |
+
db: AsyncSession,
|
| 30 |
+
question: str,
|
| 31 |
+
limit: int = 100,
|
| 32 |
+
) -> list[QueryResult]: ...
|
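To make the contract concrete, a toy executor satisfying BaseExecutor (the metadata keys read here are assumptions for illustration, not part of the contract):

    class EchoExecutor(BaseExecutor):
        """Wraps each retrieval hit in an empty QueryResult, for illustration."""

        async def execute(self, results, user_id, db, question, limit=100):
            return [
                QueryResult(
                    source_type=r.source_type,
                    source_id=r.metadata.get("document_id", ""),
                    table_or_file=r.metadata.get("filename", ""),
                    columns=[],
                    rows=[],
                    row_count=0,
                    metadata={"question": question},
                )
                for r in results[:limit]
            ]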
src/query/executors/__init__.py
ADDED
|
File without changes
|
src/query/executors/db_executor.py
ADDED
|
@@ -0,0 +1,648 @@
|
| 1 |
+
"""Executor for registered database sources (source_type="database").
|
| 2 |
+
|
| 3 |
+
Flow per (client_id, question):
|
| 4 |
+
1. Collect all relevant (table_name, column_name) pairs from retrieval results.
|
| 5 |
+
2. Fetch the FULL schema for those tables from PGVector (not just top-k columns).
|
| 6 |
+
3. Build a schema context string and send to LLM → structured SQLQuery output.
|
| 7 |
+
4. Validate via sqlglot: SELECT-only, schema-grounded, LIMIT enforced.
|
| 8 |
+
5. Execute on the user's DB via engine_scope + asyncio.to_thread.
|
| 9 |
+
6. Return QueryResult per client_id (may span multiple tables via JOINs).
|
| 10 |
+
|
| 11 |
+
Supported db_types: postgres, supabase, mysql.
|
| 12 |
+
Other types are skipped with a warning — they do not raise.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import asyncio
|
| 16 |
+
from collections import defaultdict
|
| 17 |
+
from typing import Any
|
| 18 |
+
|
| 19 |
+
import sqlglot
|
| 20 |
+
import sqlglot.expressions as exp
|
| 21 |
+
import tiktoken
|
| 22 |
+
from langchain_core.prompts import ChatPromptTemplate
|
| 23 |
+
from langchain_openai import AzureChatOpenAI
|
| 24 |
+
from sqlalchemy import text
|
| 25 |
+
from sqlalchemy.ext.asyncio import AsyncSession
|
| 26 |
+
|
| 27 |
+
from src.config.settings import settings
|
| 28 |
+
from src.database_client.database_client_service import database_client_service
|
| 29 |
+
from src.db.postgres.connection import _pgvector_engine
|
| 30 |
+
from src.middlewares.logging import get_logger
|
| 31 |
+
from src.models.sql_query import SQLQuery
|
| 32 |
+
from src.pipeline.db_pipeline import db_pipeline_service
|
| 33 |
+
from src.query.base import BaseExecutor, QueryResult
|
| 34 |
+
from src.rag.base import RetrievalResult
|
| 35 |
+
from src.utils.db_credential_encryption import decrypt_credentials_dict
|
| 36 |
+
|
| 37 |
+
logger = get_logger("db_executor")
|
| 38 |
+
|
| 39 |
+
_enc = tiktoken.get_encoding("cl100k_base")
|
| 40 |
+
|
| 41 |
+
_SUPPORTED_DB_TYPES = {"postgres", "supabase", "mysql"}
|
| 42 |
+
_MAX_RETRIES = 3
|
| 43 |
+
_MAX_LIMIT = 500
|
| 44 |
+
_FK_EXPANSION_MAX_TABLES = 5
|
| 45 |
+
|
| 46 |
+
_SQL_SYSTEM_PROMPT = """\
|
| 47 |
+
You are a SQL data analyst working with a user's database.
|
| 48 |
+
Generate a single SQL SELECT statement that answers the user's question.
|
| 49 |
+
|
| 50 |
+
Database dialect: {dialect}
|
| 51 |
+
|
| 52 |
+
Rules:
|
| 53 |
+
- ONLY reference tables and columns listed in the schema below. Do not invent names.
|
| 54 |
+
- Always include a LIMIT clause (max {limit}).
|
| 55 |
+
- Do not use DELETE, UPDATE, INSERT, DROP, TRUNCATE, ALTER, CREATE, or any DDL.
|
| 56 |
+
- Prefer explicit JOINs over subqueries when combining tables.
|
| 57 |
+
- For aggregations, always alias the result column (e.g. COUNT(*) AS order_count).
|
| 58 |
+
- For date filtering, use dialect-appropriate functions ({dialect} syntax).
|
| 59 |
+
|
| 60 |
+
Schema:
|
| 61 |
+
{schema}
|
| 62 |
+
|
| 63 |
+
{error_section}"""
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
class DbExecutor(BaseExecutor):
|
| 67 |
+
def __init__(self) -> None:
|
| 68 |
+
self._llm = AzureChatOpenAI(
|
| 69 |
+
azure_deployment=settings.azureai_deployment_name_4o,
|
| 70 |
+
openai_api_version=settings.azureai_api_version_4o,
|
| 71 |
+
azure_endpoint=settings.azureai_endpoint_url_4o,
|
| 72 |
+
api_key=settings.azureai_api_key_4o,
|
| 73 |
+
temperature=0,
|
| 74 |
+
)
|
| 75 |
+
self._prompt = ChatPromptTemplate.from_messages([
|
| 76 |
+
("system", _SQL_SYSTEM_PROMPT),
|
| 77 |
+
("human", "{question}"),
|
| 78 |
+
])
|
| 79 |
+
self._chain = self._prompt | self._llm.with_structured_output(SQLQuery)
|
| 80 |
+
|
| 81 |
+
# ------------------------------------------------------------------
|
| 82 |
+
# Public interface
|
| 83 |
+
# ------------------------------------------------------------------
|
| 84 |
+
|
| 85 |
+
async def execute(
|
| 86 |
+
self,
|
| 87 |
+
results: list[RetrievalResult],
|
| 88 |
+
user_id: str,
|
| 89 |
+
db: AsyncSession,
|
| 90 |
+
question: str,
|
| 91 |
+
limit: int = 100,
|
| 92 |
+
) -> list[QueryResult]:
|
| 93 |
+
db_results = [r for r in results if r.source_type == "database"]
|
| 94 |
+
if not db_results:
|
| 95 |
+
return []
|
| 96 |
+
|
| 97 |
+
# Group by client_id — one SQL generation + execution pass per client
|
| 98 |
+
by_client: dict[str, list[RetrievalResult]] = defaultdict(list)
|
| 99 |
+
for r in db_results:
|
| 100 |
+
client_id = r.metadata.get("database_client_id", "")
|
| 101 |
+
if client_id:
|
| 102 |
+
by_client[client_id].append(r)
|
| 103 |
+
else:
|
| 104 |
+
logger.warning("db result missing database_client_id, skipping")
|
| 105 |
+
|
| 106 |
+
query_results: list[QueryResult] = []
|
| 107 |
+
for client_id, client_results in by_client.items():
|
| 108 |
+
try:
|
| 109 |
+
qr = await self._execute_for_client(client_id, client_results, user_id, db, question, limit)
|
| 110 |
+
if qr:
|
| 111 |
+
query_results.append(qr)
|
| 112 |
+
except Exception as e:
|
| 113 |
+
logger.error("db executor failed for client", client_id=client_id, error=str(e))
|
| 114 |
+
|
| 115 |
+
return query_results
|
| 116 |
+
|
| 117 |
+
# ------------------------------------------------------------------
|
| 118 |
+
# Per-client execution
|
| 119 |
+
# ------------------------------------------------------------------
|
| 120 |
+
|
| 121 |
+
async def _execute_for_client(
|
| 122 |
+
self,
|
| 123 |
+
client_id: str,
|
| 124 |
+
results: list[RetrievalResult],
|
| 125 |
+
user_id: str,
|
| 126 |
+
db: AsyncSession,
|
| 127 |
+
question: str,
|
| 128 |
+
limit: int,
|
| 129 |
+
) -> QueryResult | None:
|
| 130 |
+
client = await database_client_service.get(db, client_id)
|
| 131 |
+
if not client:
|
| 132 |
+
logger.warning("database client not found", client_id=client_id)
|
| 133 |
+
return None
|
| 134 |
+
if client.user_id != user_id:
|
| 135 |
+
logger.warning("client ownership mismatch", client_id=client_id)
|
| 136 |
+
return None
|
| 137 |
+
if client.db_type not in _SUPPORTED_DB_TYPES:
|
| 138 |
+
logger.warning("unsupported db_type for query execution", db_type=client.db_type)
|
| 139 |
+
return None
|
| 140 |
+
|
| 141 |
+
# Hit tables = tables retrieval pointed at directly. Get full per-column
|
| 142 |
+
# schema for these. Related tables (one FK hop away, both directions) are
|
| 143 |
+
# fetched separately in abbreviated form to give the LLM enough context
|
| 144 |
+
# to JOIN without paying the per-column profile token cost.
|
| 145 |
+
hit_tables = list({
|
| 146 |
+
r.metadata.get("data", {}).get("table_name")
|
| 147 |
+
for r in results
|
| 148 |
+
if r.metadata.get("data", {}).get("table_name")
|
| 149 |
+
})
|
| 150 |
+
if not hit_tables:
|
| 151 |
+
logger.warning("no table_name on any retrieval result", client_id=client_id)
|
| 152 |
+
return None
|
| 153 |
+
|
| 154 |
+
full_schema = await self._fetch_full_schema(client_id, hit_tables, user_id)
|
| 155 |
+
if not full_schema:
|
| 156 |
+
logger.warning("no schema found in vector store", client_id=client_id, tables=hit_tables)
|
| 157 |
+
return None
|
| 158 |
+
|
| 159 |
+
related_tables = await self._find_related_tables(client_id, user_id, hit_tables)
|
| 160 |
+
related_schema = (
|
| 161 |
+
await self._fetch_abbreviated_schema(client_id, user_id, related_tables)
|
| 162 |
+
if related_tables else {}
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
schema_ctx = self._build_schema_context(full_schema, related_schema)
|
| 166 |
+
capped_limit = min(limit, _MAX_LIMIT)
|
| 167 |
+
dialect = client.db_type
|
| 168 |
+
|
| 169 |
+
# SQL generation with retry
|
| 170 |
+
validated_sql: str | None = None
|
| 171 |
+
prev_error: str = ""
|
| 172 |
+
prev_reasoning: str = ""
|
| 173 |
+
for attempt in range(_MAX_RETRIES):
|
| 174 |
+
if prev_error:
|
| 175 |
+
error_section = (
|
| 176 |
+
f"Previous attempt reasoning: {prev_reasoning}\n"
|
| 177 |
+
f"Previous attempt failed: {prev_error}\n"
|
| 178 |
+
"Fix the issue above."
|
| 179 |
+
)
|
| 180 |
+
else:
|
| 181 |
+
error_section = ""
|
| 182 |
+
try:
|
| 183 |
+
prompt_text = schema_ctx + error_section + question  # rough token estimate only; excludes the static prompt scaffolding
|
| 184 |
+
input_tokens = len(_enc.encode(prompt_text))
|
| 185 |
+
logger.info("sql generation input tokens", attempt=attempt + 1, tokens=input_tokens)
|
| 186 |
+
|
| 187 |
+
result: SQLQuery = await self._chain.ainvoke({
|
| 188 |
+
"schema": schema_ctx,
|
| 189 |
+
"dialect": dialect,
|
| 190 |
+
"limit": capped_limit,
|
| 191 |
+
"error_section": error_section,
|
| 192 |
+
"question": question,
|
| 193 |
+
})
|
| 194 |
+
sql = result.sql.strip()
|
| 195 |
+
allowed_tables = set(full_schema) | set(related_schema)
|
| 196 |
+
column_map: dict[str, set[str]] = {
|
| 197 |
+
t: {c["name"] for c in cols} for t, cols in full_schema.items()
|
| 198 |
+
}
|
| 199 |
+
for t, info in related_schema.items():
|
| 200 |
+
column_map[t] = set(info.get("column_names") or [])
|
| 201 |
+
validation_error = self._validate(sql, allowed_tables, capped_limit, column_map)
|
| 202 |
+
if validation_error:
|
| 203 |
+
prev_error = validation_error
|
| 204 |
+
prev_reasoning = result.reasoning
|
| 205 |
+
logger.warning("sql validation failed", attempt=attempt + 1, error=validation_error)
|
| 206 |
+
continue
|
| 207 |
+
validated_sql = self._enforce_limit(sql, capped_limit)
|
| 208 |
+
output_tokens = len(_enc.encode(result.sql)) + len(_enc.encode(result.reasoning))
|
| 209 |
+
logger.info(
|
| 210 |
+
"sql generated",
|
| 211 |
+
attempt=attempt + 1,
|
| 212 |
+
input_tokens=input_tokens,
|
| 213 |
+
output_tokens=output_tokens,
|
| 214 |
+
total_tokens=input_tokens + output_tokens,
|
| 215 |
+
reasoning=result.reasoning,
|
| 216 |
+
)
|
| 217 |
+
break
|
| 218 |
+
except Exception as e:
|
| 219 |
+
prev_error = str(e)
|
| 220 |
+
logger.warning("sql generation error", attempt=attempt + 1, error=prev_error)
|
| 221 |
+
|
| 222 |
+
if not validated_sql:
|
| 223 |
+
logger.error("sql generation failed after retries", client_id=client_id)
|
| 224 |
+
return None
|
| 225 |
+
|
| 226 |
+
# Execute on user's DB
|
| 227 |
+
creds = decrypt_credentials_dict(client.credentials)
|
| 228 |
+
with db_pipeline_service.engine_scope(client.db_type, creds) as engine:
|
| 229 |
+
rows = await asyncio.to_thread(self._run_sql, engine, validated_sql)
|
| 230 |
+
|
| 231 |
+
column_types = {
|
| 232 |
+
col["name"]: col["type"]
|
| 233 |
+
for cols in full_schema.values()
|
| 234 |
+
for col in cols
|
| 235 |
+
}
|
| 236 |
+
columns = list(rows[0].keys()) if rows else []
|
| 237 |
+
|
| 238 |
+
return QueryResult(
|
| 239 |
+
source_type="database",
|
| 240 |
+
source_id=client_id,
|
| 241 |
+
table_or_file=", ".join(hit_tables),
|
| 242 |
+
columns=columns,
|
| 243 |
+
rows=rows,
|
| 244 |
+
row_count=len(rows),
|
| 245 |
+
metadata={
|
| 246 |
+
"db_type": client.db_type,
|
| 247 |
+
"client_name": client.name,
|
| 248 |
+
"sql": validated_sql,
|
| 249 |
+
"column_types": {c: column_types.get(c, "unknown") for c in columns},
|
| 250 |
+
},
|
| 251 |
+
)
|
| 252 |
+
|
| 253 |
+
# ------------------------------------------------------------------
|
| 254 |
+
# Schema helpers
|
| 255 |
+
# ------------------------------------------------------------------
|
| 256 |
+
|
| 257 |
+
async def _find_related_tables(
|
| 258 |
+
self,
|
| 259 |
+
client_id: str,
|
| 260 |
+
user_id: str,
|
| 261 |
+
hit_tables: list[str],
|
| 262 |
+
) -> list[str]:
|
| 263 |
+
"""One-hop FK neighbours of `hit_tables`, both directions, excluding hits.
|
| 264 |
+
|
| 265 |
+
Prefers chunk_level='table' rows; if none exist for the client (legacy
|
| 266 |
+
ingest predating Phase 1), falls back to aggregating from column-chunk
|
| 267 |
+
metadata. Returns [] when no FK metadata is available.
|
| 268 |
+
|
| 269 |
+
Capped at _FK_EXPANSION_MAX_TABLES, ranked by edge count desc then
|
| 270 |
+
table name asc. A warning is logged when the cap kicks in.
|
| 271 |
+
"""
|
| 272 |
+
if not hit_tables:
|
| 273 |
+
return []
|
| 274 |
+
|
| 275 |
+
hit_set = set(hit_tables)
|
| 276 |
+
# edge_counts[related_table] = number of FK edges connecting it to the hit set
|
| 277 |
+
edge_counts: dict[str, int] = defaultdict(int)
|
| 278 |
+
|
| 279 |
+
# ---- Primary path: table-level chunks ----
|
| 280 |
+
sql = text("""
|
| 281 |
+
SELECT lpe.cmetadata
|
| 282 |
+
FROM langchain_pg_embedding lpe
|
| 283 |
+
JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
|
| 284 |
+
WHERE lpc.name = 'document_embeddings'
|
| 285 |
+
AND lpe.cmetadata->>'user_id' = :user_id
|
| 286 |
+
AND lpe.cmetadata->>'source_type' = 'database'
|
| 287 |
+
AND lpe.cmetadata->>'database_client_id' = :client_id
|
| 288 |
+
AND lpe.cmetadata->>'chunk_level' = 'table'
|
| 289 |
+
""")
|
| 290 |
+
async with _pgvector_engine.connect() as conn:
|
| 291 |
+
result = await conn.execute(sql, {"user_id": user_id, "client_id": client_id})
|
| 292 |
+
table_rows = result.fetchall()
|
| 293 |
+
|
| 294 |
+
if table_rows:
|
| 295 |
+
for row in table_rows:
|
| 296 |
+
data = row.cmetadata.get("data", {})
|
| 297 |
+
table = data.get("table_name")
|
| 298 |
+
fks = data.get("foreign_keys") or []
|
| 299 |
+
if not table:
|
| 300 |
+
continue
|
| 301 |
+
if table in hit_set:
|
| 302 |
+
# Outgoing: this hit's FKs point at related tables
|
| 303 |
+
for fk in fks:
|
| 304 |
+
target = fk.get("target_table")
|
| 305 |
+
if target and target not in hit_set:
|
| 306 |
+
edge_counts[target] += 1
|
| 307 |
+
else:
|
| 308 |
+
# Incoming: this non-hit table's FKs point into the hit set
|
| 309 |
+
for fk in fks:
|
| 310 |
+
target = fk.get("target_table")
|
| 311 |
+
if target in hit_set:
|
| 312 |
+
edge_counts[table] += 1
|
| 313 |
+
else:
|
| 314 |
+
# ---- Fallback: aggregate from column chunks ----
|
| 315 |
+
sql = text("""
|
| 316 |
+
SELECT lpe.cmetadata->'data'->>'table_name' AS src_table,
|
| 317 |
+
lpe.cmetadata->'data'->>'foreign_key' AS fk
|
| 318 |
+
FROM langchain_pg_embedding lpe
|
| 319 |
+
JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
|
| 320 |
+
WHERE lpc.name = 'document_embeddings'
|
| 321 |
+
AND lpe.cmetadata->>'user_id' = :user_id
|
| 322 |
+
AND lpe.cmetadata->>'source_type' = 'database'
|
| 323 |
+
AND lpe.cmetadata->>'database_client_id' = :client_id
|
| 324 |
+
AND lpe.cmetadata->>'chunk_level' = 'column'
|
| 325 |
+
AND lpe.cmetadata->'data'->>'foreign_key' IS NOT NULL
|
| 326 |
+
""")
|
| 327 |
+
async with _pgvector_engine.connect() as conn:
|
| 328 |
+
result = await conn.execute(sql, {"user_id": user_id, "client_id": client_id})
|
| 329 |
+
col_rows = result.fetchall()
|
| 330 |
+
|
| 331 |
+
for row in col_rows:
|
| 332 |
+
src = row.src_table
|
| 333 |
+
fk = row.fk
|
| 334 |
+
if not src or not fk:
|
| 335 |
+
continue
|
| 336 |
+
target = fk.split(".", 1)[0]
|
| 337 |
+
if src in hit_set and target and target not in hit_set:
|
| 338 |
+
edge_counts[target] += 1
|
| 339 |
+
elif src not in hit_set and target in hit_set:
|
| 340 |
+
edge_counts[src] += 1
|
| 341 |
+
|
| 342 |
+
if not edge_counts:
|
| 343 |
+
return []
|
| 344 |
+
|
| 345 |
+
ranked = sorted(edge_counts.items(), key=lambda kv: (-kv[1], kv[0]))
|
| 346 |
+
if len(ranked) > _FK_EXPANSION_MAX_TABLES:
|
| 347 |
+
logger.warning(
|
| 348 |
+
"fk expansion cap hit",
|
| 349 |
+
client_id=client_id,
|
| 350 |
+
total=len(ranked),
|
| 351 |
+
cap=_FK_EXPANSION_MAX_TABLES,
|
| 352 |
+
dropped=[t for t, _ in ranked[_FK_EXPANSION_MAX_TABLES:]],
|
| 353 |
+
)
|
| 354 |
+
ranked = ranked[:_FK_EXPANSION_MAX_TABLES]
|
| 355 |
+
|
| 356 |
+
related = [t for t, _ in ranked]
|
| 357 |
+
logger.info("fk-related tables", hit=sorted(hit_set), related=related)
|
| 358 |
+
return related
|
| 359 |
+
|
| 360 |
+
async def _fetch_abbreviated_schema(
|
| 361 |
+
self,
|
| 362 |
+
client_id: str,
|
| 363 |
+
user_id: str,
|
| 364 |
+
table_names: list[str],
|
| 365 |
+
) -> dict[str, dict[str, Any]]:
|
| 366 |
+
"""Abbreviated schema: name, row_count, PK, FKs, column names — no profiles.
|
| 367 |
+
|
| 368 |
+
Prefers chunk_level='table' rows. Falls back to aggregating column-chunk
|
| 369 |
+
metadata when table chunks are missing for a given table_name.
|
| 370 |
+
|
| 371 |
+
Returns {table_name: {"row_count": int|None, "primary_key": [str],
|
| 372 |
+
"foreign_keys": [{column, target_table, target_column}],
|
| 373 |
+
"column_names": [str]}}.
|
| 374 |
+
"""
|
| 375 |
+
if not table_names:
|
| 376 |
+
return {}
|
| 377 |
+
|
| 378 |
+
placeholders = ", ".join(f":t{i}" for i in range(len(table_names)))
|
| 379 |
+
params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
|
| 380 |
+
for i, name in enumerate(table_names):
|
| 381 |
+
params[f"t{i}"] = name
|
| 382 |
+
|
| 383 |
+
# Primary path: one row per table from chunk_level='table'
|
| 384 |
+
sql_table = text(f"""
|
| 385 |
+
SELECT lpe.cmetadata
|
| 386 |
+
FROM langchain_pg_embedding lpe
|
| 387 |
+
JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
|
| 388 |
+
WHERE lpc.name = 'document_embeddings'
|
| 389 |
+
AND lpe.cmetadata->>'user_id' = :user_id
|
| 390 |
+
AND lpe.cmetadata->>'source_type' = 'database'
|
| 391 |
+
AND lpe.cmetadata->>'database_client_id' = :client_id
|
| 392 |
+
AND lpe.cmetadata->>'chunk_level' = 'table'
|
| 393 |
+
AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
|
| 394 |
+
""")
|
| 395 |
+
async with _pgvector_engine.connect() as conn:
|
| 396 |
+
result = await conn.execute(sql_table, params)
|
| 397 |
+
t_rows = result.fetchall()
|
| 398 |
+
|
| 399 |
+
out: dict[str, dict[str, Any]] = {}
|
| 400 |
+
for row in t_rows:
|
| 401 |
+
data = row.cmetadata.get("data", {})
|
| 402 |
+
tname = data.get("table_name")
|
| 403 |
+
if not tname:
|
| 404 |
+
continue
|
| 405 |
+
out[tname] = {
|
| 406 |
+
"row_count": data.get("row_count"),
|
| 407 |
+
"primary_key": list(data.get("primary_key") or []),
|
| 408 |
+
"foreign_keys": list(data.get("foreign_keys") or []),
|
| 409 |
+
"column_names": list(data.get("column_names") or []),
|
| 410 |
+
}
|
| 411 |
+
|
| 412 |
+
# Fallback for tables with no table-chunk: aggregate column chunks
|
| 413 |
+
missing = [t for t in table_names if t not in out]
|
| 414 |
+
if missing:
|
| 415 |
+
placeholders_m = ", ".join(f":m{i}" for i in range(len(missing)))
|
| 416 |
+
params_m: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
|
| 417 |
+
for i, name in enumerate(missing):
|
| 418 |
+
params_m[f"m{i}"] = name
|
| 419 |
+
sql_col = text(f"""
|
| 420 |
+
SELECT lpe.cmetadata
|
| 421 |
+
FROM langchain_pg_embedding lpe
|
| 422 |
+
JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
|
| 423 |
+
WHERE lpc.name = 'document_embeddings'
|
| 424 |
+
AND lpe.cmetadata->>'user_id' = :user_id
|
| 425 |
+
AND lpe.cmetadata->>'source_type' = 'database'
|
| 426 |
+
AND lpe.cmetadata->>'database_client_id' = :client_id
|
| 427 |
+
AND lpe.cmetadata->>'chunk_level' = 'column'
|
| 428 |
+
AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders_m})
|
| 429 |
+
ORDER BY lpe.cmetadata->'data'->>'table_name', lpe.cmetadata->'data'->>'column_name'
|
| 430 |
+
""")
|
| 431 |
+
async with _pgvector_engine.connect() as conn:
|
| 432 |
+
result = await conn.execute(sql_col, params_m)
|
| 433 |
+
c_rows = result.fetchall()
|
| 434 |
+
|
| 435 |
+
agg: dict[str, dict[str, Any]] = {
|
| 436 |
+
t: {"row_count": None, "primary_key": [], "foreign_keys": [], "column_names": []}
|
| 437 |
+
for t in missing
|
| 438 |
+
}
|
| 439 |
+
for row in c_rows:
|
| 440 |
+
data = row.cmetadata.get("data", {})
|
| 441 |
+
tname = data.get("table_name")
|
| 442 |
+
cname = data.get("column_name")
|
| 443 |
+
if not tname or tname not in agg or not cname:
|
| 444 |
+
continue
|
| 445 |
+
bucket = agg[tname]
|
| 446 |
+
bucket["column_names"].append(cname)
|
| 447 |
+
if data.get("is_primary_key"):
|
| 448 |
+
bucket["primary_key"].append(cname)
|
| 449 |
+
fk = data.get("foreign_key")
|
| 450 |
+
if fk:
|
| 451 |
+
target_table, _, target_col = fk.partition(".")
|
| 452 |
+
bucket["foreign_keys"].append({
|
| 453 |
+
"column": cname,
|
| 454 |
+
"target_table": target_table,
|
| 455 |
+
"target_column": target_col,
|
| 456 |
+
})
|
| 457 |
+
for t, v in agg.items():
|
| 458 |
+
if v["column_names"]:
|
| 459 |
+
out[t] = v
|
| 460 |
+
|
| 461 |
+
return out
|
| 462 |
+
|
| 463 |
+
async def _fetch_full_schema(
|
| 464 |
+
self,
|
| 465 |
+
client_id: str,
|
| 466 |
+
table_names: list[str],
|
| 467 |
+
user_id: str,
|
| 468 |
+
) -> dict[str, list[dict[str, Any]]]:
|
| 469 |
+
"""Fetch ALL column chunks for the given tables from PGVector.
|
| 470 |
+
|
| 471 |
+
Returns {table_name: [{"name": ..., "type": ..., "is_primary_key": ...,
|
| 472 |
+
"foreign_key": ..., "content": ...}]}
|
| 473 |
+
"""
|
| 474 |
+
placeholders = ", ".join(f":t{i}" for i in range(len(table_names)))
|
| 475 |
+
sql = text(f"""
|
| 476 |
+
SELECT lpe.cmetadata, lpe.document
|
| 477 |
+
FROM langchain_pg_embedding lpe
|
| 478 |
+
JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
|
| 479 |
+
WHERE lpc.name = 'document_embeddings'
|
| 480 |
+
AND lpe.cmetadata->>'user_id' = :user_id
|
| 481 |
+
AND lpe.cmetadata->>'source_type' = 'database'
|
| 482 |
+
AND lpe.cmetadata->>'chunk_level' = 'column'
|
| 483 |
+
AND lpe.cmetadata->>'database_client_id' = :client_id
|
| 484 |
+
AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
|
| 485 |
+
ORDER BY lpe.cmetadata->'data'->>'table_name', lpe.cmetadata->'data'->>'column_name'
|
| 486 |
+
""")
|
| 487 |
+
|
| 488 |
+
params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
|
| 489 |
+
for i, name in enumerate(table_names):
|
| 490 |
+
params[f"t{i}"] = name
|
| 491 |
+
|
| 492 |
+
async with _pgvector_engine.connect() as conn:
|
| 493 |
+
result = await conn.execute(sql, params)
|
| 494 |
+
rows = result.fetchall()
|
| 495 |
+
|
| 496 |
+
schema: dict[str, list[dict[str, Any]]] = defaultdict(list)
|
| 497 |
+
for row in rows:
|
| 498 |
+
data = row.cmetadata.get("data", {})
|
| 499 |
+
table = data.get("table_name")
|
| 500 |
+
if table:
|
| 501 |
+
schema[table].append({
|
| 502 |
+
"name": data.get("column_name", ""),
|
| 503 |
+
"type": data.get("column_type", ""),
|
| 504 |
+
"is_primary_key": data.get("is_primary_key", False),
|
| 505 |
+
"foreign_key": data.get("foreign_key"),
|
| 506 |
+
"content": row.document, # chunk text includes top values / samples
|
| 507 |
+
})
|
| 508 |
+
return dict(schema)
|
| 509 |
+
|
| 510 |
+
def _build_schema_context(
|
| 511 |
+
self,
|
| 512 |
+
schema: dict[str, list[dict[str, Any]]],
|
| 513 |
+
related_schema: dict[str, dict[str, Any]] | None = None,
|
| 514 |
+
) -> str:
|
| 515 |
+
lines: list[str] = []
|
| 516 |
+
for table, columns in schema.items():
|
| 517 |
+
lines.append(f"Table: {table}")
|
| 518 |
+
for col in columns:
|
| 519 |
+
flags = []
|
| 520 |
+
if col["is_primary_key"]:
|
| 521 |
+
flags.append("PRIMARY KEY")
|
| 522 |
+
if col["foreign_key"]:
|
| 523 |
+
flags.append(f"FK -> {col['foreign_key']}")
|
| 524 |
+
flag_str = f" [{', '.join(flags)}]" if flags else ""
|
| 525 |
+
lines.append(f" - {col['name']} {col['type']}{flag_str}")
|
| 526 |
+
# Include sample/top-values line from chunk content if present
|
| 527 |
+
for line in col["content"].splitlines():
|
| 528 |
+
if line.startswith(("Top values:", "Sample values:")):
|
| 529 |
+
lines.append(f" {line}")
|
| 530 |
+
break
|
| 531 |
+
lines.append("")
|
| 532 |
+
|
| 533 |
+
related_block = self._build_related_schema_block(related_schema or {})
|
| 534 |
+
if related_block:
|
| 535 |
+
lines.append(related_block)
|
| 536 |
+
|
| 537 |
+
return "\n".join(lines).strip()
|
| 538 |
+
|
| 539 |
+
def _build_related_schema_block(self, related_schema: dict[str, dict[str, Any]]) -> str:
|
| 540 |
+
"""Format the abbreviated FK-related-tables section. Empty string when no related."""
|
| 541 |
+
if not related_schema:
|
| 542 |
+
return ""
|
| 543 |
+
lines: list[str] = ["Related tables (one hop via FK, abbreviated — use for JOINs only):"]
|
| 544 |
+
for table, info in related_schema.items():
|
| 545 |
+
row_count = info.get("row_count")
|
| 546 |
+
header = f"- {table} ({row_count} rows)" if row_count is not None else f"- {table}"
|
| 547 |
+
lines.append(header)
|
| 548 |
+
pk = info.get("primary_key") or []
|
| 549 |
+
lines.append(f" Primary key: {', '.join(pk) if pk else '(none)'}")
|
| 550 |
+
fks = info.get("foreign_keys") or []
|
| 551 |
+
if fks:
|
| 552 |
+
fk_strs = [
|
| 553 |
+
f"{fk.get('column')} -> {fk.get('target_table')}.{fk.get('target_column')}"
|
| 554 |
+
for fk in fks
|
| 555 |
+
]
|
| 556 |
+
lines.append(f" Foreign keys: {', '.join(fk_strs)}")
|
| 557 |
+
else:
|
| 558 |
+
lines.append(" Foreign keys: (none)")
|
| 559 |
+
cols = info.get("column_names") or []
|
| 560 |
+
lines.append(f" Columns: {', '.join(cols)}")
|
| 561 |
+
return "\n".join(lines)
|
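For a hypothetical related table, this block renders as (header text quoted verbatim from the code above):

    Related tables (one hop via FK, abbreviated — use for JOINs only):
    - customers (812 rows)
      Primary key: id
      Foreign keys: (none)
      Columns: id, name, email, created_at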
| 562 |
+
|
| 563 |
+
# ------------------------------------------------------------------
|
| 564 |
+
# Guardrails
|
| 565 |
+
# ------------------------------------------------------------------
|
| 566 |
+
|
| 567 |
+
def _validate(
|
| 568 |
+
self,
|
| 569 |
+
sql: str,
|
| 570 |
+
allowed_tables: set[str],
|
| 571 |
+
limit: int,
|
| 572 |
+
column_map: dict[str, set[str]] | None = None,
|
| 573 |
+
) -> str:
|
| 574 |
+
"""Return an error string if validation fails, empty string if OK.
|
| 575 |
+
|
| 576 |
+
`allowed_tables` is the union of hit-table names and FK-related table
|
| 577 |
+
names — both are legal targets for SELECT/JOIN.
|
| 578 |
+
|
| 579 |
+
`column_map` maps table_name → set of valid column names. When provided,
|
| 580 |
+
any qualified table.column reference not found in the map triggers a retry
|
| 581 |
+
with an informative error so the LLM can self-correct without hallucinating.
|
| 582 |
+
"""
|
| 583 |
+
# Layer 1: sqlglot parse + SELECT-only check
|
| 584 |
+
try:
|
| 585 |
+
parsed = sqlglot.parse_one(sql)
|
| 586 |
+
except sqlglot.errors.ParseError as e:
|
| 587 |
+
return f"SQL parse error: {e}"
|
| 588 |
+
|
| 589 |
+
if not isinstance(parsed, exp.Select):
|
| 590 |
+
return f"Only SELECT statements are allowed. Got: {type(parsed).__name__}"
|
| 591 |
+
|
| 592 |
+
# Check for DML anywhere in the AST (including writeable CTEs)
|
| 593 |
+
for node in parsed.find_all((exp.Insert, exp.Update, exp.Delete)):
|
| 594 |
+
return f"DML ({type(node).__name__}) is not allowed."
|
| 595 |
+
|
| 596 |
+
# Layer 2: schema grounding — table names
|
| 597 |
+
known_tables = {t.lower() for t in allowed_tables}
|
| 598 |
+
alias_to_table: dict[str, str] = {}
|
| 599 |
+
for tbl in parsed.find_all(exp.Table):
|
| 600 |
+
name = tbl.name.lower()
|
| 601 |
+
if name and name not in known_tables:
|
| 602 |
+
return f"Unknown table '{tbl.name}'. Only use tables from the schema."
|
| 603 |
+
alias = (tbl.alias or tbl.name).lower()
|
| 604 |
+
alias_to_table[alias] = name
|
| 605 |
+
|
| 606 |
+
# Layer 3: column grounding — qualified references only (table.column)
|
| 607 |
+
if column_map:
|
| 608 |
+
normalized_map = {t.lower(): {c.lower() for c in cols} for t, cols in column_map.items()}
|
| 609 |
+
for col_node in parsed.find_all(exp.Column):
|
| 610 |
+
tbl_ref = col_node.table
|
| 611 |
+
if not tbl_ref:
|
| 612 |
+
continue # unqualified — skip, can't resolve without full alias tracking
|
| 613 |
+
tbl_name = alias_to_table.get(tbl_ref.lower(), tbl_ref.lower())
|
| 614 |
+
col_name = col_node.name.lower()
|
| 615 |
+
if tbl_name in normalized_map and col_name not in normalized_map[tbl_name]:
|
| 616 |
+
available = ", ".join(sorted(normalized_map[tbl_name]))
|
| 617 |
+
return (
|
| 618 |
+
f"Column '{col_node.name}' does not exist on table '{tbl_name}'. "
|
| 619 |
+
f"Available columns: {available}."
|
| 620 |
+
)
|
| 621 |
+
|
| 622 |
+
# Layer 4: LIMIT enforcement (inject if missing — done before execution)
|
| 623 |
+
return ""
|
| 624 |
+
|
| 625 |
+
# ------------------------------------------------------------------
|
| 626 |
+
# SQL execution
|
| 627 |
+
# ------------------------------------------------------------------
|
| 628 |
+
|
| 629 |
+
def _enforce_limit(self, sql: str, limit: int) -> str:
|
| 630 |
+
"""Inject or cap LIMIT using sqlglot AST manipulation."""
|
| 631 |
+
parsed = sqlglot.parse_one(sql)
|
| 632 |
+
existing = parsed.find(exp.Limit)
|
| 633 |
+
if existing:
|
| 634 |
+
current = int(existing.expression.this)
|
| 635 |
+
if current > limit:
|
| 636 |
+
return parsed.limit(limit).sql()
|
| 637 |
+
else:
|
| 638 |
+
return parsed.limit(limit).sql()
|
| 639 |
+
return parsed.sql()
|
| 640 |
+
|
| 641 |
+
def _run_sql(self, engine: Any, sql: str) -> list[dict]:
|
| 642 |
+
# Ensure the user DB connection is a read-only credential — sqlglot validation alone is not sufficient.
|
| 643 |
+
with engine.connect() as conn:
|
| 644 |
+
result = conn.execute(text(sql))
|
| 645 |
+
return [dict(row) for row in result.mappings()]
|
| 646 |
+
|
| 647 |
+
|
| 648 |
+
db_executor = DbExecutor()
|
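A sketch of the guardrails in action, assuming the small hypothetical schema below; the commented return values reflect the error strings built in _validate and the AST rewrite in _enforce_limit.

    tables = {"orders", "customers"}
    cols = {"orders": {"id", "amount"}, "customers": {"id", "name"}}

    db_executor._validate("DELETE FROM orders", tables, 100, cols)
    # "Only SELECT statements are allowed. Got: Delete"

    db_executor._validate("SELECT * FROM invoices", tables, 100, cols)
    # "Unknown table 'invoices'. Only use tables from the schema."

    db_executor._validate("SELECT o.total FROM orders o", tables, 100, cols)
    # "Column 'total' does not exist on table 'orders'. Available columns: amount, id."

    db_executor._enforce_limit("SELECT id FROM orders", 100)
    # "SELECT id FROM orders LIMIT 100"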
src/query/executors/tabular.py
ADDED
|
@@ -0,0 +1,287 @@
|
+"""Executor for tabular document sources (source_type="document", file_type csv/xlsx).
+
+Flow:
+1. Group RetrievalResult chunks by (document_id, sheet_name).
+2. Per group: download Parquet from Azure Blob → pandas DataFrame.
+3. Build schema context from DataFrame columns + sample values.
+4. LLM decides operation (groupby_sum, filter, top_n, etc.) via structured output.
+5. Pandas runs the operation; retry up to 3x on error with feedback to LLM.
+6. Fallback to raw rows if all retries fail.
+7. Return QueryResult per group.
+"""
+import asyncio
+from typing import Literal, TypedDict
+
+import pandas as pd
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_openai import AzureChatOpenAI
+from pydantic import BaseModel
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from src.config.settings import settings
+from src.knowledge.parquet_service import download_parquet
+from src.middlewares.logging import get_logger
+from src.query.base import BaseExecutor, QueryResult
+from src.rag.base import RetrievalResult
+
+logger = get_logger("tabular_executor")
+
+
+class _GroupInfo(TypedDict):
+    filename: str
+    file_type: str
+
+
+_TABULAR_FILE_TYPES = ("csv", "xlsx")
+_MAX_RETRIES = 3
+
+_SYSTEM_PROMPT = """\
+You are a data analyst. Given a DataFrame schema and a user question, \
+decide which pandas operation to perform.
+
+IMPORTANT rules:
+- Use ONLY the exact column names as written in the schema below. Never translate or rename them.
+- For top_n: always set value_col to the column to sort by. Do NOT use sort_col for top_n.
+- For sort: use sort_col for the column to sort by.
+- For filter with comparison (>, <, >=, <=, !=): set filter_operator accordingly (gt, lt, gte, lte, ne). Default is eq (==).
+- For multi-condition filters (AND logic), use the filters field as a list of {{"col", "value", "op"}} dicts instead of filter_col/filter_value.
+  Example: status=SUCCESS AND amount_paid>200000 → filters=[{{"col":"status","value":"SUCCESS","op":"eq"}},{{"col":"amount_paid","value":"200000","op":"gt"}}]
+- For OR conditions on a column (e.g. value is A or B), use or_filters. Combine with filters for mixed AND+OR logic.
+  Example: (status=FAILED OR status=REVERSED) AND payment_channel=X → or_filters=[{{"col":"status","value":"FAILED","op":"eq"}},{{"col":"status","value":"REVERSED","op":"eq"}}], filters=[{{"col":"payment_channel","value":"X","op":"eq"}}]
+- For groupby with a pre-filter (e.g. count SUCCESS per channel): use filters or or_filters to narrow rows first, then use groupby_count/groupby_sum/groupby_avg on the filtered data by setting both filters and group_col.
+
+Schema:
+{schema}
+
+{error_section}"""
+
+
+class TabularOperation(BaseModel):
+    operation: Literal[
+        "filter", "groupby_sum", "groupby_avg", "groupby_count",
+        "top_n", "sort", "aggregate", "raw"
+    ]
+    group_col: str | None = None  # for groupby_*
+    value_col: str | None = None  # for groupby_*, top_n, aggregate
+    filter_col: str | None = None  # for single filter
+    filter_value: str | None = None  # for single filter
+    filter_operator: Literal["eq", "ne", "gt", "gte", "lt", "lte"] = "eq"  # for single filter
+    filters: list[dict] | None = None  # for multi-condition AND: [{"col": ..., "value": ..., "op": ...}]
+    or_filters: list[dict] | None = None  # for OR conditions, applied before AND filters
+    sort_col: str | None = None  # for sort
+    ascending: bool = True  # for sort
+    n: int | None = None  # for top_n
+    agg_func: Literal["sum", "avg", "min", "max", "count"] | None = None  # for aggregate
+    reasoning: str
+
+
+def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.Series:
+    numeric = pd.to_numeric(df[col], errors="coerce")
+    if operator == "eq":
+        return df[col].astype(str) == str(value)
+    elif operator == "ne":
+        return df[col].astype(str) != str(value)
+    elif operator == "gt":
+        return numeric > float(value)
+    elif operator == "gte":
+        return numeric >= float(value)
+    elif operator == "lt":
+        return numeric < float(value)
+    elif operator == "lte":
+        return numeric <= float(value)
+    raise ValueError(f"Unknown operator: {operator}")
+
+
+def _apply_single_filter(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.DataFrame:
+    return df[_get_filter_mask(df, col, value, operator)]
+
+
+def _build_schema_context(df: pd.DataFrame) -> str:
+    lines = []
+    for col in df.columns:
+        sample = df[col].dropna().head(3).tolist()
+        lines.append(f"- {col} ({df[col].dtype}): sample values: {sample}")
+    return "\n".join(lines)
+
+
+def _apply_operation(df: pd.DataFrame, op: TabularOperation, limit: int) -> pd.DataFrame:
+    if op.operation == "groupby_sum":
+        if not op.group_col or not op.value_col:
+            raise ValueError(f"groupby_sum requires group_col and value_col, got {op}")
+        return df.groupby(op.group_col)[op.value_col].sum().reset_index().nlargest(limit, op.value_col)
+    elif op.operation == "groupby_avg":
+        if not op.group_col or not op.value_col:
+            raise ValueError(f"groupby_avg requires group_col and value_col, got {op}")
+        return df.groupby(op.group_col)[op.value_col].mean().reset_index().nlargest(limit, op.value_col)
+    elif op.operation == "groupby_count":
+        if not op.group_col:
+            raise ValueError(f"groupby_count requires group_col, got {op}")
+        df_filtered = df.copy()
+        if op.or_filters:
+            or_mask = pd.Series([False] * len(df_filtered), index=df_filtered.index)
+            for f in op.or_filters:
+                or_mask = or_mask | _get_filter_mask(df_filtered, f["col"], f["value"], f.get("op", "eq"))
+            df_filtered = df_filtered[or_mask]
+        if op.filters:
+            for f in op.filters:
+                df_filtered = _apply_single_filter(df_filtered, f["col"], f["value"], f.get("op", "eq"))
+        elif op.filter_col and op.filter_value is not None:
+            df_filtered = _apply_single_filter(df_filtered, op.filter_col, op.filter_value, op.filter_operator)
+        return df_filtered.groupby(op.group_col).size().reset_index(name="count").nlargest(limit, "count")
+    elif op.operation == "filter":
+        result = df.copy()
+        if op.or_filters:
+            or_mask = pd.Series([False] * len(result), index=result.index)
+            for f in op.or_filters:
+                or_mask = or_mask | _get_filter_mask(result, f["col"], f["value"], f.get("op", "eq"))
+            result = result[or_mask]
+        if op.filters:
+            for f in op.filters:
+                result = _apply_single_filter(result, f["col"], f["value"], f.get("op", "eq"))
+        elif op.filter_col and op.filter_value is not None and not op.or_filters:
+            result = _apply_single_filter(result, op.filter_col, op.filter_value, op.filter_operator)
+        elif not op.or_filters and not op.filters and (not op.filter_col or op.filter_value is None):
+            raise ValueError(f"filter requires filter_col/filter_value or filters or or_filters, got {op}")
+        return result.head(limit)
+    elif op.operation == "top_n":
+        col = op.value_col
+        if not col:
+            raise ValueError(f"top_n requires value_col, got {op}")
+        n = op.n or limit
+        return df.nlargest(n, col)
+    elif op.operation == "sort":
+        if not op.sort_col:
+            raise ValueError(f"sort requires sort_col, got {op}")
+        return df.sort_values(op.sort_col, ascending=op.ascending).head(limit)
+    elif op.operation == "aggregate":
+        if not op.value_col or not op.agg_func:
+            raise ValueError(f"aggregate requires value_col and agg_func, got {op}")
+        funcs = {"sum": "sum", "avg": "mean", "min": "min", "max": "max", "count": "count"}
+        value = getattr(df[op.value_col], funcs[op.agg_func])()
+        return pd.DataFrame([{op.value_col: value, "operation": op.agg_func}])
+    else:  # "raw"
+        return df.head(limit)
+
+
+class TabularExecutor(BaseExecutor):
+    def __init__(self) -> None:
+        self._llm = AzureChatOpenAI(
+            azure_deployment=settings.azureai_deployment_name_4o,
+            openai_api_version=settings.azureai_api_version_4o,
+            azure_endpoint=settings.azureai_endpoint_url_4o,
+            api_key=settings.azureai_api_key_4o,
+            temperature=0,
+        )
+        self._prompt = ChatPromptTemplate.from_messages([
+            ("system", _SYSTEM_PROMPT),
+            ("human", "{question}"),
+        ])
+        self._chain = self._prompt | self._llm.with_structured_output(TabularOperation)
+
+    async def execute(
+        self,
+        results: list[RetrievalResult],
+        user_id: str,
+        _db: AsyncSession,
+        question: str,
+        limit: int = 100,
+    ) -> list[QueryResult]:
+        tabular = [
+            r for r in results
+            if r.source_type == "document"
+            and r.metadata.get("data", {}).get("file_type") in _TABULAR_FILE_TYPES
+        ]
+
+        if not tabular:
+            return []
+
+        # Group by (document_id, sheet_name) — one parquet download per group
+        groups: dict[tuple[str, str | None], _GroupInfo] = {}
+        for r in tabular:
+            data = r.metadata.get("data", {})
+            doc_id = data.get("document_id")
+            if not doc_id:
+                continue
+            sheet_name = data.get("sheet_name")  # None for CSV
+            key = (doc_id, sheet_name)
+            if key not in groups:
+                groups[key] = {
+                    "filename": data.get("filename", ""),
+                    "file_type": data.get("file_type", ""),
+                }
+
+        async def _process_group(
+            doc_id: str, sheet_name: str | None, info: _GroupInfo
+        ) -> QueryResult | None:
+            try:
+                df = await download_parquet(user_id, doc_id, sheet_name)
+                df_result = await self._query_with_agent(df, question, limit)
+
+                table_label = info["filename"]
+                if sheet_name:
+                    table_label += f" / sheet: {sheet_name}"
+
+                logger.info(
+                    "tabular query complete",
+                    document_id=doc_id,
+                    sheet=sheet_name,
+                    file_type=info["file_type"],
+                    rows=len(df_result),
+                    columns=len(df_result.columns),
+                )
+                return QueryResult(
+                    source_type="document",
+                    source_id=doc_id,
+                    table_or_file=table_label,
+                    columns=list(df_result.columns),
+                    rows=df_result.to_dict(orient="records"),
+                    row_count=len(df_result),
+                )
+            except Exception as e:
+                logger.error(
+                    "tabular query failed",
+                    document_id=doc_id,
+                    sheet=sheet_name,
+                    error=str(e),
+                )
+                return None
+
+        gathered = await asyncio.gather(*[
+            _process_group(doc_id, sheet_name, info)
+            for (doc_id, sheet_name), info in groups.items()
+        ])
+        return [r for r in gathered if r is not None]
+
+    async def _query_with_agent(
+        self, df: pd.DataFrame, question: str, limit: int
+    ) -> pd.DataFrame:
+        schema_ctx = _build_schema_context(df)
+        prev_error = ""
+
+        for attempt in range(_MAX_RETRIES):
+            error_section = (
+                f"Previous attempt failed: {prev_error}\nFix the issue."
+                if prev_error else ""
+            )
+            try:
+                op: TabularOperation = await self._chain.ainvoke({
+                    "schema": schema_ctx,
+                    "error_section": error_section,
+                    "question": question,
+                })
+                logger.info(
+                    "tabular operation decided",
+                    operation=op.operation,
+                    reasoning=op.reasoning,
+                )
+                return _apply_operation(df, op, limit)
+            except Exception as e:
+                prev_error = str(e)
+                logger.warning("tabular agent error", attempt=attempt + 1, error=prev_error)
+
+        # Fallback: return raw rows
+        logger.warning("tabular agent failed after retries, returning raw rows")
+        return df.head(limit)
+
+
+tabular_executor = TabularExecutor()
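Review note: the handoff above (structured output in, pandas out) is easy to sanity-check in isolation. A minimal sketch on a toy DataFrame, assuming only this module's own import path:

import pandas as pd
from src.query.executors.tabular import TabularOperation, _apply_operation

# Toy frame standing in for a downloaded Parquet sheet.
df = pd.DataFrame({
    "status": ["SUCCESS", "FAILED", "SUCCESS"],
    "amount_paid": [100, 250000, 300000],
})
# What the LLM would emit for "successful payments over 200000":
op = TabularOperation(
    operation="filter",
    filters=[
        {"col": "status", "value": "SUCCESS", "op": "eq"},
        {"col": "amount_paid", "value": "200000", "op": "gt"},
    ],
    reasoning="AND of an equality and a numeric comparison",
)
print(_apply_operation(df, op, limit=100))  # one row: SUCCESS / 300000

If _apply_operation raises instead, the retry loop feeds the exception text back to the model via {error_section} on the next attempt.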
src/query/query_executor.py
ADDED
@@ -0,0 +1,42 @@
+"""QueryExecutor — dispatches retrieval results to the appropriate executor by source_type."""
+
+import asyncio
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from src.middlewares.logging import get_logger
+from src.query.base import QueryResult
+from src.query.executors.db_executor import db_executor
+from src.query.executors.tabular import tabular_executor
+from src.rag.base import RetrievalResult
+
+logger = get_logger("query_executor")
+
+
+class QueryExecutor:
+    async def execute(
+        self,
+        results: list[RetrievalResult],
+        user_id: str,
+        db: AsyncSession,
+        question: str,
+        limit: int = 100,
+    ) -> list[QueryResult]:
+        batches = await asyncio.gather(
+            db_executor.execute(results, user_id, db, question, limit),
+            tabular_executor.execute(results, user_id, db, question, limit),
+            return_exceptions=True,
+        )
+
+        query_results: list[QueryResult] = []
+        for batch in batches:
+            if isinstance(batch, Exception):
+                logger.error("executor failed", error=str(batch))
+                continue
+            query_results.extend(batch)
+
+        logger.info("query execution complete", total=len(query_results))
+        return query_results
+
+
+query_executor = QueryExecutor()
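Review note: return_exceptions=True is what keeps one failing executor from cancelling its sibling. A standalone stdlib sketch of that behaviour:

import asyncio

async def ok() -> list[str]:
    return ["hit"]

async def boom() -> list[str]:
    raise RuntimeError("executor failed")

async def main() -> None:
    batches = await asyncio.gather(ok(), boom(), return_exceptions=True)
    # The exception comes back as a value, so the sibling's result survives:
    print(batches)  # [['hit'], RuntimeError('executor failed')]

asyncio.run(main())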
src/rag/base.py
ADDED
@@ -0,0 +1,20 @@
+"""Shared contract for all retriever implementations."""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any
+
+
+@dataclass
+class RetrievalResult:
+    content: str
+    metadata: dict[str, Any]
+    score: float
+    source_type: str  # "document" | "database"
+
+
+class BaseRetriever(ABC):
+    @abstractmethod
+    async def retrieve(
+        self, query: str, user_id: str, k: int = 5
+    ) -> list[RetrievalResult]: ...
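Review note: the contract is small; a hypothetical test double takes a few lines:

from src.rag.base import BaseRetriever, RetrievalResult

class EchoRetriever(BaseRetriever):
    """Hypothetical stub for tests: echoes the query back as one fake hit."""
    async def retrieve(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
        return [RetrievalResult(content=query, metadata={}, score=1.0, source_type="document")]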
src/rag/retriever.py
CHANGED
@@ -1,69 +1,45 @@
-"""Service for retrieving relevant documents from vector store."""
+"""Public retrieval API — thin wrapper around RetrievalRouter."""
 
-import hashlib
-import json
-from src.db.postgres.vector_store import get_vector_store
-from src.db.redis.connection import get_redis
 from sqlalchemy.ext.asyncio import AsyncSession
+
 from src.middlewares.logging import get_logger
-from typing import List, Dict, Any
+from src.rag.base import RetrievalResult
+from src.rag.retrievers.document import document_retriever
+from src.rag.retrievers.schema import schema_retriever
+from src.rag.router import RetrievalRouter, SourceHint
 
 logger = get_logger("retriever")
 
-_RETRIEVAL_CACHE_TTL = 3600  # 1 hour
-
 
 class RetrieverService:
-    """
+    """Public retrieval service used by chat.py and search tools.
+
+    Delegates to RetrievalRouter which dispatches based on source_hint.
+    Returns RetrievalResult objects directly so downstream consumers
+    (db_executor, tabular_executor) can be fed without lossy dict
+    conversion. The `db` parameter is accepted for call-site compatibility
+    but currently unused — retrieval reads PGVector via _pgvector_engine
+    inside each retriever.
+    """
 
     def __init__(self):
-        self.vector_store = get_vector_store()
+        self._router = RetrievalRouter(
+            schema_retriever=schema_retriever,
+            document_retriever=document_retriever,
+        )
 
     async def retrieve(
         self,
         query: str,
         user_id: str,
         db: AsyncSession,
-        k: int = 5
-    ) -> List[Dict[str, Any]]:
-        """Retrieve relevant chunks for a query, scoped to the user's documents.
-
-        Returns:
-            List of dicts with keys: content, metadata
-            metadata includes: document_id, user_id, filename, chunk_index, page_label (if PDF)
-        """
+        k: int = 5,
+        source_hint: SourceHint = "both",
+    ) -> list[RetrievalResult]:
         try:
-            redis = await get_redis()
-            query_hash = hashlib.md5(query.encode()).hexdigest()
-            cache_key = f"retrieval:{user_id}:{query_hash}:{k}"
-
-            cached = await redis.get(cache_key)
-            if cached:
-                logger.info("Returning cached retrieval results")
-                return json.loads(cached)
-
-            logger.info(f"Retrieving for user {user_id}, query: {query[:50]}...")
-
-            docs = await self.vector_store.asimilarity_search(
-                query=query,
-                k=k,
-                filter={"user_id": user_id}
-            )
-
-            results = [
-                {
-                    "content": doc.page_content,
-                    "metadata": doc.metadata,
-                }
-                for doc in docs
-            ]
-
-            logger.info(f"Retrieved {len(results)} chunks")
-            await redis.setex(cache_key, _RETRIEVAL_CACHE_TTL, json.dumps(results))
-            return results
-
+            return await self._router.retrieve(query, user_id, source_hint, k)
         except Exception as e:
-            logger.error("Retrieval failed", error=str(e))
+            logger.error("retrieval failed", error=str(e))
             return []
 
 
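Review note: RetrievalResult is a plain dataclass, so call sites that still expect the old list-of-dicts shape can adapt without a compatibility layer (sketch with made-up field values):

from dataclasses import asdict
from src.rag.base import RetrievalResult

hit = RetrievalResult(content="...", metadata={"filename": "report.pdf"}, score=0.87, source_type="document")
legacy = {"content": hit.content, "metadata": hit.metadata}  # old dict shape
print(asdict(hit))  # full round-trippable form, which is what the router's Redis cache stores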
src/rag/retrievers/__init__.py
ADDED
File without changes
src/rag/retrievers/baseline.py
ADDED
@@ -0,0 +1,76 @@
+"""Service for retrieving relevant documents from vector store."""
+
+import hashlib
+import json
+from src.db.postgres.vector_store import get_vector_store
+from src.db.redis.connection import get_redis
+from sqlalchemy.ext.asyncio import AsyncSession
+from src.middlewares.logging import get_logger
+from typing import List, Dict, Any
+
+logger = get_logger("retriever")
+
+_RETRIEVAL_CACHE_TTL = 3600  # 1 hour
+
+
+class BaselineRetrieverService:
+    """Baseline (pre-Phase-1) retriever — preserved for benchmark comparison.
+
+    Renamed from RetrieverService so it doesn't shadow the production wrapper
+    at src/rag/retriever.py. Production code imports from src.rag.retriever;
+    benchmark scripts that want this baseline must import explicitly from
+    src.rag.retrievers.baseline.
+    """
+
+    def __init__(self):
+        self.vector_store = get_vector_store()
+
+    async def retrieve(
+        self,
+        query: str,
+        user_id: str,
+        db: AsyncSession,
+        k: int = 5
+    ) -> List[Dict[str, Any]]:
+        """Retrieve relevant chunks for a query, scoped to the user's documents.
+
+        Returns:
+            List of dicts with keys: content, metadata
+            metadata includes: document_id, user_id, filename, chunk_index, page_label (if PDF)
+        """
+        try:
+            redis = await get_redis()
+            query_hash = hashlib.md5(query.encode()).hexdigest()
+            cache_key = f"retrieval:{user_id}:{query_hash}:{k}"
+
+            cached = await redis.get(cache_key)
+            if cached:
+                logger.info("Returning cached retrieval results")
+                return json.loads(cached)
+
+            logger.info(f"Retrieving for user {user_id}, query: {query[:50]}...")
+
+            docs = await self.vector_store.asimilarity_search(
+                query=query,
+                k=k,
+                filter={"user_id": user_id}
+            )
+
+            results = [
+                {
+                    "content": doc.page_content,
+                    "metadata": doc.metadata,
+                }
+                for doc in docs
+            ]
+
+            logger.info(f"Retrieved {len(results)} chunks")
+            await redis.setex(cache_key, _RETRIEVAL_CACHE_TTL, json.dumps(results))
+            return results
+
+        except Exception as e:
+            logger.error("Retrieval failed", error=str(e))
+            return []
+
+
+baseline_retriever = BaselineRetrieverService()
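Review note for benchmark users: the baseline's cache key is user-scoped and query-hashed, so identical queries from different users never collide. Sketch with hypothetical values:

import hashlib

user_id, query, k = "user-123", "total paid per channel", 5
query_hash = hashlib.md5(query.encode()).hexdigest()
print(f"retrieval:{user_id}:{query_hash}:{k}")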
src/rag/retrievers/document.py
ADDED
@@ -0,0 +1,158 @@
+"""Document retriever — handles PDF, DOCX, TXT chunks (source_type="document", non-tabular)."""
+
+import math
+
+from langchain_postgres import PGVector
+from langchain_postgres.vectorstores import DistanceStrategy
+from langchain_openai import AzureOpenAIEmbeddings
+from sqlalchemy import text
+
+from src.config.settings import settings
+from src.db.postgres.connection import _pgvector_engine
+from src.db.postgres.vector_store import get_vector_store
+from src.middlewares.logging import get_logger
+from src.rag.base import BaseRetriever, RetrievalResult
+
+logger = get_logger("document_retriever")
+
+# Change this one line to switch retrieval method
+# Options: "mmr" | "cosine" | "euclidean" | "inner_product" | "manhattan"
+_RETRIEVAL_METHOD = "mmr"
+
+_TABULAR_TYPES = {"csv", "xlsx"}
+_FETCH_K = 20
+_LAMBDA_MULT = 0.5
+_COLLECTION_NAME = "document_embeddings"
+
+_embeddings = AzureOpenAIEmbeddings(
+    azure_deployment=settings.azureai_deployment_name_embedding,
+    openai_api_version=settings.azureai_api_version_embedding,
+    azure_endpoint=settings.azureai_endpoint_url_embedding,
+    api_key=settings.azureai_api_key_embedding,
+)
+
+_euclidean_store = PGVector(
+    embeddings=_embeddings,
+    connection=_pgvector_engine,
+    collection_name=_COLLECTION_NAME,
+    distance_strategy=DistanceStrategy.EUCLIDEAN,
+    use_jsonb=True,
+    async_mode=True,
+    create_extension=False,
+)
+
+_ip_store = PGVector(
+    embeddings=_embeddings,
+    connection=_pgvector_engine,
+    collection_name=_COLLECTION_NAME,
+    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
+    use_jsonb=True,
+    async_mode=True,
+    create_extension=False,
+)
+
+_MANHATTAN_SQL = text("""
+    SELECT
+        lpe.document,
+        lpe.cmetadata,
+        lpe.embedding <+> CAST(:embedding AS vector) AS distance
+    FROM langchain_pg_embedding lpe
+    JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+    WHERE lpc.name = :collection
+      AND lpe.cmetadata->>'user_id' = :user_id
+      AND lpe.cmetadata->>'source_type' = 'document'
+    ORDER BY distance ASC
+    LIMIT :k
+""")
+
+
+class DocumentRetriever(BaseRetriever):
+    def __init__(self) -> None:
+        self.vector_store = get_vector_store()
+
+    async def retrieve(
+        self, query: str, user_id: str, k: int = 5
+    ) -> list[RetrievalResult]:
+        filter_ = {"user_id": user_id, "source_type": "document"}
+        fetch_k = k + len(_TABULAR_TYPES)
+
+        if _RETRIEVAL_METHOD == "manhattan":
+            return await self._retrieve_manhattan(query, user_id, k, fetch_k)
+
+        if _RETRIEVAL_METHOD == "mmr":
+            docs = await self.vector_store.amax_marginal_relevance_search(
+                query=query,
+                k=fetch_k,
+                fetch_k=_FETCH_K,
+                lambda_mult=_LAMBDA_MULT,
+                filter=filter_,
+            )
+            cosine = await self.vector_store.asimilarity_search_with_score(
+                query=query, k=fetch_k, filter=filter_,
+            )
+            score_map = {doc.page_content: score for doc, score in cosine}
+            docs_with_scores = [(doc, score_map.get(doc.page_content, 0.0)) for doc in docs]
+        elif _RETRIEVAL_METHOD == "euclidean":
+            docs_with_scores = await _euclidean_store.asimilarity_search_with_score(
+                query=query, k=fetch_k, filter=filter_,
+            )
+        elif _RETRIEVAL_METHOD == "inner_product":
+            docs_with_scores = await _ip_store.asimilarity_search_with_score(
+                query=query, k=fetch_k, filter=filter_,
+            )
+        else:  # cosine
+            docs_with_scores = await self.vector_store.asimilarity_search_with_score(
+                query=query, k=fetch_k, filter=filter_,
+            )
+
+        results = []
+        for doc, score in docs_with_scores:
+            file_type = doc.metadata.get("data", {}).get("file_type", "")
+            if file_type not in _TABULAR_TYPES:
+                results.append(RetrievalResult(
+                    content=doc.page_content,
+                    metadata=doc.metadata,
+                    score=score,
+                    source_type="document",
+                ))
+            if len(results) == k:
+                break
+
+        logger.info("retrieved chunks", method=_RETRIEVAL_METHOD, count=len(results))
+        return results
+
+    async def _retrieve_manhattan(
+        self, query: str, user_id: str, k: int, fetch_k: int
+    ) -> list[RetrievalResult]:
+        query_vector = await _embeddings.aembed_query(query)
+        if not all(math.isfinite(v) for v in query_vector):
+            raise ValueError("Embedding vector contains NaN or Infinity values.")
+        vector_str = "[" + ",".join(str(v) for v in query_vector) + "]"
+
+        async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(_MANHATTAN_SQL, {
+                "embedding": vector_str,
+                "collection": _COLLECTION_NAME,
+                "user_id": user_id,
+                "k": fetch_k,
+            })
+            rows = result.fetchall()
+
+        results = []
+        for row in rows:
+            file_type = row.cmetadata.get("data", {}).get("file_type", "")
+            if file_type not in _TABULAR_TYPES:
+                results.append(RetrievalResult(
+                    content=row.document,
+                    metadata=row.cmetadata,
+                    score=float(row.distance),
+                    source_type="document",
+                ))
+            if len(results) == k:
+                break
+
+        logger.info("retrieved chunks", method="manhattan", count=len(results))
+        return results
+
+
+document_retriever = DocumentRetriever()
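Review note: the retriever delegates MMR to PGVector's amax_marginal_relevance_search; for intuition about what lambda_mult=0.5 trades off, here is a toy re-implementation (not the code path used above):

def mmr(query_sim: list[float], pairwise_sim: list[list[float]], k: int, lambda_mult: float = 0.5) -> list[int]:
    """Greedy MMR: pick the doc most similar to the query, least similar to picks so far."""
    selected: list[int] = []
    candidates = list(range(len(query_sim)))
    while candidates and len(selected) < k:
        def score(i: int) -> float:
            redundancy = max((pairwise_sim[i][j] for j in selected), default=0.0)
            return lambda_mult * query_sim[i] - (1 - lambda_mult) * redundancy
        best = max(candidates, key=score)
        selected.append(best)
        candidates.remove(best)
    return selected

# Doc 1 is a near-duplicate of doc 0, so MMR skips it in favour of doc 2:
print(mmr([0.9, 0.89, 0.5], [[1, 0.95, 0.1], [0.95, 1, 0.1], [0.1, 0.1, 1]], k=2))  # [0, 2]

Higher lambda_mult favours raw relevance; lower values favour diversity among the returned chunks.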
src/rag/retrievers/schema.py
ADDED
@@ -0,0 +1,411 @@
+"""Schema retriever — handles DB schemas (source_type="database") and tabular file
+columns stored as source_type="document" with file_type in ("csv","xlsx").
+
+Strategy: hybrid_bm25 — RRF merge of dense cosine search (DB columns + DB tables
++ tabular columns + tabular sheets) and PostgreSQL full-text search (DB columns only).
+Embeds the query once, fans out five legs in parallel.
+
+The DB-tables leg surfaces table-level summary chunks (chunk_level='table') as
+a recall signal for multi-table questions: when a relevant table's columns
+don't individually win on similarity, the table chunk can still pull the table
+into the hit set, where db_executor's downstream full-schema fetch picks up
+the per-column detail.
+
+FTS requires a GIN index on langchain_pg_embedding.document (created by init_db.py).
+"""
+
+import asyncio
+
+from sqlalchemy import text
+
+from src.db.postgres.connection import _pgvector_engine
+from src.db.postgres.vector_store import get_vector_store
+from src.middlewares.logging import get_logger
+from src.rag.base import BaseRetriever, RetrievalResult
+
+logger = get_logger("schema_retriever")
+
+_TABULAR_FILE_TYPES = ("csv", "xlsx")
+_TABLE_CHUNK_K_MULTIPLIER = 2  # how many table chunks to pull before RRF
+
+
+class SchemaRetriever(BaseRetriever):
+    def __init__(self):
+        self.vector_store = get_vector_store()
+
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+
+    async def _embed_query(self, query: str) -> list[float]:
+        return await asyncio.to_thread(self.vector_store.embeddings.embed_query, query)
+
+    async def _search_db(
+        self, embedding: list[float], user_id: str, k: int
+    ) -> list[RetrievalResult]:
+        """Cosine vector search over database chunks."""
+        emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
+
+        sql = text(f"""
+            SELECT lpe.document, lpe.cmetadata,
+                   1.0 - (lpe.embedding <=> '{emb_str}'::vector) AS score
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'database'
+              AND lpe.cmetadata->>'chunk_level' = 'column'
+            ORDER BY lpe.embedding <=> '{emb_str}'::vector ASC
+            LIMIT :k
+        """)
+
+        async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(sql, {"user_id": user_id, "k": k * 4})
+            rows = result.fetchall()
+
+        return [
+            RetrievalResult(
+                content=row.document,
+                metadata=row.cmetadata,
+                score=float(row.score),
+                source_type="database",
+            )
+            for row in rows
+        ]
+
+    async def _search_db_tables(
+        self, embedding: list[float], user_id: str, k: int
+    ) -> list[RetrievalResult]:
+        """Cosine vector search over database TABLE-level chunks.
+
+        Recall channel for multi-table questions. The chunk's content is
+        discarded downstream — db_executor only consumes its `data.table_name`
+        to seed full-schema fetch.
+        """
+        emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
+
+        sql = text(f"""
+            SELECT lpe.document, lpe.cmetadata,
+                   1.0 - (lpe.embedding <=> '{emb_str}'::vector) AS score
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'database'
+              AND lpe.cmetadata->>'chunk_level' = 'table'
+            ORDER BY lpe.embedding <=> '{emb_str}'::vector ASC
+            LIMIT :k
+        """)
+
+        async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(
+                sql, {"user_id": user_id, "k": k * _TABLE_CHUNK_K_MULTIPLIER}
+            )
+            rows = result.fetchall()
+
+        return [
+            RetrievalResult(
+                content=row.document,
+                metadata=row.cmetadata,
+                score=float(row.score),
+                source_type="database",
+            )
+            for row in rows
+        ]
+
+    async def _search_tabular(
+        self, embedding: list[float], user_id: str, k: int
+    ) -> list[RetrievalResult]:
+        """Cosine vector search over tabular document chunks (csv/xlsx)."""
+        emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
+
+        sql = text(f"""
+            SELECT lpe.document, lpe.cmetadata,
+                   1.0 - (lpe.embedding <=> '{emb_str}'::vector) AS score
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'document'
+              AND lpe.cmetadata->>'chunk_level' = 'column'
+              AND (lpe.cmetadata->'data'->>'file_type' = 'csv'
+                   OR lpe.cmetadata->'data'->>'file_type' = 'xlsx')
+            ORDER BY lpe.embedding <=> '{emb_str}'::vector ASC
+            LIMIT :k
+        """)
+
+        async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(sql, {"user_id": user_id, "k": k * 4})
+            rows = result.fetchall()
+
+        return [
+            RetrievalResult(
+                content=row.document,
+                metadata=row.cmetadata,
+                score=float(row.score),
+                source_type="document",
+            )
+            for row in rows
+        ]
+
+    async def _search_tabular_sheets(
+        self, embedding: list[float], user_id: str, k: int
+    ) -> list[RetrievalResult]:
+        """Leg 5: sheet-level summary chunks from CSV/XLSX files."""
+        emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
+
+        sql = text(f"""
+            SELECT lpe.document, lpe.cmetadata,
+                   1.0 - (lpe.embedding <=> '{emb_str}'::vector) AS score
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'document'
+              AND lpe.cmetadata->>'chunk_level' = 'sheet'
+              AND (lpe.cmetadata->'data'->>'file_type' = 'csv'
+                   OR lpe.cmetadata->'data'->>'file_type' = 'xlsx')
+            ORDER BY lpe.embedding <=> '{emb_str}'::vector ASC
+            LIMIT :k
+        """)
+
+        async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(sql, {"user_id": user_id, "k": k})
+            rows = result.fetchall()
+
+        return [
+            RetrievalResult(
+                content=row.document,
+                metadata=row.cmetadata,
+                score=float(row.score),
+                source_type="document",
+            )
+            for row in rows
+        ]
+
+    async def _search_fts_db(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
+        """Full-text search over DB schema chunks using PostgreSQL tsvector."""
+        sql = text("""
+            SELECT lpe.document, lpe.cmetadata,
+                   ts_rank(to_tsvector('english', lpe.document),
+                           plainto_tsquery('english', :query)) AS rank
+            FROM langchain_pg_embedding lpe
+            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+            WHERE lpc.name = 'document_embeddings'
+              AND lpe.cmetadata->>'user_id' = :user_id
+              AND lpe.cmetadata->>'source_type' = 'database'
+              AND lpe.cmetadata->>'chunk_level' = 'column'
+              AND to_tsvector('english', lpe.document) @@ plainto_tsquery('english', :query)
+            ORDER BY rank DESC
+            LIMIT :k
+        """)
+
+        async with _pgvector_engine.connect() as conn:
+            result = await conn.execute(sql, {"query": query, "user_id": user_id, "k": k})
+            rows = result.fetchall()
+
+        return [
+            RetrievalResult(
+                content=row.document,
+                metadata=row.cmetadata,
+                score=float(row.rank),
+                source_type="database",
+            )
+            for row in rows
+        ]
+
+    def _rank_tabular_sheets(
+        self,
+        sheet_results: list[RetrievalResult],
+        column_results: list[RetrievalResult],
+        top_k: int,
+        k_rrf: int = 60,
+    ) -> list[RetrievalResult]:
+        """Rank tabular sheets by RRF across two voting legs:
+          L1 (primary): sheet-chunk cosine score
+          L2 (vote):    best column-chunk position per (doc_id, sheet_name)
+
+        Returns top-k sheet-level RetrievalResults. The full column list of
+        each sheet is already in the sheet chunk's data.column_names from
+        ingestion, so downstream tabular_executor can read full sheet context.
+
+        For sheets surfaced by column votes but missing a sheet chunk (rare —
+        ingestion always creates one), a minimal stub is returned and
+        tabular_executor falls back to reading columns from the parquet.
+        """
+        # L1: sheets indexed by (doc_id, sheet_name) from sheet chunks
+        sheet_index: dict[tuple, RetrievalResult] = {}
+        sheet_ranked: list[tuple] = []
+        for r in sheet_results:
+            d = r.metadata.get("data", {})
+            key = (d.get("document_id"), d.get("sheet_name"))
+            if key[0] and key not in sheet_index:
+                sheet_index[key] = r
+                sheet_ranked.append(key)
+
+        # L2: sheets ranked by first-appearance in column-chunk results
+        col_sheet_ranked: list[tuple] = []
+        seen: set[tuple] = set()
+        for r in column_results:
+            d = r.metadata.get("data", {})
+            key = (d.get("document_id"), d.get("sheet_name"))
+            if key[0] and key not in seen:
+                col_sheet_ranked.append(key)
+                seen.add(key)
+
+        # RRF over (doc_id, sheet_name) across the two legs
+        rrf_scores: dict[tuple, float] = {}
+        for ranked_list in [sheet_ranked, col_sheet_ranked]:
+            for rank, key in enumerate(ranked_list):
+                rrf_scores[key] = rrf_scores.get(key, 0.0) + 1.0 / (k_rrf + rank + 1)
+
+        top_sheets = sorted(rrf_scores, key=lambda k: rrf_scores[k], reverse=True)[:top_k]
+
+        results: list[RetrievalResult] = []
+        for key in top_sheets:
+            if key in sheet_index:
+                r = sheet_index[key]
+                r.score = rrf_scores[key]
+                results.append(r)
+            else:
+                # Surfaced by column votes only — build stub from a representative
+                # column result so tabular_executor can group correctly.
+                doc_id, sheet_name = key
+                rep = next(
+                    (r for r in column_results
+                     if r.metadata.get("data", {}).get("document_id") == doc_id
+                     and r.metadata.get("data", {}).get("sheet_name") == sheet_name),
+                    None,
+                )
+                if rep is None:
+                    continue
+                stub_data = dict(rep.metadata.get("data", {}))
+                stub_data.pop("column_name", None)
+                stub_data.pop("column_type", None)
+                results.append(RetrievalResult(
+                    content=f"Sheet: {stub_data.get('filename', '')}"
+                            + (f" / sheet: {sheet_name}" if sheet_name else ""),
+                    metadata={**rep.metadata, "data": stub_data, "chunk_level": "sheet"},
+                    score=rrf_scores[key],
+                    source_type="document",
+                ))
+        return results
+
+    def _rank_db_tables(
+        self,
+        tbl_results: list[RetrievalResult],
+        col_results: list[RetrievalResult],
+        fts_results: list[RetrievalResult],
+        top_k: int,
+        k_rrf: int = 60,
+    ) -> list[RetrievalResult]:
+        """Rank DB tables by RRF across three legs:
+          L1 (primary): table-summary chunk similarity
+          L2 (vote):    best column-chunk position per table
+          L3 (vote):    best FTS position per table
+
+        Returns top-k table-chunk RetrievalResults. For tables surfaced by
+        L2/L3 but missing a table chunk, a minimal stub is returned so that
+        db_executor._fetch_full_schema can seed off data.table_name.
+        """
+        # L1: tables ranked by table-chunk cosine score
+        tbl_index: dict[str, RetrievalResult] = {}
+        tbl_ranked: list[str] = []
+        for r in tbl_results:
+            tname = r.metadata.get("data", {}).get("table_name")
+            if tname and tname not in tbl_index:
+                tbl_index[tname] = r
+                tbl_ranked.append(tname)
+
+        # L2: tables ranked by first-appearance in column-chunk list (best col score)
+        col_table_ranked: list[str] = []
+        seen: set[str] = set()
+        for r in col_results:
+            tname = r.metadata.get("data", {}).get("table_name")
+            if tname and tname not in seen:
+                col_table_ranked.append(tname)
+                seen.add(tname)
+
+        # L3: tables ranked by first-appearance in FTS list
+        fts_table_ranked: list[str] = []
+        seen = set()
+        for r in fts_results:
+            tname = r.metadata.get("data", {}).get("table_name")
+            if tname and tname not in seen:
+                fts_table_ranked.append(tname)
+                seen.add(tname)
+
+        # RRF over table names across the three legs
+        rrf_scores: dict[str, float] = {}
+        for ranked_list in [tbl_ranked, col_table_ranked, fts_table_ranked]:
+            for rank, tname in enumerate(ranked_list):
+                rrf_scores[tname] = rrf_scores.get(tname, 0.0) + 1.0 / (k_rrf + rank + 1)
+
+        top_tables = sorted(rrf_scores, key=lambda t: rrf_scores[t], reverse=True)[:top_k]
+
+        results: list[RetrievalResult] = []
+        for tname in top_tables:
+            if tname in tbl_index:
+                r = tbl_index[tname]
+                r.score = rrf_scores[tname]
+                results.append(r)
+            else:
+                # Surfaced by column/FTS votes with no table chunk — minimal stub
+                results.append(RetrievalResult(
+                    content=f"Table: {tname}",
+                    metadata={"data": {"table_name": tname}, "source_type": "database"},
+                    score=rrf_scores[tname],
+                    source_type="database",
+                ))
+        return results
+
+    # ------------------------------------------------------------------
+    # Public interface — called by the router
+    # ------------------------------------------------------------------
+
+    async def retrieve(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
+        """Table-first retrieval for DB sources; chunk-level for tabular.
+
+        DB tables are ranked via RRF across three legs:
+          L1 (primary): table-summary chunk similarity
+          L2 (vote):    top-K column-chunk cosine, grouped by table
+          L3 (vote):    top-K FTS column hits, grouped by table
+
+        db_executor downstream fetches the full per-column schema for the
+        ranked table set via _fetch_full_schema — the column chunks returned
+        here are intentionally NOT used as the schema source, only for voting.
+
+        Tabular (CSV/XLSX) sheets are ranked via RRF across two legs:
+          L1: sheet-chunk cosine
+          L2: column-chunk votes (best position per sheet)
+        Returns sheet-level RetrievalResults so tabular_executor receives
+        full sheet context (all columns) rather than fragmented column hits.
+        """
+        embedding = await self._embed_query(query)
+        db_col_results, db_tbl_results, tabular_results, fts_results, sheet_results = await asyncio.gather(
+            self._search_db(embedding, user_id, k),
+            self._search_db_tables(embedding, user_id, k),
+            self._search_tabular(embedding, user_id, k),
+            self._search_fts_db(query, user_id, k * 4),
+            self._search_tabular_sheets(embedding, user_id, k),
+        )
+
+        db_ranked = self._rank_db_tables(db_tbl_results, db_col_results, fts_results, top_k=k)
+        tabular_ranked = self._rank_tabular_sheets(sheet_results, tabular_results, top_k=k)
+
+        results = sorted(db_ranked + tabular_ranked, key=lambda r: r.score, reverse=True)
+        logger.info(
+            "schema retrieval",
+            count=len(results),
+            db_tables_ranked=len(db_ranked),
+            db_cols=len(db_col_results),
+            db_tables=len(db_tbl_results),
+            tabular_cols=len(tabular_results),
+            tabular_sheets=len(sheet_results),
+            tabular_ranked=len(tabular_ranked),
+            fts=len(fts_results),
+        )
+        return results
+
+
+schema_retriever = SchemaRetriever()
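Review note: the three-leg table vote is plain arithmetic. With hypothetical table names, a table that places across several legs accumulates more reciprocal-rank mass than one that tops a single leg:

k_rrf = 60
legs = [
    ["payments", "refunds"],           # L1: table-summary similarity
    ["refunds", "payments", "users"],  # L2: column votes
    ["payments"],                      # L3: FTS votes
]
scores: dict[str, float] = {}
for ranked in legs:
    for rank, t in enumerate(ranked):
        scores[t] = scores.get(t, 0.0) + 1.0 / (k_rrf + rank + 1)
print(sorted(scores, key=scores.get, reverse=True))  # ['payments', 'refunds', 'users']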
src/rag/router.py
ADDED
@@ -0,0 +1,179 @@
| 1 |
+
"""Routes retrieval requests to the appropriate retriever based on source_hint.
|
| 2 |
+
|
| 3 |
+
Cross-retriever merging uses Reciprocal Rank Fusion (RRF) on per-retriever
|
| 4 |
+
ranked lists — score scales differ across retrievers (RRF, cosine, distance)
|
| 5 |
+
and aren't directly comparable, so we rank-merge instead of score-merge.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import hashlib
|
| 10 |
+
import json
|
| 11 |
+
from dataclasses import asdict
|
| 12 |
+
from typing import Literal
|
| 13 |
+
|
| 14 |
+
from src.db.redis.connection import get_redis
|
| 15 |
+
from src.middlewares.logging import get_logger
|
| 16 |
+
from src.rag.base import BaseRetriever, RetrievalResult
|
| 17 |
+
|
| 18 |
+
logger = get_logger("retrieval_router")
|
| 19 |
+
|
| 20 |
+
_CACHE_TTL = 3600 # 1 hour
|
| 21 |
+
_CACHE_KEY_PREFIX = "retrieval"
|
| 22 |
+
_RRF_K = 60 # standard RRF constant
|
| 23 |
+
SourceHint = Literal["document", "schema", "both"]
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def _result_dedup_key(r: RetrievalResult) -> tuple:
|
| 27 |
+
"""Cross-retriever dedup key — distinguishes DB columns vs DB tables vs
|
| 28 |
+
tabular columns vs prose chunks vs sheet-level chunks."""
|
| 29 |
+
data = r.metadata.get("data", {})
|
| 30 |
+
return (
|
| 31 |
+
r.source_type,
|
| 32 |
+
data.get("table_name"),
|
| 33 |
+
data.get("column_name"),
|
| 34 |
+
data.get("filename"),
|
| 35 |
+
data.get("sheet_name"),
|
| 36 |
+
data.get("chunk_index"), # disambiguates multiple prose chunks per doc
|
| 37 |
+
r.metadata.get("chunk_level"), # distinguishes sheet vs column chunks
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _rrf_merge(
|
| 42 |
+
ranked_lists: list[list[RetrievalResult]],
|
| 43 |
+
top_k: int,
|
| 44 |
+
k_rrf: int = _RRF_K,
|
| 45 |
+
) -> list[RetrievalResult]:
|
| 46 |
+
"""Reciprocal Rank Fusion across retriever batches.
|
| 47 |
+
|
| 48 |
+
Each input list is treated as already best-first ordered. Items are
|
| 49 |
+
deduped via _result_dedup_key and re-ranked by aggregated reciprocal
|
| 50 |
+
rank across all lists. Score on the returned RetrievalResult is the
|
| 51 |
+
aggregated RRF score (uniform scale across legs).
|
| 52 |
+
"""
|
| 53 |
+
scores: dict[tuple, float] = {}
|
| 54 |
+
index: dict[tuple, RetrievalResult] = {}
|
| 55 |
+
|
| 56 |
+
for ranked in ranked_lists:
|
| 57 |
+
for rank, result in enumerate(ranked):
|
| 58 |
+
key = _result_dedup_key(result)
|
| 59 |
+
scores[key] = scores.get(key, 0.0) + 1.0 / (k_rrf + rank + 1)
|
| 60 |
+
+            # Keep the first occurrence; metadata is identical for the same
+            # key across lists, so any copy is fine.
+            if key not in index:
+                index[key] = result
+
+    merged = sorted(index.values(), key=lambda r: scores[_result_dedup_key(r)], reverse=True)
+    # Overwrite score with RRF score so downstream consumers see a uniform scale.
+    for r in merged:
+        r.score = scores[_result_dedup_key(r)]
+    return merged[:top_k]
+
+
+async def invalidate_retrieval_cache(user_id: str) -> int:
+    """Delete every cached retrieval entry for `user_id`.
+
+    Called by ingest/upload/delete API handlers after a successful write so
+    the next retrieval picks up the new data instead of stale cached top-k.
+    Returns the number of keys removed.
+    """
+    redis = await get_redis()
+    pattern = f"{_CACHE_KEY_PREFIX}:{user_id}:*"
+    keys = [key async for key in redis.scan_iter(match=pattern)]
+    if not keys:
+        return 0
+    deleted = await redis.delete(*keys)
+    logger.info("retrieval cache invalidated", user_id=user_id, deleted=deleted)
+    return int(deleted)
+
+
+class RetrievalRouter:
+    def __init__(
+        self,
+        schema_retriever: BaseRetriever,
+        document_retriever: BaseRetriever,
+    ):
+        self._retrievers: dict[str, BaseRetriever] = {
+            "schema": schema_retriever,
+            "document": document_retriever,
+        }
+
+    def _route(self, source_hint: SourceHint) -> list[tuple[str, BaseRetriever]]:
+        if source_hint == "schema":
+            return [("schema", self._retrievers["schema"])]
+        if source_hint == "document":
+            return [("document", self._retrievers["document"])]
+        return list(self._retrievers.items())
+
+    async def retrieve(
+        self,
+        query: str,
+        user_id: str,
+        source_hint: SourceHint = "both",
+        k: int = 10,
+    ) -> list[RetrievalResult]:
+        redis = await get_redis()
+        query_hash = hashlib.md5(query.encode()).hexdigest()
+        cache_key = f"{_CACHE_KEY_PREFIX}:{user_id}:{source_hint}:{query_hash}:{k}"
+
+        cached = await redis.get(cache_key)
+        if cached:
+            try:
+                raw = json.loads(cached)
+                logger.info("returning cached retrieval results", source_hint=source_hint)
+                return [RetrievalResult(**r) for r in raw]
+            except Exception:
+                logger.warning("corrupted retrieval cache, fetching fresh", cache_key=cache_key)
+
+        results = await self._retrieve_uncached(query, user_id, source_hint, k)
+
+        # Empty-result fallback: orchestrator may have misclassified intent.
+        # Retry once with "both" before giving up. No-op when source_hint is
+        # already "both".
+        if not results and source_hint != "both":
+            logger.warning(
+                "empty retrieval, falling back to source_hint='both'",
+                original_source_hint=source_hint,
+            )
+            results = await self._retrieve_uncached(query, user_id, "both", k)
+
+        await redis.setex(
+            cache_key,
+            _CACHE_TTL,
+            json.dumps([asdict(r) for r in results]),
+        )
+        return results
+
+    async def _retrieve_uncached(
+        self,
+        query: str,
+        user_id: str,
+        source_hint: SourceHint,
+        k: int,
+    ) -> list[RetrievalResult]:
+        routed = self._route(source_hint)
+        batches = await asyncio.gather(
+            *[r.retrieve(query, user_id, k) for _, r in routed],
+            return_exceptions=True,
+        )
+
+        valid_lists: list[list[RetrievalResult]] = []
+        per_retriever: dict[str, int | str] = {}
+        for (name, _), batch in zip(routed, batches):
+            if isinstance(batch, Exception):
+                logger.error("retriever failed", retriever=name, error=str(batch))
+                per_retriever[name] = "error"
+                continue
+            valid_lists.append(batch)
+            per_retriever[name] = len(batch)
+
+        results = _rrf_merge(valid_lists, top_k=k)
+
+        logger.info(
+            "router result",
+            source_hint=source_hint,
+            per_retriever=per_retriever,
+            final_count=len(results),
+            top_score=results[0].score if results else None,
+            bottom_score=results[-1].score if results else None,
+        )
+        return results
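
As a reviewer aid, here is a minimal, self-contained sketch of the reciprocal rank fusion that _rrf_merge performs. The function name, the string-keyed items, and the k=60 smoothing constant are illustrative assumptions; the PR's version operates on RetrievalResult objects and preserves per-key metadata.

    # rrf_sketch.py (illustrative only, not the PR's implementation)
    from collections import defaultdict

    def rrf_merge(ranked_lists: list[list[str]], top_k: int = 10, k: int = 60) -> list[str]:
        """Score each item 1/(k + rank) in every list it appears in, sum the
        scores across lists, and return the top_k items by fused score."""
        scores: dict[str, float] = defaultdict(float)
        for ranked in ranked_lists:
            for rank, item in enumerate(ranked, start=1):
                scores[item] += 1.0 / (k + rank)
        return sorted(scores, key=scores.__getitem__, reverse=True)[:top_k]

    # Items ranked well by both retrievers float to the top:
    print(rrf_merge([["a", "b", "c"], ["a", "c", "d"]], top_k=3))  # ['a', 'c', 'b']
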
src/storage/az_blob/az_blob.py
CHANGED
@@ -57,6 +57,22 @@ class AzureBlobStorage:
             logger.error(f"Failed to download blob {blob_name}", error=str(e))
             raise
 
+    async def upload_bytes(self, content: bytes, blob_name: str) -> str:
+        """Upload bytes to Azure Blob Storage using a specific blob name.
+
+        Unlike upload_file(), this does not generate a UUID name — caller controls the blob_name.
+        Used for Parquet files where the name must be deterministic (derived from document_id).
+        """
+        try:
+            async with self._get_blob_client(blob_name) as blob_client:
+                logger.info(f"Uploading bytes to blob {blob_name}")
+                await blob_client.upload_blob(content, overwrite=True)
+                logger.info(f"Successfully uploaded {blob_name}")
+                return blob_name
+        except Exception as e:
+            logger.error(f"Failed to upload bytes to {blob_name}", error=str(e))
+            raise
+
     async def delete_file(self, blob_name: str) -> bool:
         """Delete file from Azure Blob Storage."""
         try:
@@ -71,6 +87,24 @@
             logger.error(f"Failed to delete blob {blob_name}", error=str(e))
             return False
 
+    async def delete_blobs_with_prefix(self, prefix: str) -> int:
+        """Delete all blobs whose name starts with prefix. Returns count deleted.
+
+        Used to delete all Parquet files for a document in one call.
+        """
+        from azure.storage.blob.aio import ContainerClient
+        container_url = f"{self.account_url}/{self.container_name}?{self.sas_token}"
+        deleted = 0
+        try:
+            async with ContainerClient.from_container_url(container_url) as container:
+                async for blob in container.list_blobs(name_starts_with=prefix):
+                    await container.delete_blob(blob.name)
+                    deleted += 1
+            logger.info(f"Deleted {deleted} blobs with prefix {prefix}")
+        except Exception as e:
+            logger.error(f"Failed to delete blobs with prefix {prefix}", error=str(e))
+        return deleted
+
 
 # Singleton instance
 blob_storage = AzureBlobStorage()
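
For review context, a sketch of how the two new helpers compose on ingest. The blob naming scheme ("<document_id>/<sheet>.parquet") and the wrapper function are hypothetical; only blob_storage, upload_bytes, and delete_blobs_with_prefix come from this PR.

    # hypothetical usage sketch (naming scheme assumed, not from this PR)
    from src.storage.az_blob.az_blob import blob_storage

    async def replace_document_parquet(document_id: str, sheets: dict[str, bytes]) -> None:
        # Drop every Parquet blob previously written for this document...
        await blob_storage.delete_blobs_with_prefix(f"{document_id}/")
        # ...then re-upload under deterministic names, so readers can derive
        # the blob name from document_id alone instead of storing a UUID.
        for sheet_name, parquet_bytes in sheets.items():
            await blob_storage.upload_bytes(parquet_bytes, f"{document_id}/{sheet_name}.parquet")

    # e.g. asyncio.run(replace_document_parquet("doc-123", {"Sheet1": b"PAR1..."}))
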
src/tools/search.py
CHANGED
@@ -34,10 +34,10 @@ async def search_documents(
 
     formatted_results = []
     for result in results:
-        filename = result
-        page = result
+        filename = result.metadata.get("filename", "Unknown")
+        page = result.metadata.get("page_label")
         source_label = f"{filename}, p.{page}" if page else filename
-        formatted_results.append(f"[Source: {source_label}]\n{result
+        formatted_results.append(f"[Source: {source_label}]\n{result.content}\n")
 
     return "\n".join(formatted_results)
 
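
The fix above switches from the old result access to RetrievalResult's metadata/content fields. A quick sketch of the resulting output, using a stand-in dataclass (RetrievalResult itself is defined elsewhere in this PR):

    # stand-in type, illustrative only
    from dataclasses import dataclass, field

    @dataclass
    class StubResult:
        content: str
        metadata: dict = field(default_factory=dict)

    result = StubResult("Revenue grew 12% YoY.", {"filename": "report.pdf", "page_label": "4"})
    filename = result.metadata.get("filename", "Unknown")
    page = result.metadata.get("page_label")
    source_label = f"{filename}, p.{page}" if page else filename
    print(f"[Source: {source_label}]\n{result.content}\n")
    # [Source: report.pdf, p.4]
    # Revenue grew 12% YoY.
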
src/utils/db_credential_encryption.py
ADDED
@@ -0,0 +1,70 @@
+"""Fernet encryption utilities for user-registered database credentials.
+
+Encryption key is sourced from `dataeyond__db__credential__key` env variable,
+intentionally separate from the user-auth bcrypt salt (`emarcal__bcrypt__salt`).
+
+Usage:
+    from src.utils.db_credential_encryption import encrypt_credentials_dict, decrypt_credentials_dict
+
+    # Before INSERT:
+    safe_creds = encrypt_credentials_dict(raw_credentials)
+
+    # After SELECT:
+    plain_creds = decrypt_credentials_dict(row.credentials)
+"""
+
+from cryptography.fernet import Fernet
+from src.config.settings import settings
+
+# Sensitive credential field names that must be encrypted at rest.
+# Covers all supported DB types:
+#   - password             : postgres, mysql, sqlserver, supabase, snowflake
+#   - service_account_json : bigquery
+SENSITIVE_FIELDS: frozenset[str] = frozenset({"password", "service_account_json"})
+
+
+def _get_cipher() -> Fernet:
+    key = settings.dataeyond_db_credential_key
+    if not key:
+        raise ValueError(
+            "dataeyond__db__credential__key is not set. "
+            "Generate one with: Fernet.generate_key().decode()"
+        )
+    return Fernet(key.encode())
+
+
+def encrypt_credential(value: str) -> str:
+    """Encrypt a single credential string value."""
+    return _get_cipher().encrypt(value.encode()).decode()
+
+
+def decrypt_credential(value: str) -> str:
+    """Decrypt a single Fernet-encrypted credential string."""
+    return _get_cipher().decrypt(value.encode()).decode()
+
+
+def encrypt_credentials_dict(creds: dict) -> dict:
+    """Return a copy of the credentials dict with sensitive fields encrypted.
+
+    Call this before inserting a new DatabaseClient record.
+    """
+    cipher = _get_cipher()
+    result = dict(creds)
+    for field in SENSITIVE_FIELDS:
+        if result.get(field):
+            result[field] = cipher.encrypt(result[field].encode()).decode()
+    return result
+
+
+def decrypt_credentials_dict(creds: dict) -> dict:
+    """Return a copy of the credentials dict with sensitive fields decrypted.
+
+    Call this after fetching a DatabaseClient record from DB.
+    """
+    cipher = _get_cipher()
+    result = dict(creds)
+    for field in SENSITIVE_FIELDS:
+        if result.get(field):
+            result[field] = cipher.decrypt(result[field].encode()).decode()
+    return result
+
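
A round-trip sketch of the helpers above. The locally generated key stands in for the dataeyond__db__credential__key env variable; the encrypt/decrypt calls mirror the module's logic.

    # Fernet round trip (key generated locally for illustration)
    from cryptography.fernet import Fernet

    key = Fernet.generate_key()  # production reads this from settings
    cipher = Fernet(key)

    creds = {"host": "db.example.com", "user": "app", "password": "s3cret"}
    encrypted = dict(creds)
    encrypted["password"] = cipher.encrypt(creds["password"].encode()).decode()
    assert encrypted["password"] != creds["password"]  # ciphertext at rest

    decrypted = dict(encrypted)
    decrypted["password"] = cipher.decrypt(encrypted["password"].encode()).decode()
    assert decrypted == creds  # lossless round trip
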
uv.lock
CHANGED
@@ -1,5 +1,5 @@
 version = 1
-revision =
+revision = 3
 requires-python = "==3.12.*"
 resolution-markers = [
     "python_full_version >= '3.12.4'",
@@ -39,6 +39,7 @@ dependencies = [
     { name = "orjson" },
     { name = "pandas" },
     { name = "passlib", extra = ["bcrypt"] },
+    { name = "pdf2image" },
     { name = "pgvector" },
     { name = "plotly" },
     { name = "presidio-analyzer" },
@@ -46,10 +47,15 @@
     { name = "prometheus-client" },
     { name = "psycopg", extra = ["binary", "pool"] },
     { name = "psycopg2" },
+    { name = "pyarrow" },
     { name = "pydantic" },
     { name = "pydantic-settings" },
     { name = "pymongo" },
+    { name = "pymssql" },
+    { name = "pymysql" },
     { name = "pypdf" },
+    { name = "pypdf2" },
+    { name = "pytesseract" },
     { name = "python-docx" },
     { name = "python-dotenv" },
     { name = "python-multipart" },
@@ -57,8 +63,11 @@
     { name = "redis" },
     { name = "sentence-transformers" },
     { name = "slowapi" },
+    { name = "snowflake-sqlalchemy" },
     { name = "spacy" },
     { name = "sqlalchemy", extra = ["asyncio"] },
+    { name = "sqlalchemy-bigquery" },
+    { name = "sqlglot" },
     { name = "sse-starlette" },
     { name = "starlette" },
     { name = "structlog" },
@@ -80,11 +89,8 @@ dev = [
 
 [package.dev-dependencies]
 dev = [
-    { name = "mypy" },
-    { name = "pre-commit" },
     { name = "pytest" },
     { name = "pytest-asyncio" },
-    { name = "pytest-cov" },
     { name = "ruff" },
 ]
 
@@ -120,6 +126,7 @@ requires-dist = [
     { name = "orjson", specifier = "==3.10.12" },
     { name = "pandas", specifier = "==2.2.3" },
     { name = "passlib", extras = ["bcrypt"], specifier = "==1.7.4" },
+    { name = "pdf2image", specifier = ">=1.17.0" },
     { name = "pgvector", specifier = "==0.3.6" },
     { name = "plotly", specifier = "==5.24.1" },
     { name = "pre-commit", marker = "extra == 'dev'", specifier = "==4.0.1" },
@@ -128,10 +135,15 @@
     { name = "prometheus-client", specifier = "==0.21.1" },
     { name = "psycopg", extras = ["binary", "pool"], specifier = "==3.2.3" },
     { name = "psycopg2", specifier = ">=2.9.11" },
+    { name = "pyarrow", specifier = ">=24.0.0" },
     { name = "pydantic", specifier = "==2.10.3" },
     { name = "pydantic-settings", specifier = "==2.7.0" },
     { name = "pymongo", specifier = ">=4.14.0" },
+    { name = "pymssql", specifier = ">=2.3.0" },
+    { name = "pymysql", specifier = ">=1.1.1" },
     { name = "pypdf", specifier = "==5.1.0" },
+    { name = "pypdf2", specifier = ">=3.0.1" },
+    { name = "pytesseract", specifier = ">=0.3.13" },
     { name = "pytest", marker = "extra == 'dev'", specifier = "==8.3.4" },
     { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = "==0.24.0" },
     { name = "pytest-cov", marker = "extra == 'dev'", specifier = "==6.0.0" },
@@ -143,8 +155,11 @@
     { name = "ruff", marker = "extra == 'dev'", specifier = "==0.8.4" },
     { name = "sentence-transformers", specifier = "==3.3.1" },
     { name = "slowapi", specifier = "==0.1.9" },
+    { name = "snowflake-sqlalchemy", specifier = ">=1.7.0" },
     { name = "spacy", specifier = "==3.8.3" },
     { name = "sqlalchemy", extras = ["asyncio"], specifier = "==2.0.36" },
+    { name = "sqlalchemy-bigquery", specifier = ">=1.11.0" },
+    { name = "sqlglot", specifier = ">=25.0.0" },
     { name = "sse-starlette", specifier = "==2.1.3" },
     { name = "starlette", specifier = "==0.41.3" },
     { name = "structlog", specifier = "==24.4.0" },
@@ -156,12 +171,9 @@ provides-extras = ["dev"]
 
 [package.metadata.requires-dev]
 dev = [
-    { name = "
-    { name = "
-    { name = "
-    { name = "pytest-asyncio", specifier = "==0.24.0" },
-    { name = "pytest-cov", specifier = "==6.0.0" },
-    { name = "ruff", specifier = "==0.8.4" },
+    { name = "pytest", specifier = ">=8.3.4" },
+    { name = "pytest-asyncio", specifier = ">=0.24.0" },
+    { name = "ruff", specifier = ">=0.8.4" },
 ]
 
 [[package]]
@@ -280,6 +292,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/13/b5/7af0cb920a476dccd612fbc9a21a3745fb29b1fcd74636078db8f7ba294c/APScheduler-3.10.4-py3-none-any.whl", hash = "sha256:fb91e8a768632a4756a585f79ec834e0e27aad5860bac7eaa523d9ccefd87661", size = 59303, upload-time = "2023-08-19T16:44:56.814Z" },
 ]
 
+[[package]]
+name = "asn1crypto"
+version = "1.5.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/de/cf/d547feed25b5244fcb9392e288ff9fdc3280b10260362fc45d37a798a6ee/asn1crypto-1.5.1.tar.gz", hash = "sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c", size = 121080, upload-time = "2022-03-15T14:46:52.889Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c9/7f/09065fd9e27da0eda08b4d6897f1c13535066174cc023af248fc2a8d5e5a/asn1crypto-1.5.1-py2.py3-none-any.whl", hash = "sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67", size = 105045, upload-time = "2022-03-15T14:46:51.055Z" },
+]
+
 [[package]]
 name = "asyncpg"
 version = "0.30.0"
@@ -428,6 +449,34 @@
     { url = "https://files.pythonhosted.org/packages/20/07/fb43edc2ff0a6a367e4a94fc39eb3b85aa1e55e24cc857af2db145ce9f0d/blis-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:f20f7ad69aaffd1ce14fe77de557b6df9b61e0c9e582f75a843715d836b5c8af", size = 6192759, upload-time = "2025-11-17T12:27:56.176Z" },
 ]
 
+[[package]]
+name = "boto3"
+version = "1.42.89"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "botocore" },
+    { name = "jmespath" },
+    { name = "s3transfer" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/bb/0c/f7bccb22b245cabf392816baba20f9e95f78ace7dbc580fd40136e80e732/boto3-1.42.89.tar.gz", hash = "sha256:3e43aacc0801bba9bcd23a8c271c089af297a69565f783fcdd357ae0e330bf1e", size = 113165, upload-time = "2026-04-13T19:36:17.516Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b9/33/55103ba5ef9975ea54b8d39e69b76eb6e9fded3beae5f01065e26951a3a1/boto3-1.42.89-py3-none-any.whl", hash = "sha256:6204b189f4d0c655535f43d7eaa57ff4e8d965b8463c97e45952291211162932", size = 140556, upload-time = "2026-04-13T19:36:13.894Z" },
+]
+
+[[package]]
+name = "botocore"
+version = "1.42.89"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "jmespath" },
+    { name = "python-dateutil" },
+    { name = "urllib3" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/0f/cc/e6be943efa9051bd15c2ee14077c2b10d6e27c9e9385fc43a03a5c4ed8b5/botocore-1.42.89.tar.gz", hash = "sha256:95ac52f472dad29942f3088b278ab493044516c16dbf9133c975af16527baa99", size = 15206290, upload-time = "2026-04-13T19:36:02.321Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/91/f1/90a7b8eda38b7c3a65ca7ee0075bdf310b6b471cb1b95fab6e8994323a50/botocore-1.42.89-py3-none-any.whl", hash = "sha256:d9b786c8d9db6473063b4cc5be0ba7e6a381082307bd6afb69d4216f9fa95f35", size = 14887287, upload-time = "2026-04-13T19:35:56.677Z" },
+]
+
 [[package]]
 name = "cachetools"
 version = "5.5.0"
@@ -941,6 +990,109 @@
     { url = "https://files.pythonhosted.org/packages/d5/1f/5f4a3cd9e4440e9d9bc78ad0a91a1c8d46b4d429d5239ebe6793c9fe5c41/fsspec-2026.3.0-py3-none-any.whl", hash = "sha256:d2ceafaad1b3457968ed14efa28798162f1638dbb5d2a6868a2db002a5ee39a4", size = 202595, upload-time = "2026-03-27T19:11:13.595Z" },
 ]
 
+[[package]]
+name = "google-api-core"
+version = "2.30.3"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "google-auth" },
+    { name = "googleapis-common-protos" },
+    { name = "proto-plus" },
+    { name = "protobuf" },
+    { name = "requests" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/16/ce/502a57fb0ec752026d24df1280b162294b22a0afb98a326084f9a979138b/google_api_core-2.30.3.tar.gz", hash = "sha256:e601a37f148585319b26db36e219df68c5d07b6382cff2d580e83404e44d641b", size = 177001, upload-time = "2026-04-10T00:41:28.035Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/03/15/e56f351cf6ef1cfea58e6ac226a7318ed1deb2218c4b3cc9bd9e4b786c5a/google_api_core-2.30.3-py3-none-any.whl", hash = "sha256:a85761ba72c444dad5d611c2220633480b2b6be2521eca69cca2dbb3ffd6bfe8", size = 173274, upload-time = "2026-04-09T22:57:16.198Z" },
+]
+
+[package.optional-dependencies]
+grpc = [
+    { name = "grpcio" },
+    { name = "grpcio-status" },
+]
+
+[[package]]
+name = "google-auth"
+version = "2.49.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cryptography" },
+    { name = "pyasn1-modules" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/c6/fc/e925290a1ad95c975c459e2df070fac2b90954e13a0370ac505dff78cb99/google_auth-2.49.2.tar.gz", hash = "sha256:c1ae38500e73065dcae57355adb6278cf8b5c8e391994ae9cbadbcb9631ab409", size = 333958, upload-time = "2026-04-10T00:41:21.888Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/73/76/d241a5c927433420507215df6cac1b1fa4ac0ba7a794df42a84326c68da8/google_auth-2.49.2-py3-none-any.whl", hash = "sha256:c2720924dfc82dedb962c9f52cabb2ab16714fd0a6a707e40561d217574ed6d5", size = 240638, upload-time = "2026-04-10T00:41:14.501Z" },
+]
+
+[[package]]
+name = "google-cloud-bigquery"
+version = "3.41.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "google-api-core", extra = ["grpc"] },
+    { name = "google-auth" },
+    { name = "google-cloud-core" },
+    { name = "google-resumable-media" },
+    { name = "packaging" },
+    { name = "python-dateutil" },
+    { name = "requests" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/ce/13/6515c7aab55a4a0cf708ffd309fb9af5bab54c13e32dc22c5acd6497193c/google_cloud_bigquery-3.41.0.tar.gz", hash = "sha256:2217e488b47ed576360c9b2cc07d59d883a54b83167c0ef37f915c26b01a06fe", size = 513434, upload-time = "2026-03-30T22:50:55.347Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/40/33/1d3902efadef9194566d499d61507e1f038454e0b55499d2d7f8ab2a4fee/google_cloud_bigquery-3.41.0-py3-none-any.whl", hash = "sha256:2a5b5a737b401cbd824a6e5eac7554100b878668d908e6548836b5d8aaa4dcaa", size = 262343, upload-time = "2026-03-30T22:48:45.444Z" },
+]
+
+[[package]]
+name = "google-cloud-core"
+version = "2.5.1"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "google-api-core" },
+    { name = "google-auth" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/dc/24/6ca08b0a03c7b0c620427503ab00353a4ae806b848b93bcea18b6b76fde6/google_cloud_core-2.5.1.tar.gz", hash = "sha256:3dc94bdec9d05a31d9f355045ed0f369fbc0d8c665076c734f065d729800f811", size = 36078, upload-time = "2026-03-30T22:50:08.057Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/73/d9/5bb050cb32826466aa9b25f79e2ca2879fe66cb76782d4ed798dd7506151/google_cloud_core-2.5.1-py3-none-any.whl", hash = "sha256:ea62cdf502c20e3e14be8a32c05ed02113d7bef454e40ff3fab6fe1ec9f1f4e7", size = 29452, upload-time = "2026-03-30T22:48:31.567Z" },
+]
+
+[[package]]
+name = "google-crc32c"
+version = "1.8.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/03/41/4b9c02f99e4c5fb477122cd5437403b552873f014616ac1d19ac8221a58d/google_crc32c-1.8.0.tar.gz", hash = "sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79", size = 14192, upload-time = "2025-12-16T00:35:25.142Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/e9/5f/7307325b1198b59324c0fa9807cafb551afb65e831699f2ce211ad5c8240/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:4b8286b659c1335172e39563ab0a768b8015e88e08329fa5321f774275fc3113", size = 31300, upload-time = "2025-12-16T00:21:56.723Z" },
+    { url = "https://files.pythonhosted.org/packages/21/8e/58c0d5d86e2220e6a37befe7e6a94dd2f6006044b1a33edf1ff6d9f7e319/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:2a3dc3318507de089c5384cc74d54318401410f82aa65b2d9cdde9d297aca7cb", size = 30867, upload-time = "2025-12-16T00:38:31.302Z" },
+    { url = "https://files.pythonhosted.org/packages/ce/a9/a780cc66f86335a6019f557a8aaca8fbb970728f0efd2430d15ff1beae0e/google_crc32c-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:14f87e04d613dfa218d6135e81b78272c3b904e2a7053b841481b38a7d901411", size = 33364, upload-time = "2025-12-16T00:40:22.96Z" },
+    { url = "https://files.pythonhosted.org/packages/21/3f/3457ea803db0198c9aaca2dd373750972ce28a26f00544b6b85088811939/google_crc32c-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cb5c869c2923d56cb0c8e6bcdd73c009c36ae39b652dbe46a05eb4ef0ad01454", size = 33740, upload-time = "2025-12-16T00:40:23.96Z" },
+    { url = "https://files.pythonhosted.org/packages/df/c0/87c2073e0c72515bb8733d4eef7b21548e8d189f094b5dad20b0ecaf64f6/google_crc32c-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:3cc0c8912038065eafa603b238abf252e204accab2a704c63b9e14837a854962", size = 34437, upload-time = "2025-12-16T00:35:21.395Z" },
+]
+
+[[package]]
+name = "google-resumable-media"
+version = "2.8.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "google-crc32c" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/3f/d1/b1ea14b93b6b78f57fc580125de44e9f593ab88dd2460f1a8a8d18f74754/google_resumable_media-2.8.2.tar.gz", hash = "sha256:f3354a182ebd193ae3f42e3ef95e6c9b10f128320de23ac7637236713b1acd70", size = 2164510, upload-time = "2026-03-30T23:34:25.369Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5e/f8/50bfaf4658431ff9de45c5c3935af7ab01157a4903c603cd0eee6e78e087/google_resumable_media-2.8.2-py3-none-any.whl", hash = "sha256:82b6d8ccd11765268cdd2a2123f417ec806b8eef3000a9a38dfe3033da5fb220", size = 81511, upload-time = "2026-03-30T23:34:09.671Z" },
+]
+
+[[package]]
+name = "googleapis-common-protos"
+version = "1.74.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "protobuf" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/20/18/a746c8344152d368a5aac738d4c857012f2c5d1fd2eac7e17b647a7861bd/googleapis_common_protos-1.74.0.tar.gz", hash = "sha256:57971e4eeeba6aad1163c1f0fc88543f965bb49129b8bb55b2b7b26ecab084f1", size = 151254, upload-time = "2026-04-02T21:23:26.679Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b6/b0/be5d3329badb9230b765de6eea66b73abd5944bdeb5afb3562ddcd80ae84/googleapis_common_protos-1.74.0-py3-none-any.whl", hash = "sha256:702216f78610bb510e3f12ac3cafd281b7ac45cc5d86e90ad87e4d301a3426b5", size = 300743, upload-time = "2026-04-02T21:22:49.108Z" },
+]
+
 [[package]]
 name = "greenlet"
 version = "3.3.2"
@@ -958,6 +1110,41 @@
     { url = "https://files.pythonhosted.org/packages/58/2e/fe7f36ff1982d6b10a60d5e0740c759259a7d6d2e1dc41da6d96de32fff6/greenlet-3.3.2-cp312-cp312-win_arm64.whl", hash = "sha256:d3a62fa76a32b462a97198e4c9e99afb9ab375115e74e9a83ce180e7a496f643", size = 230331, upload-time = "2026-02-20T20:17:23.34Z" },
 ]
 
+[[package]]
+name = "grpcio"
+version = "1.80.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b7/48/af6173dbca4454f4637a4678b67f52ca7e0c1ed7d5894d89d434fecede05/grpcio-1.80.0.tar.gz", hash = "sha256:29aca15edd0688c22ba01d7cc01cb000d72b2033f4a3c72a81a19b56fd143257", size = 12978905, upload-time = "2026-03-30T08:49:10.502Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5c/e8/a2b749265eb3415abc94f2e619bbd9e9707bebdda787e61c593004ec927a/grpcio-1.80.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:c624cc9f1008361014378c9d776de7182b11fe8b2e5a81bc69f23a295f2a1ad0", size = 6015616, upload-time = "2026-03-30T08:47:13.428Z" },
+    { url = "https://files.pythonhosted.org/packages/3e/97/b1282161a15d699d1e90c360df18d19165a045ce1c343c7f313f5e8a0b77/grpcio-1.80.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:f49eddcac43c3bf350c0385366a58f36bed8cc2c0ec35ef7b74b49e56552c0c2", size = 12014204, upload-time = "2026-03-30T08:47:15.873Z" },
+    { url = "https://files.pythonhosted.org/packages/6e/5e/d319c6e997b50c155ac5a8cb12f5173d5b42677510e886d250d50264949d/grpcio-1.80.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d334591df610ab94714048e0d5b4f3dd5ad1bee74dfec11eee344220077a79de", size = 6563866, upload-time = "2026-03-30T08:47:18.588Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/f6/fdd975a2cb4d78eb67769a7b3b3830970bfa2e919f1decf724ae4445f42c/grpcio-1.80.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:0cb517eb1d0d0aaf1d87af7cc5b801d686557c1d88b2619f5e31fab3c2315921", size = 7273060, upload-time = "2026-03-30T08:47:21.113Z" },
+    { url = "https://files.pythonhosted.org/packages/db/f0/a3deb5feba60d9538a962913e37bd2e69a195f1c3376a3dd44fe0427e996/grpcio-1.80.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4e78c4ac0d97dc2e569b2f4bcbbb447491167cb358d1a389fc4af71ab6f70411", size = 6782121, upload-time = "2026-03-30T08:47:23.827Z" },
+    { url = "https://files.pythonhosted.org/packages/ca/84/36c6dcfddc093e108141f757c407902a05085e0c328007cb090d56646cdf/grpcio-1.80.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2ed770b4c06984f3b47eb0517b1c69ad0b84ef3f40128f51448433be904634cd", size = 7383811, upload-time = "2026-03-30T08:47:26.517Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/ef/f3a77e3dc5b471a0ec86c564c98d6adfa3510d38f8ee99010410858d591e/grpcio-1.80.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:256507e2f524092f1473071a05e65a5b10d84b82e3ff24c5b571513cfaa61e2f", size = 8393860, upload-time = "2026-03-30T08:47:29.439Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/8d/9d4d27ed7f33d109c50d6b5ce578a9914aa68edab75d65869a17e630a8d1/grpcio-1.80.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9a6284a5d907c37db53350645567c522be314bac859a64a7a5ca63b77bb7958f", size = 7830132, upload-time = "2026-03-30T08:47:33.254Z" },
+    { url = "https://files.pythonhosted.org/packages/14/e4/9990b41c6d7a44e1e9dee8ac11d7a9802ba1378b40d77468a7761d1ad288/grpcio-1.80.0-cp312-cp312-win32.whl", hash = "sha256:c71309cfce2f22be26aa4a847357c502db6c621f1a49825ae98aa0907595b193", size = 4140904, upload-time = "2026-03-30T08:47:35.319Z" },
+    { url = "https://files.pythonhosted.org/packages/2f/2c/296f6138caca1f4b92a31ace4ae1b87dab692fc16a7a3417af3bb3c805bf/grpcio-1.80.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe648599c0e37594c4809d81a9e77bd138cc82eb8baa71b6a86af65426723ff", size = 4880944, upload-time = "2026-03-30T08:47:37.831Z" },
+]
+
+[[package]]
+name = "grpcio-status"
+version = "1.80.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "googleapis-common-protos" },
+    { name = "grpcio" },
+    { name = "protobuf" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/b1/ed/105f619bdd00cb47a49aa2feea6232ea2bbb04199d52a22cc6a7d603b5cb/grpcio_status-1.80.0.tar.gz", hash = "sha256:df73802a4c89a3ea88aa2aff971e886fccce162bc2e6511408b3d67a144381cd", size = 13901, upload-time = "2026-03-30T08:54:34.784Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/76/80/58cd2dfc19a07d022abe44bde7c365627f6c7cb6f692ada6c65ca437d09a/grpcio_status-1.80.0-py3-none-any.whl", hash = "sha256:4b56990363af50dbf2c2ebb80f1967185c07d87aa25aa2bea45ddb75fc181dbe", size = 14638, upload-time = "2026-03-30T08:54:01.569Z" },
+]
+
 [[package]]
 name = "h11"
 version = "0.16.0"
@@ -1127,6 +1314,15 @@
     { url = "https://files.pythonhosted.org/packages/67/8a/a342b2f0251f3dac4ca17618265d93bf244a2a4d089126e81e4c1056ac50/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bb00b6d26db67a05fe3e12c76edc75f32077fb51deed13822dc648fa373bc19", size = 343768, upload-time = "2026-02-02T12:37:55.055Z" },
 ]
 
+[[package]]
+name = "jmespath"
+version = "1.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" },
+]
+
 [[package]]
 name = "joblib"
 version = "1.5.3"
@@ -1954,6 +2150,18 @@ bcrypt = [
     { name = "bcrypt" },
 ]
 
+[[package]]
+name = "pdf2image"
+version = "1.17.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pillow" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/00/d8/b280f01045555dc257b8153c00dee3bc75830f91a744cd5f84ef3a0a64b1/pdf2image-1.17.0.tar.gz", hash = "sha256:eaa959bc116b420dd7ec415fcae49b98100dda3dd18cd2fdfa86d09f112f6d57", size = 12811, upload-time = "2024-01-07T20:33:01.965Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/62/33/61766ae033518957f877ab246f87ca30a85b778ebaad65b7f74fa7e52988/pdf2image-1.17.0-py3-none-any.whl", hash = "sha256:ecdd58d7afb810dffe21ef2b1bbc057ef434dabbac6c33778a38a3f7744a27e2", size = 11618, upload-time = "2024-01-07T20:32:59.957Z" },
+]
+
 [[package]]
 name = "pgvector"
 version = "0.3.6"
@@ -2121,6 +2329,33 @@
     { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" },
 ]
 
+[[package]]
+name = "proto-plus"
+version = "1.27.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "protobuf" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/81/0d/94dfe80193e79d55258345901acd2917523d56e8381bc4dee7fd38e3868a/proto_plus-1.27.2.tar.gz", hash = "sha256:b2adde53adadf75737c44d3dcb0104fde65250dfc83ad59168b4aa3e574b6a24", size = 57204, upload-time = "2026-03-26T22:18:57.174Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/84/f3/1fba73eeffafc998a25d59703b63f8be4fe8a5cb12eaff7386a0ba0f7125/proto_plus-1.27.2-py3-none-any.whl", hash = "sha256:6432f75893d3b9e70b9c412f1d2f03f65b11fb164b793d14ae2ca01821d22718", size = 50450, upload-time = "2026-03-26T22:13:42.927Z" },
+]
+
+[[package]]
+name = "protobuf"
+version = "6.33.6"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/66/70/e908e9c5e52ef7c3a6c7902c9dfbb34c7e29c25d2f81ade3856445fd5c94/protobuf-6.33.6.tar.gz", hash = "sha256:a6768d25248312c297558af96a9f9c929e8c4cee0659cb07e780731095f38135", size = 444531, upload-time = "2026-03-18T19:05:00.988Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/fc/9f/2f509339e89cfa6f6a4c4ff50438db9ca488dec341f7e454adad60150b00/protobuf-6.33.6-cp310-abi3-win32.whl", hash = "sha256:7d29d9b65f8afef196f8334e80d6bc1d5d4adedb449971fefd3723824e6e77d3", size = 425739, upload-time = "2026-03-18T19:04:48.373Z" },
+    { url = "https://files.pythonhosted.org/packages/76/5d/683efcd4798e0030c1bab27374fd13a89f7c2515fb1f3123efdfaa5eab57/protobuf-6.33.6-cp310-abi3-win_amd64.whl", hash = "sha256:0cd27b587afca21b7cfa59a74dcbd48a50f0a6400cfb59391340ad729d91d326", size = 437089, upload-time = "2026-03-18T19:04:50.381Z" },
+    { url = "https://files.pythonhosted.org/packages/5c/01/a3c3ed5cd186f39e7880f8303cc51385a198a81469d53d0fdecf1f64d929/protobuf-6.33.6-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:9720e6961b251bde64edfdab7d500725a2af5280f3f4c87e57c0208376aa8c3a", size = 427737, upload-time = "2026-03-18T19:04:51.866Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/90/b3c01fdec7d2f627b3a6884243ba328c1217ed2d978def5c12dc50d328a3/protobuf-6.33.6-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:e2afbae9b8e1825e3529f88d514754e094278bb95eadc0e199751cdd9a2e82a2", size = 324610, upload-time = "2026-03-18T19:04:53.096Z" },
+    { url = "https://files.pythonhosted.org/packages/9b/ca/25afc144934014700c52e05103c2421997482d561f3101ff352e1292fb81/protobuf-6.33.6-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:c96c37eec15086b79762ed265d59ab204dabc53056e3443e702d2681f4b39ce3", size = 339381, upload-time = "2026-03-18T19:04:54.616Z" },
+    { url = "https://files.pythonhosted.org/packages/16/92/d1e32e3e0d894fe00b15ce28ad4944ab692713f2e7f0a99787405e43533a/protobuf-6.33.6-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:e9db7e292e0ab79dd108d7f1a94fe31601ce1ee3f7b79e0692043423020b0593", size = 323436, upload-time = "2026-03-18T19:04:55.768Z" },
+    { url = "https://files.pythonhosted.org/packages/c4/72/02445137af02769918a93807b2b7890047c32bfb9f90371cbc12688819eb/protobuf-6.33.6-py3-none-any.whl", hash = "sha256:77179e006c476e69bf8e8ce866640091ec42e1beb80b213c3900006ecfba6901", size = 170656, upload-time = "2026-03-18T19:04:59.826Z" },
+]
+
 [[package]]
 name = "psycopg"
 version = "3.2.3"
@@ -2181,6 +2416,42 @@
     { url = "https://files.pythonhosted.org/packages/b5/bf/635fbe5dd10ed200afbbfbe98f8602829252ca1cce81cc48fb25ed8dadc0/psycopg2-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:e03e4a6dbe87ff81540b434f2e5dc2bddad10296db5eea7bdc995bf5f4162938", size = 2713969, upload-time = "2025-10-10T11:10:15.946Z" },
 ]
 
+[[package]]
+name = "pyarrow"
+version = "24.0.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559, upload-time = "2026-04-21T10:47:22.17Z" },
+    { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654, upload-time = "2026-04-21T10:47:28.315Z" },
+    { url = "https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394, upload-time = "2026-04-21T10:47:34.821Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122, upload-time = "2026-04-21T10:47:42.056Z" },
+    { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032, upload-time = "2026-04-21T10:47:48.967Z" },
    { url = "https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490, upload-time = "2026-04-21T10:47:55.981Z" },
+    { url = "https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660, upload-time = "2026-04-21T10:48:01.718Z" },
+]
+
+[[package]]
+name = "pyasn1"
+version = "0.6.3"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/5c/5f/6583902b6f79b399c9c40674ac384fd9cd77805f9e6205075f828ef11fb2/pyasn1-0.6.3.tar.gz", hash = "sha256:697a8ecd6d98891189184ca1fa05d1bb00e2f84b5977c481452050549c8a72cf", size = 148685, upload-time = "2026-03-17T01:06:53.382Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/5d/a0/7d793dce3fa811fe047d6ae2431c672364b462850c6235ae306c0efd025f/pyasn1-0.6.3-py3-none-any.whl", hash = "sha256:a80184d120f0864a52a073acc6fc642847d0be408e7c7252f31390c0f4eadcde", size = 83997, upload-time = "2026-03-17T01:06:52.036Z" },
+]
+
+[[package]]
+name = "pyasn1-modules"
+version = "0.4.2"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "pyasn1" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" },
+]
+
 [[package]]
 name = "pycparser"
 version = "3.0"
@@ -2310,6 +2581,43 @@
     { url = "https://files.pythonhosted.org/packages/60/4c/33f75713d50d5247f2258405142c0318ff32c6f8976171c4fcae87a9dbdf/pymongo-4.16.0-cp312-cp312-win_arm64.whl", hash = "sha256:dfc320f08ea9a7ec5b2403dc4e8150636f0d6150f4b9792faaae539c88e7db3b", size = 892971, upload-time = "2026-01-07T18:04:35.594Z" },
 ]
 
+[[package]]
+name = "pymssql"
+version = "2.3.13"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/7a/cc/843c044b7f71ee329436b7327c578383e2f2499313899f88ad267cdf1f33/pymssql-2.3.13.tar.gz", hash = "sha256:2137e904b1a65546be4ccb96730a391fcd5a85aab8a0632721feb5d7e39cfbce", size = 203153, upload-time = "2026-02-14T05:00:36.865Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ba/60/a2e8a8a38f7be21d54402e2b3365cd56f1761ce9f2706c97f864e8aa8300/pymssql-2.3.13-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cf4f32b4a05b66f02cb7d55a0f3bcb0574a6f8cf0bee4bea6f7b104038364733", size = 3158689, upload-time = "2026-02-14T04:59:46.982Z" },
+    { url = "https://files.pythonhosted.org/packages/43/9e/0cf0ffb9e2f73238baf766d8e31d7237b5bee3cc1bb29a376b404610994a/pymssql-2.3.13-cp312-cp312-macosx_15_0_x86_64.whl", hash = "sha256:2b056eb175955f7fb715b60dc1c0c624969f4d24dbdcf804b41ab1e640a2b131", size = 2960018, upload-time = "2026-02-14T04:59:48.668Z" },
+    { url = "https://files.pythonhosted.org/packages/93/ea/bc27354feaca717faa4626911f6b19bb62985c87dda28957c63de4de5895/pymssql-2.3.13-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:319810b89aa64b99d9c5c01518752c813938df230496fa2c4c6dda0603f04c4c", size = 3065719, upload-time = "2026-02-14T04:59:50.369Z" },
+    { url = "https://files.pythonhosted.org/packages/1e/7a/8028681c96241fb5fc850b87c8959402c353e4b83c6e049a99ffa67ded54/pymssql-2.3.13-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0ea72641cb0f8bce7ad8565dbdbda4a7437aa58bce045f2a3a788d71af2e4be", size = 3190567, upload-time = "2026-02-14T04:59:52.202Z" },
+    { url = "https://files.pythonhosted.org/packages/aa/f1/ab5b76adbbd6db9ce746d448db34b044683522e7e7b95053f9dd0165297b/pymssql-2.3.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1493f63d213607f708a5722aa230776ada726ccdb94097fab090a1717a2534e0", size = 3710481, upload-time = "2026-02-14T04:59:54.01Z" },
+    { url = "https://files.pythonhosted.org/packages/59/aa/2fa0951475cd0a1829e0b8bfbe334d04ece4bce11546a556b005c4100689/pymssql-2.3.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eb3275985c23479e952d6462ae6c8b2b6993ab6b99a92805a9c17942cf3d5b3d", size = 3453789, upload-time = "2026-02-14T04:59:56.841Z" },
+    { url = "https://files.pythonhosted.org/packages/78/08/8cd2af9003f9fc03912b658a64f5a4919dcd68f0dd3bbc822b49a3d14fd9/pymssql-2.3.13-cp312-cp312-win_amd64.whl", hash = "sha256:a930adda87bdd8351a5637cf73d6491936f34e525a5e513068a6eac742f69cdb", size = 1994709, upload-time = "2026-02-14T04:59:58.972Z" },
+]
+
+[[package]]
+name = "pymysql"
+version = "1.1.2"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/f5/ae/1fe3fcd9f959efa0ebe200b8de88b5a5ce3e767e38c7ac32fb179f16a388/pymysql-1.1.2.tar.gz", hash = "sha256:4961d3e165614ae65014e361811a724e2044ad3ea3739de9903ae7c21f539f03", size = 48258, upload-time = "2025-08-24T12:55:55.146Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7c/4c/ad33b92b9864cbde84f259d5df035a6447f91891f5be77788e2a3892bce3/pymysql-1.1.2-py3-none-any.whl", hash = "sha256:e6b1d89711dd51f8f74b1631fe08f039e7d76cf67a42a323d3178f0f25762ed9", size = 45300, upload-time = "2025-08-24T12:55:53.394Z" },
+]
+
+[[package]]
+name = "pyopenssl"
+version = "25.1.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "cryptography" },
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/04/8c/cd89ad05804f8e3c17dea8f178c3f40eeab5694c30e0c9f5bcd49f576fc3/pyopenssl-25.1.0.tar.gz", hash = "sha256:8d031884482e0c67ee92bf9a4d8cceb08d92aba7136432ffb0703c5280fc205b", size = 179937, upload-time = "2025-05-17T16:28:31.31Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771, upload-time = "2025-05-17T16:28:29.197Z" },
+]
+
 [[package]]
 name = "pyparsing"
 version = "3.3.2"
@@ -2328,6 +2636,28 @@
     { url = "https://files.pythonhosted.org/packages/04/fc/6f52588ac1cb4400a7804ef88d0d4e00cfe57a7ac6793ec3b00de5a8758b/pypdf-5.1.0-py3-none-any.whl", hash = "sha256:3bd4f503f4ebc58bae40d81e81a9176c400cbbac2ba2d877367595fb524dfdfc", size = 297976, upload-time = "2024-10-27T19:46:44.439Z" },
 ]
 
+[[package]]
+name = "pypdf2"
+version = "3.0.1"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" },
+]
+
+[[package]]
+name = "pytesseract"
+version = "0.3.13"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "packaging" },
+    { name = "pillow" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9f/a6/7d679b83c285974a7cb94d739b461fa7e7a9b17a3abfd7bf6cbc5c2394b0/pytesseract-0.3.13.tar.gz", hash = "sha256:4bf5f880c99406f52a3cfc2633e42d9dc67615e69d8a509d74867d3baddb5db9", size = 17689, upload-time = "2024-08-16T02:33:56.762Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/7a/33/8312d7ce74670c9d39a532b2c246a853861120486be9443eebf048043637/pytesseract-0.3.13-py3-none-any.whl", hash = "sha256:7a99c6c2ac598360693d83a416e36e0b33a67638bb9d77fdcac094a3589d4b34", size = 14705, upload-time = "2024-08-16T02:36:10.09Z" },
+]
+
 [[package]]
 name = "pytest"
 version = "8.3.4"
@@ -2610,6 +2940,18 @@
     { url = "https://files.pythonhosted.org/packages/13/9f/026e18ca7d7766783d779dae5e9c656746c6ede36ef73c6d934aaf4a6dec/ruff-0.8.4-py3-none-win_arm64.whl", hash = "sha256:9183dd615d8df50defa8b1d9a074053891ba39025cf5ae88e8bcb52edcc4bf08", size = 9074500, upload-time = "2024-12-19T13:36:23.92Z" },
 ]
 
 [[package]]
 name = "safetensors"
 version = "0.7.0"
@@ -2764,6 +3106,60 @@
     { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
 ]
 
 [[package]]
 name = "spacy"
 version = "3.8.3"
@@ -2842,6 +3238,31 @@ asyncio = [
     { name = "greenlet" },
 ]
 
 [[package]]
 name = "srsly"
 version = "2.5.3"
@@ -3015,6 +3436,15 @@
     { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
 ]
 
 [[package]]
 name = "torch"
 version = "2.11.0"
{ url = "https://files.pythonhosted.org/packages/13/9f/026e18ca7d7766783d779dae5e9c656746c6ede36ef73c6d934aaf4a6dec/ruff-0.8.4-py3-none-win_arm64.whl", hash = "sha256:9183dd615d8df50defa8b1d9a074053891ba39025cf5ae88e8bcb52edcc4bf08", size = 9074500, upload-time = "2024-12-19T13:36:23.92Z" },
|
| 2941 |
]
|
| 2942 |
|
| 2943 |
+
[[package]]
|
| 2944 |
+
name = "s3transfer"
|
| 2945 |
+
version = "0.16.0"
|
| 2946 |
+
source = { registry = "https://pypi.org/simple" }
|
| 2947 |
+
dependencies = [
|
| 2948 |
+
{ name = "botocore" },
|
| 2949 |
+
]
|
| 2950 |
+
sdist = { url = "https://files.pythonhosted.org/packages/05/04/74127fc843314818edfa81b5540e26dd537353b123a4edc563109d8f17dd/s3transfer-0.16.0.tar.gz", hash = "sha256:8e990f13268025792229cd52fa10cb7163744bf56e719e0b9cb925ab79abf920", size = 153827, upload-time = "2025-12-01T02:30:59.114Z" }
|
| 2951 |
+
wheels = [
|
| 2952 |
+
{ url = "https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl", hash = "sha256:18e25d66fed509e3868dc1572b3f427ff947dd2c56f844a5bf09481ad3f3b2fe", size = 86830, upload-time = "2025-12-01T02:30:57.729Z" },
|
| 2953 |
+
]
|
| 2954 |
+
|
| 2955 |
[[package]]
|
| 2956 |
name = "safetensors"
|
| 2957 |
version = "0.7.0"
|
|
|
|
| 3106 |
{ url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
|
| 3107 |
]
|
| 3108 |
|
| 3109 |
+
[[package]]
|
| 3110 |
+
name = "snowflake-connector-python"
|
| 3111 |
+
version = "4.0.0"
|
| 3112 |
+
source = { registry = "https://pypi.org/simple" }
|
| 3113 |
+
dependencies = [
|
| 3114 |
+
{ name = "asn1crypto" },
|
| 3115 |
+
{ name = "boto3" },
|
| 3116 |
+
{ name = "botocore" },
|
| 3117 |
+
{ name = "certifi" },
|
| 3118 |
+
{ name = "charset-normalizer" },
|
| 3119 |
+
{ name = "cryptography" },
|
| 3120 |
+
{ name = "filelock" },
|
| 3121 |
+
{ name = "idna" },
|
| 3122 |
+
{ name = "packaging" },
|
| 3123 |
+
{ name = "platformdirs" },
|
| 3124 |
+
{ name = "pyjwt" },
|
| 3125 |
+
{ name = "pyopenssl" },
|
| 3126 |
+
{ name = "pytz" },
|
| 3127 |
+
{ name = "requests" },
|
| 3128 |
+
{ name = "sortedcontainers" },
|
| 3129 |
+
{ name = "tomlkit" },
|
| 3130 |
+
{ name = "typing-extensions" },
|
| 3131 |
+
]
|
| 3132 |
+
sdist = { url = "https://files.pythonhosted.org/packages/1d/f1/4aff125021a9c5e0183f2f55dd7d04b7256a0e1e10db50d537a7415d9c55/snowflake_connector_python-4.0.0.tar.gz", hash = "sha256:4b10a865c4a5e1fa60c365c7fe41e0433605e6e5edc824e8730a9038f330b3a6", size = 813937, upload-time = "2025-10-09T10:11:34.631Z" }
|
| 3133 |
+
wheels = [
|
| 3134 |
+
{ url = "https://files.pythonhosted.org/packages/ea/b0/462c0deee35d6d03d3d729b3f923615bae665beb7f9a94673a23a52080fe/snowflake_connector_python-4.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bfd3b8523d7adc830f99c5c4c635689ceca61700a05368d5bbb34c6811f2ec54", size = 1029568, upload-time = "2025-10-09T10:11:42.125Z" },
|
| 3135 |
+
{ url = "https://files.pythonhosted.org/packages/ff/4b/bb3ae3f07e7927c8f16c4c0f1283d3c721978d16e8bf4193fc8e41025c1e/snowflake_connector_python-4.0.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:835161dd46ef8f5fc9d2f135ca654c2f3fbdf57b035d3e1980506aa8eac671dc", size = 1041337, upload-time = "2025-10-09T10:11:43.692Z" },
|
| 3136 |
+
{ url = "https://files.pythonhosted.org/packages/9c/75/4bfac89f10c6dbb75e97adf1e217737fc599ebf964031c9298b6cbd807d0/snowflake_connector_python-4.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65e4e36dd1b0c7235d84cddef8a3c97c5ea0dc8fea85e31e45fc485000b77a83", size = 2699730, upload-time = "2025-10-09T10:11:25.295Z" },
|
| 3137 |
+
{ url = "https://files.pythonhosted.org/packages/cd/78/0e916416c50909dbae511fe38b1e671a9efa62decdce51b174a0396804e4/snowflake_connector_python-4.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6132986d6965e4005b0167270612fbc7fa4bc4ef42726a40b85a8f57475a78d", size = 2731336, upload-time = "2025-10-09T10:11:27.028Z" },
|
| 3138 |
+
{ url = "https://files.pythonhosted.org/packages/83/f0/3db8a2f3f5ee724d309c661af739a70d0643070b9b4597728151ef900f9b/snowflake_connector_python-4.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a790f06808e4481c23cfed1396d2c9a786060ddd62408b1fda1a63e1e6bc4b07", size = 1176292, upload-time = "2025-10-09T10:11:54.956Z" },
|
| 3139 |
+
]
|
| 3140 |
+
|
| 3141 |
+
[[package]]
|
| 3142 |
+
name = "snowflake-sqlalchemy"
|
| 3143 |
+
version = "1.9.0"
|
| 3144 |
+
source = { registry = "https://pypi.org/simple" }
|
| 3145 |
+
dependencies = [
|
| 3146 |
+
{ name = "snowflake-connector-python" },
|
| 3147 |
+
{ name = "sqlalchemy" },
|
| 3148 |
+
]
|
| 3149 |
+
sdist = { url = "https://files.pythonhosted.org/packages/ff/6a/fcc5c00c3a253029a7b7b293a3958ba07d5e97623b643de47be0cc9e5530/snowflake_sqlalchemy-1.9.0.tar.gz", hash = "sha256:fb32baf559f7f933ae8fde2ec535bcea5381bb15188777cd8c006b3226efa3b1", size = 141707, upload-time = "2026-03-04T13:48:17.905Z" }
|
| 3150 |
+
wheels = [
|
| 3151 |
+
{ url = "https://files.pythonhosted.org/packages/88/28/b7ae8df80847e8157b74669ad7e1b0180e82ac0e3daf950612effd232fea/snowflake_sqlalchemy-1.9.0-py3-none-any.whl", hash = "sha256:f0b1528173e93c8c80bd9ca510985054667e0e514dd90b890271ac1cfae261c1", size = 78953, upload-time = "2026-03-04T13:48:16.393Z" },
|
| 3152 |
+
]
|
| 3153 |
+
|
| 3154 |
+
[[package]]
|
| 3155 |
+
name = "sortedcontainers"
|
| 3156 |
+
version = "2.4.0"
|
| 3157 |
+
source = { registry = "https://pypi.org/simple" }
|
| 3158 |
+
sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" }
|
| 3159 |
+
wheels = [
|
| 3160 |
+
{ url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" },
|
| 3161 |
+
]
|
| 3162 |
+
|
| 3163 |
[[package]]
|
| 3164 |
name = "spacy"
|
| 3165 |
version = "3.8.3"
|
|
|
|
| 3238 |
{ name = "greenlet" },
|
| 3239 |
]
|
| 3240 |
|
| 3241 |
+
[[package]]
|
| 3242 |
+
name = "sqlalchemy-bigquery"
|
| 3243 |
+
version = "1.16.0"
|
| 3244 |
+
source = { registry = "https://pypi.org/simple" }
|
| 3245 |
+
dependencies = [
|
| 3246 |
+
{ name = "google-api-core" },
|
| 3247 |
+
{ name = "google-auth" },
|
| 3248 |
+
{ name = "google-cloud-bigquery" },
|
| 3249 |
+
{ name = "packaging" },
|
| 3250 |
+
{ name = "sqlalchemy" },
|
| 3251 |
+
]
|
| 3252 |
+
sdist = { url = "https://files.pythonhosted.org/packages/7e/6a/c49932b3d9c44cab9202b1866c5b36b7f0d0455d4653fbc0af4466aeaa76/sqlalchemy_bigquery-1.16.0.tar.gz", hash = "sha256:fe937a0d1f4cf7219fcf5d4995c6718805b38d4df43e29398dec5dc7b6d1987e", size = 119632, upload-time = "2025-11-06T01:35:40.373Z" }
|
| 3253 |
+
wheels = [
|
| 3254 |
+
{ url = "https://files.pythonhosted.org/packages/c0/87/11e6de00ef7949bb8ea06b55304a1a4911c329fdf0d9882b464db240c2c5/sqlalchemy_bigquery-1.16.0-py3-none-any.whl", hash = "sha256:0fe7634cd954f3e74f5e2db6d159f9e5ee87a47fbe8d52eac3cd3bb3dadb3a77", size = 40615, upload-time = "2025-11-06T01:35:39.358Z" },
|
| 3255 |
+
]
|
| 3256 |
+
|
| 3257 |
+
[[package]]
|
| 3258 |
+
name = "sqlglot"
|
| 3259 |
+
version = "30.6.0"
|
| 3260 |
+
source = { registry = "https://pypi.org/simple" }
|
| 3261 |
+
sdist = { url = "https://files.pythonhosted.org/packages/3c/66/6ece15f197874e56c76e1d0269cebf284ba992a80dfadca9d1972fdf7edf/sqlglot-30.6.0.tar.gz", hash = "sha256:246d34d39927422a50a3fa155f37b2f6346fba85f1a755b13c941eb32ef93361", size = 5835307, upload-time = "2026-04-20T20:11:08.164Z" }
|
| 3262 |
+
wheels = [
|
| 3263 |
+
{ url = "https://files.pythonhosted.org/packages/dc/e7/64fe971cbca33a0446b06f4a5ff8e3fa4a1dbd0a039ceabcc3e6cf4087a9/sqlglot-30.6.0-py3-none-any.whl", hash = "sha256:e005fc2f47994f90d7d8df341f1cbe937518497b0b7b1507d4c03c4c9dfd2778", size = 673920, upload-time = "2026-04-20T20:11:05.758Z" },
|
| 3264 |
+
]
|
| 3265 |
+
|
| 3266 |
[[package]]
|
| 3267 |
name = "srsly"
|
| 3268 |
version = "2.5.3"
|
|
|
|
| 3436 |
{ url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
|
| 3437 |
]
|
| 3438 |
|
| 3439 |
+
[[package]]
|
| 3440 |
+
name = "tomlkit"
|
| 3441 |
+
version = "0.14.0"
|
| 3442 |
+
source = { registry = "https://pypi.org/simple" }
|
| 3443 |
+
sdist = { url = "https://files.pythonhosted.org/packages/c3/af/14b24e41977adb296d6bd1fb59402cf7d60ce364f90c890bd2ec65c43b5a/tomlkit-0.14.0.tar.gz", hash = "sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064", size = 187167, upload-time = "2026-01-13T01:14:53.304Z" }
|
| 3444 |
+
wheels = [
|
| 3445 |
+
{ url = "https://files.pythonhosted.org/packages/b5/11/87d6d29fb5d237229d67973a6c9e06e048f01cf4994dee194ab0ea841814/tomlkit-0.14.0-py3-none-any.whl", hash = "sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680", size = 39310, upload-time = "2026-01-13T01:14:51.965Z" },
|
| 3446 |
+
]
|
| 3447 |
+
|
| 3448 |
[[package]]
|
| 3449 |
name = "torch"
|
| 3450 |
version = "2.11.0"
|