[KM-438][KM-439] Improve Retrieval and Querying feature

#15
by rhbt6767 - opened
Files changed (46)
  1. .dockerignore +6 -0
  2. .gitattributes +1 -0
  3. .gitignore +13 -1
  4. .vscode/launch.json +25 -0
  5. Dockerfile +2 -0
  6. README.md +2 -0
  7. main.py +2 -0
  8. pyproject.toml +19 -10
  9. src/agents/chatbot.py +11 -1
  10. src/agents/orchestration.py +5 -0
  11. src/api/v1/chat.py +80 -17
  12. src/api/v1/db_client.py +471 -3
  13. src/api/v1/document.py +43 -128
  14. src/config/agents/system_prompt.md +1 -2
  15. src/config/settings.py +5 -0
  16. src/database_client/database_client_service.py +164 -0
  17. src/db/postgres/init_db.py +43 -1
  18. src/db/postgres/models.py +16 -0
  19. src/document/document_service.py +17 -1
  20. src/knowledge/parquet_service.py +77 -0
  21. src/knowledge/processing_service.py +145 -56
  22. src/models/credentials.py +164 -0
  23. src/models/sql_query.py +8 -0
  24. src/models/structured_output.py +4 -0
  25. src/pipeline/db_pipeline/__init__.py +3 -0
  26. src/pipeline/db_pipeline/db_pipeline_service.py +302 -0
  27. src/pipeline/db_pipeline/extractor.py +283 -0
  28. src/pipeline/document_pipeline/__init__.py +0 -0
  29. src/pipeline/document_pipeline/document_pipeline.py +94 -0
  30. src/query/__init__.py +0 -0
  31. src/query/base.py +32 -0
  32. src/query/executors/__init__.py +0 -0
  33. src/query/executors/db_executor.py +648 -0
  34. src/query/executors/tabular.py +287 -0
  35. src/query/query_executor.py +42 -0
  36. src/rag/base.py +20 -0
  37. src/rag/retriever.py +24 -48
  38. src/rag/retrievers/__init__.py +0 -0
  39. src/rag/retrievers/baseline.py +76 -0
  40. src/rag/retrievers/document.py +158 -0
  41. src/rag/retrievers/schema.py +411 -0
  42. src/rag/router.py +179 -0
  43. src/storage/az_blob/az_blob.py +34 -0
  44. src/tools/search.py +3 -3
  45. src/utils/db_credential_encryption.py +70 -0
  46. uv.lock +440 -10
.dockerignore ADDED
@@ -0,0 +1,6 @@
+ .venv
+ software/
+ __pycache__
+ *.py[oc]
+ .env
+ .env.*
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ software/** filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -26,6 +26,10 @@ test/users/user_accounts.csv
  .env.prd
  .env.example

+ CLAUDE.md
+
+ /experiments
+ src/rag/experiments/
  erd/
  playground/
  playground_retriever.py
@@ -33,4 +37,12 @@ playground_chat.py
  playground_flush_cache.py
  playground_create_user.py
  API_CONTRACT.md
- context_engineering/
+ context_engineering/
+ sample_file/
+ test_tesseract.py
+
+ # Windows binaries — installed via apt in Docker instead
+ software/
+
+ tests/
+ .claude/
.vscode/launch.json ADDED
@@ -0,0 +1,25 @@
+ {
+   // Use IntelliSense to learn about possible attributes.
+   // Hover to view descriptions of existing attributes.
+   // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+   "version": "0.2.0",
+   "configurations": [
+     {
+       "name": "DataEyond: FastAPI (debug)",
+       "type": "debugpy",
+       "request": "launch",
+       "module": "uvicorn",
+       "args": [
+         "main:app",
+         "--host", "0.0.0.0",
+         "--port", "7860",
+         "--reload"
+       ],
+       "jinja": true,
+       "justMyCode": true,
+       "envFile": "${workspaceFolder}/.env",
+       "console": "integratedTerminal",
+       "cwd": "${workspaceFolder}"
+     }
+   ]
+ }
Dockerfile CHANGED
@@ -12,6 +12,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
      libpq-dev \
      gcc \
      libgomp1 \
+     tesseract-ocr \
+     poppler-utils \
      && rm -rf /var/lib/apt/lists/*

  RUN addgroup --system app && \
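The two new apt packages support the OCR path added to the processing pipeline. A minimal sketch of how they are typically used together (illustrative only, not code copied from `processing_service.py`):

```python
# pdf2image shells out to poppler (poppler-utils) to rasterize PDF pages,
# and pytesseract runs the Tesseract binary (tesseract-ocr) over each page image.
from pdf2image import convert_from_path
import pytesseract

def ocr_pdf(path: str) -> str:
    pages = convert_from_path(path, dpi=200)  # requires poppler-utils on PATH
    return "\n".join(pytesseract.image_to_string(page) for page in pages)  # requires tesseract-ocr
```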
README.md CHANGED
@@ -11,6 +11,8 @@ short_description: AI Agent core service
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference


+ # Agentic Service Data Eyond
+
  How to run:
  `uv run --no-sync uvicorn main:app --host 0.0.0.0 --port 7860`

main.py CHANGED
@@ -10,6 +10,7 @@ from src.api.v1.chat import router as chat_router
  from src.api.v1.room import router as room_router
  from src.api.v1.users import router as users_router
  from src.api.v1.knowledge import router as knowledge_router
+ from src.api.v1.db_client import router as db_client_router
  from src.db.postgres.init_db import init_db
  import uvicorn

@@ -35,6 +36,7 @@ app.include_router(document_router)
  app.include_router(knowledge_router)
  app.include_router(room_router)
  app.include_router(chat_router)
+ app.include_router(db_client_router)


  @app.on_event("startup")
pyproject.toml CHANGED
@@ -79,6 +79,18 @@ dependencies = [
      "jsonpatch>=1.33",
      "pymongo>=4.14.0",
      "psycopg2>=2.9.11",
+     # --- SQL parsing / guardrails ---
+     "sqlglot>=25.0.0",
+     # --- User-DB connectors (db_pipeline) ---
+     "pymysql>=1.1.1",
+     "pymssql>=2.3.0",
+     "sqlalchemy-bigquery>=1.11.0",
+     "snowflake-sqlalchemy>=1.7.0",
+     # --- OCR (pdf processing) ---
+     "pdf2image>=1.17.0",
+     "pytesseract>=0.3.13",
+     "pypdf2>=3.0.1",
+     "pyarrow>=24.0.0",
  ]

  [project.optional-dependencies]
@@ -92,16 +104,6 @@ dev = [
      "pre-commit==4.0.1",
  ]

- [tool.uv]
- dev-dependencies = [
-     "pytest==8.3.4",
-     "pytest-asyncio==0.24.0",
-     "pytest-cov==6.0.0",
-     "ruff==0.8.4",
-     "mypy==1.13.0",
-     "pre-commit==4.0.1",
- ]
-
  [tool.hatch.build.targets.wheel]
  packages = ["src/agent_service"]

@@ -133,3 +135,10 @@ testpaths = ["tests"]
  filterwarnings = [
      "ignore::DeprecationWarning",
  ]
+
+ [dependency-groups]
+ dev = [
+     "pytest>=8.3.4",
+     "pytest-asyncio>=0.24.0",
+     "ruff>=0.8.4",
+ ]
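`sqlglot` is listed under "SQL parsing / guardrails". A rough illustration of the kind of read-only gate it enables over model-generated SQL; the actual guardrails live in `src/query/executors/db_executor.py` (not reproduced here), and this sketch simply assumes a Postgres dialect:

```python
import sqlglot
from sqlglot import exp
from sqlglot.errors import ParseError

def is_read_only(sql: str, dialect: str = "postgres") -> bool:
    """Return True only for plain SELECT statements (no DML/DDL)."""
    try:
        tree = sqlglot.parse_one(sql, read=dialect)
    except ParseError:
        return False
    if not isinstance(tree, exp.Select):
        return False
    # Reject anything that nests a write or schema-changing operation.
    forbidden = (exp.Insert, exp.Update, exp.Delete, exp.Drop, exp.Create)
    return not any(tree.find(node) for node in forbidden)

assert is_read_only("SELECT id, name FROM customers LIMIT 10")
assert not is_read_only("DROP TABLE customers")
```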
src/agents/chatbot.py CHANGED
@@ -1,5 +1,6 @@
  """Chatbot agent with RAG capabilities."""

+ import tiktoken
  from langchain_openai import AzureChatOpenAI
  from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
  from langchain_core.output_parsers import StrOutputParser
@@ -9,6 +10,14 @@ from langchain_core.messages import HumanMessage, AIMessage

  logger = get_logger("chatbot")

+ _enc = tiktoken.get_encoding("cl100k_base")
+
+
+ def _count_tokens(messages: list, context: str) -> dict:
+     msg_tokens = sum(len(_enc.encode(m.content)) for m in messages)
+     ctx_tokens = len(_enc.encode(context))
+     return {"messages_tokens": msg_tokens, "context_tokens": ctx_tokens, "total": msg_tokens + ctx_tokens}
+

  class ChatbotAgent:
      """Chatbot agent with RAG capabilities."""
@@ -64,7 +73,8 @@ class ChatbotAgent:
      async def astream_response(self, messages: list, context: str = ""):
          """Stream response tokens as they are generated."""
          try:
-             logger.info("Streaming chatbot response")
+             token_counts = _count_tokens(messages, context)
+             logger.info("LLM input tokens", **token_counts)
              async for token in self.chain.astream({"messages": messages, "context": context}):
                  yield token
          except Exception as e:
src/agents/orchestration.py CHANGED
@@ -35,6 +35,11 @@ Intent Routing:
  - greeting -> needs_search=False, direct_response="Hello! How can I assist you today?"
  - goodbye -> needs_search=False, direct_response="Goodbye! Have a great day!"
  - other -> needs_search=True, search_query=<standalone rewritten query>
+
+ Source Routing (set source_hint):
+ - Columns, tables, sheets, data types, schema, row counts, statistics -> source_hint=schema
+ - Document content, paragraphs, reports, articles, text -> source_hint=document
+ - Unclear or spans both -> source_hint=both
  """),
  MessagesPlaceholder(variable_name="history"),
  ("user", "{message}")
src/api/v1/chat.py CHANGED
@@ -9,6 +9,9 @@ from src.db.postgres.models import ChatMessage, MessageSource
  from src.agents.orchestration import orchestrator
  from src.agents.chatbot import chatbot
  from src.rag.retriever import retriever
+ from src.rag.base import RetrievalResult
+ from src.query.query_executor import query_executor
+ from src.query.base import QueryResult
  from src.db.redis.connection import get_redis
  from src.config.settings import settings
  from src.middlewares.logging import get_logger, log_execution
@@ -45,34 +48,66 @@ class ChatRequest(BaseModel):
      message: str


- def _format_context(results: List[Dict[str, Any]]) -> str:
+ def _format_context(results: List[RetrievalResult]) -> str:
      """Format retrieval results as context string for the LLM."""
      lines = []
      for result in results:
-         filename = result["metadata"].get("filename", "Unknown")
-         page = result["metadata"].get("page_label")
+         data = result.metadata.get("data", {})
+         filename = data.get("filename", "Unknown")
+         page = data.get("page_label")
          source_label = f"{filename}, p.{page}" if page else filename
-         lines.append(f"[Source: {source_label}]\n{result['content']}\n")
+         lines.append(f"[Source: {source_label}]\n{result.content}\n")
      return "\n".join(lines)


- def _extract_sources(results: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
+ def _extract_sources(results: List[RetrievalResult]) -> List[Dict[str, Any]]:
      """Extract deduplicated source references from retrieval results."""
      seen = set()
      sources = []
      for result in results:
-         meta = result["metadata"]
-         key = (meta.get("document_id"), meta.get("page_label"))
-         if key not in seen:
-             seen.add(key)
-             sources.append({
-                 "document_id": meta.get("document_id"),
-                 "filename": meta.get("filename", "Unknown"),
-                 "page_label": meta.get("page_label"),
-             })
+         meta = result.metadata
+         data = meta.get("data", {})
+         if "document_id" in data:
+             key = (data.get("document_id"), data.get("page_label"))
+             if key not in seen:
+                 seen.add(key)
+                 sources.append({
+                     "document_id": data.get("document_id"),
+                     "filename": data.get("filename", "Unknown"),
+                     "page_label": data.get("page_label", "Unknown"),
+                 })
+         else:
+             key = (data.get("table_name"), data.get("column_name"))
+             if key not in seen:
+                 seen.add(key)
+                 table_name = data.get("table_name")
+                 user_id = meta.get("user_id")
+                 sources.append({
+                     "document_id": f"{user_id}_{table_name}",
+                     "filename": data.get("table_name", "Unknown"),
+                     "page_label": data.get("column_name", "Unknown"),
+                 })
+
+     logger.debug(f"Extracted sources: {sources}")
      return sources


+ def _format_query_results(results: list[QueryResult]) -> str:
+     if not results:
+         return ""
+     lines = []
+     for r in results:
+         name = r.metadata.get("client_name", r.source_id)
+         lines.append(f"[Query result — {name}, tables: {r.table_or_file}]")
+         lines.append(f"SQL: {r.metadata.get('sql', '')}")
+         if r.columns and r.rows:
+             lines.append(" | ".join(r.columns))
+             for row in r.rows[:20]:
+                 lines.append(" | ".join(str(row.get(c, "")) for c in r.columns))
+         lines.append(f"({r.row_count} rows total)\n")
+     return "\n".join(lines)
+
+
  async def get_cached_response(redis, cache_key: str) -> Optional[str]:
      cached = await redis.get(cache_key)
      if cached:
@@ -155,9 +190,12 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
      sources: List[Dict[str, Any]] = []

      if intent_result is None:
-         # Step 2: Launch retrieval and history loading in parallel, then run orchestrator
+         # Step 2: Launch retrieval and history loading in parallel, then run orchestrator.
+         # k=5 tables — db_executor's FK expansion is one-hop and cannot bridge
+         # 2-hop gaps (e.g. customers -> order_items -> products) on its own.
          retrieval_task = asyncio.create_task(
-             retriever.retrieve(request.message, request.user_id, db)
+             retriever.retrieve(request.message, request.user_id, db, k=5)
          )
          history_task = asyncio.create_task(
              load_history(db, request.room_id, limit=6)  # 6 msgs (3 pairs) for orchestrator
@@ -165,18 +203,28 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
          history = await history_task  # fast DB query (<100ms), done before orchestrator finishes
          intent_result = await orchestrator.analyze_message(request.message, history)

+         search_query = intent_result.get("search_query", request.message) or request.message
          if not intent_result.get("needs_search"):
              retrieval_task.cancel()
+             try:
+                 await retrieval_task
+             except asyncio.CancelledError:
+                 pass
              raw_results = []
          else:
-             search_query = intent_result.get("search_query", request.message)
              logger.info(f"Searching for: {search_query}")
              if search_query != request.message:
                  retrieval_task.cancel()
+                 try:
+                     await retrieval_task
+                 except asyncio.CancelledError:
+                     pass
                  raw_results = await retriever.retrieve(
                      query=search_query,
                      user_id=request.user_id,
                      db=db,
+                     k=5,
+                     source_hint=intent_result.get("source_hint", "both"),
                  )
              else:
                  raw_results = await retrieval_task
@@ -184,6 +232,21 @@ async def chat_stream(request: ChatRequest, db: AsyncSession = Depends(get_db)):
          context = _format_context(raw_results)
          sources = _extract_sources(raw_results)

+         source_hint = intent_result.get("source_hint", "both")
+         if source_hint in ("schema", "both"):
+             # Use search_query (orchestrator's standalone rewrite) so follow-up
+             # messages like "dive deeper" or "show me last year" resolve correctly.
+             # For first-turn questions search_query == request.message, so no change.
+             query_results = await query_executor.execute(
+                 results=raw_results,
+                 user_id=request.user_id,
+                 db=db,
+                 question=search_query,
+             )
+             query_context = _format_query_results(query_results)
+             if query_context:
+                 context = query_context + "\n\n" + context
+
      # Step 3: Direct response for greetings / non-document intents
      if intent_result.get("direct_response"):
          response = intent_result["direct_response"]
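The cancel-then-await blocks added above follow the standard asyncio pattern: cancelling a task only schedules the cancellation, and awaiting the task afterwards lets the CancelledError actually propagate, so the retrieval coroutine is cleaned up instead of being destroyed while still pending. A self-contained illustration (not project code):

```python
import asyncio

async def slow_retrieval() -> list:
    await asyncio.sleep(10)  # stands in for a vector-store query
    return ["chunk"]

async def main():
    task = asyncio.create_task(slow_retrieval())
    await asyncio.sleep(0)   # let the task start
    task.cancel()            # schedules cancellation only
    try:
        await task           # raises CancelledError once cancellation lands
    except asyncio.CancelledError:
        pass                 # task is now fully finalized

asyncio.run(main())
```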
src/api/v1/db_client.py CHANGED
@@ -1,5 +1,473 @@
- from typing import Literal, Dict
-
-
- dbtypes: Literal["postgresql", "mysql", "sqlite"] = Literal["postgresql", "mysql", "sqlite"]
- creds: Dict[str, str]
 
 
 
 
1
+ """API endpoints for user-registered database connections.
2
 
3
+ Credential schemas (DbType, PostgresCredentials, etc.) live in
4
+ `src/models/credentials.py` — they are imported below (with noqa: F401) so
5
+ FastAPI/Swagger picks them up for OpenAPI schema generation even though they
6
+ are not referenced by name in this file.
7
+ """
8
 
9
+ from typing import Any, Dict, List, Literal, Optional
10
+ from datetime import datetime
11
+
12
+ from fastapi import APIRouter, Depends, HTTPException, Query, Request, status
13
+ from pydantic import BaseModel, Field
14
+ from sqlalchemy.ext.asyncio import AsyncSession
15
+
16
+ from src.database_client.database_client_service import database_client_service
17
+ from src.db.postgres.connection import get_db
18
+ from src.middlewares.logging import get_logger, log_execution
19
+ from src.middlewares.rate_limit import limiter
20
+ from src.models.credentials import ( # noqa: F401 — re-exported for Swagger schema discovery
21
+ BigQueryCredentials,
22
+ CredentialSchemas,
23
+ DbType,
24
+ MysqlCredentials,
25
+ PostgresCredentials,
26
+ SnowflakeCredentials,
27
+ SqlServerCredentials,
28
+ SupabaseCredentials,
29
+ )
30
+ from src.pipeline.db_pipeline import db_pipeline_service
31
+ from src.utils.db_credential_encryption import decrypt_credentials_dict
32
+
33
+ logger = get_logger("database_client_api")
34
+
35
+ router = APIRouter(prefix="/api/v1", tags=["Database Clients"])
36
+
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # Request / Response schemas
40
+ # ---------------------------------------------------------------------------
41
+
42
+
43
+ class DatabaseClientCreate(BaseModel):
44
+ """
45
+ Payload to register a new external database connection.
46
+
47
+ The `credentials` object shape depends on `db_type`:
48
+
49
+ | db_type | Required fields |
50
+ |-------------|----------------------------------------------------------|
51
+ | postgres | host, port, database, username, password, ssl_mode |
52
+ | mysql | host, port, database, username, password, ssl |
53
+ | sqlserver | host, port, database, username, password, driver? |
54
+ | supabase | host, port, database, username, password, ssl_mode |
55
+ | bigquery | project_id, dataset_id, location?, service_account_json |
56
+ | snowflake | account, warehouse, database, schema?, username, password, role? |
57
+
58
+ Sensitive fields (`password`, `service_account_json`) are encrypted
59
+ at rest using Fernet symmetric encryption.
60
+ """
61
+
62
+ name: str = Field(..., description="Display name for this connection.", examples=["Production DB"])
63
+ db_type: DbType = Field(..., description="Type of the database engine.", examples=["postgres"])
64
+ credentials: Dict[str, Any] = Field(
65
+ ...,
66
+ description="Connection credentials. Shape depends on db_type. See schema descriptions above.",
67
+ examples=[
68
+ {
69
+ "host": "db.example.com",
70
+ "port": 5432,
71
+ "database": "mydb",
72
+ "username": "admin",
73
+ "password": "s3cr3t!",
74
+ "ssl_mode": "require",
75
+ }
76
+ ],
77
+ )
78
+
79
+
80
+ class DatabaseClientUpdate(BaseModel):
81
+ """
82
+ Payload to update an existing database connection.
83
+
84
+ All fields are optional — only provided fields will be updated.
85
+ If `credentials` is provided, it replaces the entire credentials object
86
+ and sensitive fields are re-encrypted.
87
+ """
88
+
89
+ name: Optional[str] = Field(None, description="New display name for this connection.", examples=["Staging DB"])
90
+ credentials: Optional[Dict[str, Any]] = Field(
91
+ None,
92
+ description="Updated credentials object. Replaces existing credentials entirely if provided.",
93
+ examples=[{"host": "new-host.example.com", "port": 5432, "database": "mydb", "username": "admin", "password": "n3wP@ss!", "ssl_mode": "require"}],
94
+ )
95
+ status: Optional[Literal["active", "inactive"]] = Field(
96
+ None,
97
+ description="Set to 'inactive' to soft-disable the connection without deleting it.",
98
+ examples=["inactive"],
99
+ )
100
+
101
+
102
+ class DatabaseClientResponse(BaseModel):
103
+ """
104
+ Database connection record returned by the API.
105
+
106
+ Credentials are **never** included in the response for security reasons.
107
+ """
108
+
109
+ id: str = Field(..., description="Unique identifier of the database connection.")
110
+ user_id: str = Field(..., description="ID of the user who owns this connection.")
111
+ name: str = Field(..., description="Display name of the connection.")
112
+ db_type: str = Field(..., description="Database engine type.")
113
+ status: str = Field(..., description="Connection status: 'active' or 'inactive'.")
114
+ created_at: datetime = Field(..., description="Timestamp when the connection was registered.")
115
+ updated_at: Optional[datetime] = Field(None, description="Timestamp of the last update, if any.")
116
+
117
+ model_config = {"from_attributes": True}
118
+
119
+
120
+ # ---------------------------------------------------------------------------
121
+ # Supported DB types registry
122
+ # ---------------------------------------------------------------------------
123
+
124
+ _DB_TYPES: List[Dict[str, Any]] = [
125
+ {
126
+ "db_type": "postgres",
127
+ "display_name": "PostgreSQL",
128
+ "logo": "postgres",
129
+ "status": "active",
130
+ "message": None,
131
+ "fields": [
132
+ {"name": "host", "type": "string", "required": True, "default": None, "description": "Hostname or IP address"},
133
+ {"name": "port", "type": "integer", "required": False, "default": 5432, "description": "Port number"},
134
+ {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
135
+ {"name": "username", "type": "string", "required": True, "default": None, "description": "Database username"},
136
+ {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
137
+ {"name": "ssl_mode", "type": "select", "required": False, "default": "require", "description": "SSL mode", "options": ["disable", "require", "verify-ca", "verify-full"]},
138
+ ],
139
+ },
140
+ {
141
+ "db_type": "mysql",
142
+ "display_name": "MySQL",
143
+ "logo": "mysql",
144
+ "status": "active",
145
+ "message": None,
146
+ "fields": [
147
+ {"name": "host", "type": "string", "required": True, "default": None, "description": "Hostname or IP address"},
148
+ {"name": "port", "type": "integer", "required": False, "default": 3306, "description": "Port number"},
149
+ {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
150
+ {"name": "username", "type": "string", "required": True, "default": None, "description": "Database username"},
151
+ {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
152
+ {"name": "ssl", "type": "boolean", "required": False, "default": True, "description": "Enable SSL"},
153
+ ],
154
+ },
155
+ {
156
+ "db_type": "supabase",
157
+ "display_name": "Supabase",
158
+ "logo": "supabase",
159
+ "status": "active",
160
+ "message": None,
161
+ "fields": [
162
+ {"name": "host", "type": "string", "required": True, "default": None, "description": "Supabase database host"},
163
+ {"name": "port", "type": "integer", "required": False, "default": 5432, "description": "Port number (5432 direct, 6543 pooler)"},
164
+ {"name": "database", "type": "string", "required": False, "default": "postgres", "description": "Database name"},
165
+ {"name": "username", "type": "string", "required": True, "default": None, "description": "Database user"},
166
+ {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
167
+ {"name": "ssl_mode", "type": "select", "required": False, "default": "require", "description": "SSL mode", "options": ["require", "verify-ca", "verify-full"]},
168
+ ],
169
+ },
170
+ {
171
+ "db_type": "sqlserver",
172
+ "display_name": "SQL Server",
173
+ "logo": "sqlserver",
174
+ "status": "inactive",
175
+ "message": "Coming soon",
176
+ "fields": [
177
+ {"name": "host", "type": "string", "required": True, "default": None, "description": "Hostname or IP address"},
178
+ {"name": "port", "type": "integer", "required": False, "default": 1433, "description": "Port number"},
179
+ {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
180
+ {"name": "username", "type": "string", "required": True, "default": None, "description": "Database username"},
181
+ {"name": "password", "type": "string", "required": True, "default": None, "description": "Database password", "sensitive": True},
182
+ {"name": "driver", "type": "string", "required": False, "default": None, "description": "ODBC driver name"},
183
+ ],
184
+ },
185
+ {
186
+ "db_type": "bigquery",
187
+ "display_name": "BigQuery",
188
+ "logo": "bigquery",
189
+ "status": "inactive",
190
+ "message": "Coming soon",
191
+ "fields": [
192
+ {"name": "project_id", "type": "string", "required": True, "default": None, "description": "GCP project ID"},
193
+ {"name": "dataset_id", "type": "string", "required": True, "default": None, "description": "BigQuery dataset name"},
194
+ {"name": "location", "type": "string", "required": False, "default": "US", "description": "Dataset location/region"},
195
+ {"name": "service_account_json", "type": "string", "required": True, "default": None, "description": "GCP Service Account key JSON", "sensitive": True},
196
+ ],
197
+ },
198
+ {
199
+ "db_type": "snowflake",
200
+ "display_name": "Snowflake",
201
+ "logo": "snowflake",
202
+ "status": "inactive",
203
+ "message": "Coming soon",
204
+ "fields": [
205
+ {"name": "account", "type": "string", "required": True, "default": None, "description": "Snowflake account identifier"},
206
+ {"name": "warehouse", "type": "string", "required": True, "default": None, "description": "Virtual warehouse name"},
207
+ {"name": "database", "type": "string", "required": True, "default": None, "description": "Database name"},
208
+ {"name": "schema", "type": "string", "required": False, "default": "PUBLIC", "description": "Schema name"},
209
+ {"name": "username", "type": "string", "required": True, "default": None, "description": "Snowflake username"},
210
+ {"name": "password", "type": "string", "required": True, "default": None, "description": "Snowflake password", "sensitive": True},
211
+ {"name": "role", "type": "string", "required": False, "default": None, "description": "Snowflake role"},
212
+ ],
213
+ },
214
+ ]
215
+
216
+
217
+ # ---------------------------------------------------------------------------
218
+ # Endpoints
219
+ # ---------------------------------------------------------------------------
220
+
221
+
222
+ @router.get(
223
+ "/database-clients/dbtypes",
224
+ summary="List supported database types",
225
+ response_description="All database types supported by DataEyond with their connection parameters.",
226
+ )
227
+ async def list_db_types():
228
+ """
229
+ Return every database type DataEyond can connect to, along with the
230
+ credential fields the frontend should render, a logo filename, and
231
+ an active/inactive status with an optional message.
232
+ """
233
+ return _DB_TYPES
234
+
235
+
236
+ @router.post(
237
+ "/database-clients",
238
+ response_model=DatabaseClientResponse,
239
+ status_code=status.HTTP_201_CREATED,
240
+ summary="Register a new database connection",
241
+ response_description="The newly created database connection record (credentials excluded).",
242
+ responses={
243
+ 201: {"description": "Connection registered successfully."},
244
+ 422: {"description": "Validation error — check the credentials shape for the given db_type."},
245
+ 500: {"description": "Internal server error."},
246
+ },
247
+ )
248
+ @limiter.limit("10/minute")
249
+ @log_execution(logger)
250
+ async def create_database_client(
251
+ request: Request,
252
+ payload: DatabaseClientCreate,
253
+ user_id: str = Query(..., description="ID of the user registering the connection."),
254
+ db: AsyncSession = Depends(get_db),
255
+ ):
256
+ """
257
+ Register a new external database connection for a user.
258
+
259
+ The `credentials` object must match the shape for the chosen `db_type`
260
+ (see **CredentialSchemas** in the schema section below for exact fields).
261
+ Sensitive fields (`password`, `service_account_json`) are encrypted
262
+ before being persisted — they are never returned in any response.
263
+ """
264
+ try:
265
+ client = await database_client_service.create(
266
+ db=db,
267
+ user_id=user_id,
268
+ name=payload.name,
269
+ db_type=payload.db_type,
270
+ credentials=payload.credentials,
271
+ )
272
+ return DatabaseClientResponse.model_validate(client)
273
+ except Exception as e:
274
+ logger.error(f"Failed to create database client for user {user_id}", error=str(e))
275
+ raise HTTPException(
276
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
277
+ detail=f"Failed to create database client: {str(e)}",
278
+ )
279
+
280
+
281
+ @router.get(
282
+ "/database-clients/{user_id}",
283
+ response_model=List[DatabaseClientResponse],
284
+ summary="List all database connections for a user",
285
+ response_description="List of database connections (credentials excluded).",
286
+ responses={
287
+ 200: {"description": "Returns an empty list if the user has no connections."},
288
+ },
289
+ )
290
+ @log_execution(logger)
291
+ async def list_database_clients(
292
+ user_id: str,
293
+ db: AsyncSession = Depends(get_db),
294
+ ):
295
+ """
296
+ Return all database connections registered by the specified user,
297
+ ordered by creation date (newest first).
298
+
299
+ Credentials are never included in the response.
300
+ """
301
+ clients = await database_client_service.get_user_clients(db, user_id)
302
+ return [DatabaseClientResponse.model_validate(c) for c in clients]
303
+
304
+
305
+ @router.get(
306
+ "/database-clients/{user_id}/{client_id}",
307
+ response_model=DatabaseClientResponse,
308
+ summary="Get a single database connection",
309
+ response_description="Database connection detail (credentials excluded).",
310
+ responses={
311
+ 404: {"description": "Connection not found."},
312
+ 403: {"description": "Access denied — user_id does not own this connection."},
313
+ },
314
+ )
315
+ @log_execution(logger)
316
+ async def get_database_client(
317
+ user_id: str,
318
+ client_id: str,
319
+ db: AsyncSession = Depends(get_db),
320
+ ):
321
+ """
322
+ Return the detail of a single database connection.
323
+
324
+ Returns **403** if the `user_id` in the path does not match the owner
325
+ of the requested connection.
326
+ """
327
+ client = await database_client_service.get(db, client_id)
328
+
329
+ if not client:
330
+ raise HTTPException(status_code=404, detail="Database client not found")
331
+
332
+ if client.user_id != user_id:
333
+ raise HTTPException(status_code=403, detail="Access denied")
334
+
335
+ return DatabaseClientResponse.model_validate(client)
336
+
337
+
338
+ @router.put(
339
+ "/database-clients/{client_id}",
340
+ response_model=DatabaseClientResponse,
341
+ summary="Update a database connection",
342
+ response_description="Updated database connection record (credentials excluded).",
343
+ responses={
344
+ 404: {"description": "Connection not found."},
345
+ 403: {"description": "Access denied — user_id does not own this connection."},
346
+ },
347
+ )
348
+ @log_execution(logger)
349
+ async def update_database_client(
350
+ client_id: str,
351
+ payload: DatabaseClientUpdate,
352
+ user_id: str = Query(..., description="ID of the user who owns the connection."),
353
+ db: AsyncSession = Depends(get_db),
354
+ ):
355
+ """
356
+ Update an existing database connection.
357
+
358
+ Only fields present in the request body are updated.
359
+ If `credentials` is provided it **replaces** the entire credentials object
360
+ and sensitive fields are re-encrypted automatically.
361
+ """
362
+ client = await database_client_service.get(db, client_id)
363
+
364
+ if not client:
365
+ raise HTTPException(status_code=404, detail="Database client not found")
366
+
367
+ if client.user_id != user_id:
368
+ raise HTTPException(status_code=403, detail="Access denied")
369
+
370
+ updated = await database_client_service.update(
371
+ db=db,
372
+ client_id=client_id,
373
+ name=payload.name,
374
+ credentials=payload.credentials,
375
+ status=payload.status,
376
+ )
377
+ return DatabaseClientResponse.model_validate(updated)
378
+
379
+
380
+ @router.delete(
381
+ "/database-clients/{client_id}",
382
+ status_code=status.HTTP_200_OK,
383
+ summary="Delete a database connection",
384
+ responses={
385
+ 200: {"description": "Connection deleted successfully."},
386
+ 404: {"description": "Connection not found."},
387
+ 403: {"description": "Access denied — user_id does not own this connection."},
388
+ },
389
+ )
390
+ @log_execution(logger)
391
+ async def delete_database_client(
392
+ client_id: str,
393
+ user_id: str = Query(..., description="ID of the user who owns the connection."),
394
+ db: AsyncSession = Depends(get_db),
395
+ ):
396
+ """
397
+ Permanently delete a database connection.
398
+
399
+ This action is irreversible. The stored credentials are also removed.
400
+ """
401
+ client = await database_client_service.get(db, client_id)
402
+
403
+ if not client:
404
+ raise HTTPException(status_code=404, detail="Database client not found")
405
+
406
+ if client.user_id != user_id:
407
+ raise HTTPException(status_code=403, detail="Access denied")
408
+
409
+ await database_client_service.delete(db, client_id)
410
+ return {"status": "success", "message": "Database client deleted successfully"}
411
+
412
+
413
+ @router.post(
414
+ "/database-clients/{client_id}/ingest",
415
+ status_code=status.HTTP_200_OK,
416
+ summary="Ingest schema from a registered database into the vector store",
417
+ response_description="Count of chunks ingested.",
418
+ responses={
419
+ 200: {"description": "Ingestion completed successfully."},
420
+ 403: {"description": "Access denied — user_id does not own this connection."},
421
+ 404: {"description": "Connection not found."},
422
+ 501: {"description": "The connection's db_type is not yet supported by the pipeline."},
423
+ 500: {"description": "Ingestion failed (connection error, profiling error, etc.)."},
424
+ },
425
+ )
426
+ @limiter.limit("5/minute")
427
+ @log_execution(logger)
428
+ async def ingest_database_client(
429
+ request: Request,
430
+ client_id: str,
431
+ user_id: str = Query(..., description="ID of the user who owns the connection."),
432
+ db: AsyncSession = Depends(get_db),
433
+ ):
434
+ """
435
+ Decrypt the stored credentials, connect to the user's database, introspect
436
+ its schema, profile each column, embed the descriptions, and store them in
437
+ the shared PGVector collection tagged with `source_type="database"`.
438
+
439
+ Chunks become retrievable via the same retriever used for document chunks.
440
+ """
441
+ client = await database_client_service.get(db, client_id)
442
+
443
+ if not client:
444
+ raise HTTPException(status_code=404, detail="Database client not found")
445
+
446
+ if client.user_id != user_id:
447
+ raise HTTPException(status_code=403, detail="Access denied")
448
+
449
+ if client.status != "active":
450
+ raise HTTPException(
451
+ status_code=status.HTTP_409_CONFLICT,
452
+ detail="Cannot ingest from an inactive database connection.",
453
+ )
454
+
455
+ try:
456
+ creds = decrypt_credentials_dict(client.credentials)
457
+ with db_pipeline_service.engine_scope(
458
+ db_type=client.db_type,
459
+ credentials=creds,
460
+ ) as engine:
461
+ total = await db_pipeline_service.run(user_id=user_id, client_id=client_id, engine=engine)
462
+ except NotImplementedError as e:
463
+ raise HTTPException(status_code=status.HTTP_501_NOT_IMPLEMENTED, detail=str(e))
464
+ except Exception as e:
465
+ logger.error(
466
+ f"Ingestion failed for client {client_id}", user_id=user_id, error=str(e)
467
+ )
468
+ raise HTTPException(
469
+ status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
470
+ detail=f"Ingestion failed: {e}",
471
+ )
472
+
473
+ return {"status": "success", "client_id": client_id, "chunks_ingested": total}
src/api/v1/document.py CHANGED
@@ -1,21 +1,20 @@
1
  """Document management API endpoints."""
2
-
3
- from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File, status
4
  from sqlalchemy.ext.asyncio import AsyncSession
5
  from src.db.postgres.connection import get_db
6
  from src.document.document_service import document_service
7
- from src.knowledge.processing_service import knowledge_processor
8
- from src.storage.az_blob.az_blob import blob_storage
9
  from src.middlewares.logging import get_logger, log_execution
10
  from src.middlewares.rate_limit import limiter
 
11
  from pydantic import BaseModel
12
  from typing import List
13
-
14
  logger = get_logger("document_api")
15
-
16
  router = APIRouter(prefix="/api/v1", tags=["Documents"])
17
-
18
-
19
  class DocumentResponse(BaseModel):
20
  id: str
21
  filename: str
@@ -23,6 +22,27 @@ class DocumentResponse(BaseModel):
23
  file_size: int
24
  file_type: str
25
  created_at: str
 
 
26
 
27
 
28
  @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
@@ -44,8 +64,8 @@ async def list_documents(
44
  )
45
  for doc in documents
46
  ]
47
-
48
-
49
  @router.post("/document/upload")
50
  @limiter.limit("10/minute")
51
  @log_execution(logger)
@@ -57,57 +77,12 @@ async def upload_document(
57
  ):
58
  """Upload a document."""
59
  if not user_id:
60
- raise HTTPException(
61
- status_code=400,
62
- detail="user_id is required"
63
- )
64
-
65
- try:
66
- # Read file content
67
- content = await file.read()
68
- file_size = len(content)
69
-
70
- # Get file type
71
- filename = file.filename
72
- file_type = filename.split('.')[-1].lower() if '.' in filename else 'txt'
73
-
74
- if file_type not in ['pdf', 'docx', 'txt']:
75
- raise HTTPException(
76
- status_code=400,
77
- detail="Unsupported file type. Supported: pdf, docx, txt"
78
- )
79
-
80
- # Upload to blob storage
81
- blob_name = await blob_storage.upload_file(content, filename, user_id)
82
-
83
- # Create document record
84
- document = await document_service.create_document(
85
- db=db,
86
- user_id=user_id,
87
- filename=filename,
88
- blob_name=blob_name,
89
- file_size=file_size,
90
- file_type=file_type
91
- )
92
-
93
- return {
94
- "status": "success",
95
- "message": "Document uploaded successfully",
96
- "data": {
97
- "id": document.id,
98
- "filename": document.filename,
99
- "status": document.status
100
- }
101
- }
102
-
103
- except Exception as e:
104
- logger.error(f"Upload failed for user {user_id}", error=str(e))
105
- raise HTTPException(
106
- status_code=500,
107
- detail=f"Upload failed: {str(e)}"
108
- )
109
-
110
-
111
  @router.delete("/document/delete")
112
  @log_execution(logger)
113
  async def delete_document(
@@ -116,31 +91,10 @@ async def delete_document(
116
  db: AsyncSession = Depends(get_db)
117
  ):
118
  """Delete a document."""
119
- document = await document_service.get_document(db, document_id)
120
-
121
- if not document:
122
- raise HTTPException(
123
- status_code=404,
124
- detail="Document not found"
125
- )
126
-
127
- if document.user_id != user_id:
128
- raise HTTPException(
129
- status_code=403,
130
- detail="Access denied"
131
- )
132
-
133
- success = await document_service.delete_document(db, document_id)
134
-
135
- if success:
136
- return {"status": "success", "message": "Document deleted successfully"}
137
- else:
138
- raise HTTPException(
139
- status_code=500,
140
- detail="Failed to delete document"
141
- )
142
-
143
-
144
  @router.post("/document/process")
145
  @log_execution(logger)
146
  async def process_document(
@@ -149,45 +103,6 @@ async def process_document(
149
  db: AsyncSession = Depends(get_db)
150
  ):
151
  """Process document and ingest to vector index."""
152
- document = await document_service.get_document(db, document_id)
153
-
154
- if not document:
155
- raise HTTPException(
156
- status_code=404,
157
- detail="Document not found"
158
- )
159
-
160
- if document.user_id != user_id:
161
- raise HTTPException(
162
- status_code=403,
163
- detail="Access denied"
164
- )
165
-
166
- try:
167
- # Update status to processing
168
- await document_service.update_document_status(db, document_id, "processing")
169
-
170
- # Process document
171
- chunks_count = await knowledge_processor.process_document(document, db)
172
-
173
- # Update status to completed
174
- await document_service.update_document_status(db, document_id, "completed")
175
-
176
- return {
177
- "status": "success",
178
- "message": "Document processed successfully",
179
- "data": {
180
- "document_id": document_id,
181
- "chunks_processed": chunks_count
182
- }
183
- }
184
-
185
- except Exception as e:
186
- logger.error(f"Processing failed for document {document_id}", error=str(e))
187
- await document_service.update_document_status(
188
- db, document_id, "failed", str(e)
189
- )
190
- raise HTTPException(
191
- status_code=500,
192
- detail=f"Processing failed: {str(e)}"
193
- )
 
1
  """Document management API endpoints."""
2
+
3
+ from fastapi import APIRouter, Depends, HTTPException, Request, UploadFile, File
4
  from sqlalchemy.ext.asyncio import AsyncSession
5
  from src.db.postgres.connection import get_db
6
  from src.document.document_service import document_service
 
 
7
  from src.middlewares.logging import get_logger, log_execution
8
  from src.middlewares.rate_limit import limiter
9
+ from src.pipeline.document_pipeline.document_pipeline import document_pipeline
10
  from pydantic import BaseModel
11
  from typing import List
12
+
13
  logger = get_logger("document_api")
14
+
15
  router = APIRouter(prefix="/api/v1", tags=["Documents"])
16
+
17
+
18
  class DocumentResponse(BaseModel):
19
  id: str
20
  filename: str
 
22
  file_size: int
23
  file_type: str
24
  created_at: str
25
+
26
+
27
+ # NOTE: Keep in sync with SUPPORTED_FILE_TYPES in src/pipeline/document_pipeline/document_pipeline.py
28
+ _DOC_TYPES = [
29
+ {"doc_type": "pdf", "max_size": 10, "status": "active", "message": None},
30
+ {"doc_type": "docx", "max_size": 10, "status": "active", "message": None},
31
+ {"doc_type": "txt", "max_size": 10, "status": "active", "message": None},
32
+ {"doc_type": "csv", "max_size": 10, "status": "active", "message": None},
33
+ {"doc_type": "xlsx", "max_size": 10, "status": "active", "message": None},
34
+ ]
35
+
36
+
37
+ @router.get(
38
+ "/documents/doctypes",
39
+ summary="List supported document types",
40
+ response_description="All document types supported by DataEyond with their size limits and status.",
41
+ )
42
+ @log_execution(logger)
43
+ async def get_document_types():
44
+ """Return every document type DataEyond can process, with max file size and active/inactive status."""
45
+ return {"status": "success", "data": _DOC_TYPES}
46
 
47
 
48
  @router.get("/documents/{user_id}", response_model=List[DocumentResponse])
 
64
  )
65
  for doc in documents
66
  ]
67
+
68
+
69
  @router.post("/document/upload")
70
  @limiter.limit("10/minute")
71
  @log_execution(logger)
 
77
  ):
78
  """Upload a document."""
79
  if not user_id:
80
+ raise HTTPException(status_code=400, detail="user_id is required")
81
+
82
+ data = await document_pipeline.upload(file, user_id, db)
83
+ return {"status": "success", "message": "Document uploaded successfully", "data": data}
84
+
85
+
 
 
 
 
 
86
  @router.delete("/document/delete")
87
  @log_execution(logger)
88
  async def delete_document(
 
91
  db: AsyncSession = Depends(get_db)
92
  ):
93
  """Delete a document."""
94
+ await document_pipeline.delete(document_id, user_id, db)
95
+ return {"status": "success", "message": "Document deleted successfully"}
96
+
97
+
 
 
 
 
98
  @router.post("/document/process")
99
  @log_execution(logger)
100
  async def process_document(
 
103
  db: AsyncSession = Depends(get_db)
104
  ):
105
  """Process document and ingest to vector index."""
106
+ data = await document_pipeline.process(document_id, user_id, db)
107
+ return {"status": "success", "message": "Document processed successfully", "data": data}
108
+
 
 
 
 
src/config/agents/system_prompt.md CHANGED
@@ -3,8 +3,7 @@ You are a helpful AI assistant with access to user's uploaded documents. Your ro
  1. Answer questions based on provided document context
  2. If no relevant information is found in documents, acknowledge this honestly
  3. Be concise and direct in your responses
- 4. Cite source documents when providing information
- 5. If user's question is unclear, ask for clarification
+ 4. If user's question is unclear, ask for clarification

  When document context is provided:
  - Use information from documents to answer accurately
src/config/settings.py CHANGED
@@ -61,6 +61,11 @@ class Settings(BaseSettings):
      # Bcrypt salt (for users - existing)
      emarcal_bcrypt_salt: str = Field(alias="emarcal__bcrypt__salt", default="")

+     # DB credential encryption (Fernet key for user-registered database creds)
+     dataeyond_db_credential_key: str = Field(
+         alias="dataeyond__db__credential__key"
+     )
+

  # Singleton instance
  settings = Settings()
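The new setting is the Fernet key consumed by `src/utils/db_credential_encryption.py` (added in this PR but not shown in this hunk). As a rough sketch of what Fernet-based credential handling looks like, with illustrative helper names rather than that module's actual API:

```python
from cryptography.fernet import Fernet

# In the service the key comes from settings.dataeyond_db_credential_key;
# here we generate a throwaway key so the sketch runs standalone.
fernet = Fernet(Fernet.generate_key())

def encrypt_value(value: str) -> str:
    return fernet.encrypt(value.encode()).decode()

def decrypt_value(token: str) -> str:
    return fernet.decrypt(token.encode()).decode()

creds = {"host": "db.example.com", "password": "s3cr3t!"}
creds["password"] = encrypt_value(creds["password"])  # only sensitive fields are encrypted
assert decrypt_value(creds["password"]) == "s3cr3t!"
```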
src/database_client/database_client_service.py ADDED
@@ -0,0 +1,164 @@
 
 
 
1
+ """Service for managing user-registered external database connections."""
2
+
3
+ import uuid
4
+ from typing import List, Optional
5
+
6
+ from sqlalchemy import delete, select
7
+ from sqlalchemy.ext.asyncio import AsyncSession
8
+
9
+ from src.db.postgres.models import DatabaseClient
10
+ from src.middlewares.logging import get_logger
11
+ from src.utils.db_credential_encryption import (
12
+ decrypt_credentials_dict,
13
+ encrypt_credentials_dict,
14
+ )
15
+
16
+ logger = get_logger("database_client_service")
17
+
18
+
19
+ # Fields that identify the same physical database per db_type.
20
+ _CONNECTION_IDENTITY_KEYS: dict[str, tuple[str, ...]] = {
21
+ "postgres": ("host", "port", "database"),
22
+ "supabase": ("host", "port", "database"),
23
+ "mysql": ("host", "port", "database"),
24
+ "sqlserver": ("host", "port", "database"),
25
+ "bigquery": ("project_id", "dataset_id"),
26
+ "snowflake": ("account", "warehouse", "database"),
27
+ }
28
+
29
+
30
+ class DatabaseClientService:
31
+ """Service for managing user-registered external database connections."""
32
+
33
+ async def _find_duplicate(
34
+ self,
35
+ db: AsyncSession,
36
+ user_id: str,
37
+ db_type: str,
38
+ credentials: dict,
39
+ ) -> Optional[DatabaseClient]:
40
+ """Return an existing client if it points to the same physical database."""
41
+ identity_keys = _CONNECTION_IDENTITY_KEYS.get(db_type, ())
42
+ if not identity_keys:
43
+ return None
44
+
45
+ result = await db.execute(
46
+ select(DatabaseClient).where(
47
+ DatabaseClient.user_id == user_id,
48
+ DatabaseClient.db_type == db_type,
49
+ )
50
+ )
51
+ for existing in result.scalars().all():
52
+ decrypted = decrypt_credentials_dict(existing.credentials)
53
+ if all(
54
+ decrypted.get(k) == credentials.get(k) for k in identity_keys
55
+ ):
56
+ return existing
57
+ return None
58
+
59
+ async def create(
60
+ self,
61
+ db: AsyncSession,
62
+ user_id: str,
63
+ name: str,
64
+ db_type: str,
65
+ credentials: dict,
66
+ ) -> DatabaseClient:
67
+ """Register a new database client connection.
68
+
69
+ If a connection to the same physical database already exists for this
70
+ user, the existing record is returned instead of creating a duplicate.
71
+ Credentials are encrypted before being stored.
72
+ """
73
+ existing = await self._find_duplicate(db, user_id, db_type, credentials)
74
+ if existing:
75
+ logger.info(
76
+ f"Duplicate connection detected, returning existing client {existing.id}"
77
+ )
78
+ return existing
79
+
80
+ client = DatabaseClient(
81
+ id=str(uuid.uuid4()),
82
+ user_id=user_id,
83
+ name=name,
84
+ db_type=db_type,
85
+ credentials=encrypt_credentials_dict(credentials),
86
+ status="active",
87
+ )
88
+ db.add(client)
89
+ await db.commit()
90
+ await db.refresh(client)
91
+ logger.info(f"Created database client {client.id} for user {user_id}")
92
+ return client
93
+
94
+ async def get_user_clients(
95
+ self,
96
+ db: AsyncSession,
97
+ user_id: str,
98
+ ) -> List[DatabaseClient]:
99
+ """Return all active and inactive database clients for a user."""
100
+ result = await db.execute(
101
+ select(DatabaseClient)
102
+ .where(DatabaseClient.user_id == user_id)
103
+ .order_by(DatabaseClient.created_at.desc())
104
+ )
105
+ return result.scalars().all()
106
+
107
+ async def get(
108
+ self,
109
+ db: AsyncSession,
110
+ client_id: str,
111
+ ) -> Optional[DatabaseClient]:
112
+ """Return a single database client by its ID."""
113
+ result = await db.execute(
114
+ select(DatabaseClient).where(DatabaseClient.id == client_id)
115
+ )
116
+ return result.scalars().first()
117
+
118
+ async def update(
119
+ self,
120
+ db: AsyncSession,
121
+ client_id: str,
122
+ name: Optional[str] = None,
123
+ credentials: Optional[dict] = None,
124
+ status: Optional[str] = None,
125
+ ) -> Optional[DatabaseClient]:
126
+ """Update an existing database client connection.
127
+
128
+ Only non-None fields are updated.
129
+ Credentials are re-encrypted if provided.
130
+ """
131
+ client = await self.get(db, client_id)
132
+ if not client:
133
+ return None
134
+
135
+ if name is not None:
136
+ client.name = name
137
+ if credentials is not None:
138
+ client.credentials = encrypt_credentials_dict(credentials)
139
+ if status is not None:
140
+ client.status = status
141
+
142
+ await db.commit()
143
+ await db.refresh(client)
144
+ logger.info(f"Updated database client {client_id}")
145
+ return client
146
+
147
+ async def delete(
148
+ self,
149
+ db: AsyncSession,
150
+ client_id: str,
151
+ ) -> bool:
152
+ """Permanently delete a database client connection."""
153
+ result = await db.execute(
154
+ delete(DatabaseClient).where(DatabaseClient.id == client_id)
155
+ )
156
+ await db.commit()
157
+ deleted = result.rowcount > 0
158
+ if deleted:
159
+ logger.info(f"Deleted database client {client_id}")
160
+ return deleted
161
+
162
+
163
+ database_client_service = DatabaseClientService()
164
+
src/db/postgres/init_db.py CHANGED
@@ -2,7 +2,14 @@

  from sqlalchemy import text
  from src.db.postgres.connection import engine, Base
- from src.db.postgres.models import Document, Room, ChatMessage, User, MessageSource
+ from src.db.postgres.models import (
+     ChatMessage,
+     DatabaseClient,
+     Document,
+     MessageSource,
+     Room,
+     User,
+ )


  async def init_db():
@@ -21,3 +28,38 @@ async def init_db():
          await conn.execute(text(
              "ALTER TABLE rooms ADD COLUMN IF NOT EXISTS status VARCHAR NOT NULL DEFAULT 'active'"
          ))
+
+         # HNSW index for fast approximate vector similarity search.
+         # Only created when the embedding column has explicit dimensions (HNSW requirement).
+         # atttypmod > 0 means the vector column was created with a dimension (e.g. vector(1536));
+         # atttypmod = -1 means dimensionless — HNSW would fail with "column does not have dimensions".
+         await conn.execute(text("""
+             DO $$
+             BEGIN
+                 IF EXISTS (
+                     SELECT FROM pg_attribute a
+                     JOIN pg_class c ON c.oid = a.attrelid
+                     WHERE c.relname = 'langchain_pg_embedding'
+                       AND a.attname = 'embedding'
+                       AND a.atttypmod > 0
+                 ) THEN
+                     CREATE INDEX IF NOT EXISTS idx_langchain_pg_embedding_hnsw
+                     ON langchain_pg_embedding USING hnsw (embedding vector_cosine_ops);
+                 END IF;
+             END $$
+         """))
+
+         # GIN index for FTS on schema chunks — only created if the table exists
+         # (langchain_pg_embedding is created by PGVector on first use, not by create_all)
+         await conn.execute(text("""
+             DO $$
+             BEGIN
+                 IF EXISTS (
+                     SELECT FROM information_schema.tables
+                     WHERE table_name = 'langchain_pg_embedding'
+                 ) THEN
+                     CREATE INDEX IF NOT EXISTS idx_langchain_pg_embedding_fts
+                     ON langchain_pg_embedding USING GIN (to_tsvector('english', document));
+                 END IF;
+             END $$
+         """))
src/db/postgres/models.py CHANGED
@@ -4,6 +4,7 @@ from uuid import uuid4
4
  from sqlalchemy import Column, String, DateTime, Text, Integer, ForeignKey
5
  from sqlalchemy.orm import relationship
6
  from sqlalchemy.sql import func
 
7
  from src.db.postgres.connection import Base
8
 
9
 
@@ -81,3 +82,18 @@ class MessageSource(Base):
81
  created_at = Column(DateTime(timezone=True), server_default=func.now())
82
 
83
  message = relationship("ChatMessage", back_populates="sources")
4
  from sqlalchemy import Column, String, DateTime, Text, Integer, ForeignKey
5
  from sqlalchemy.orm import relationship
6
  from sqlalchemy.sql import func
7
+ from sqlalchemy.dialects.postgresql import JSONB
8
  from src.db.postgres.connection import Base
9
 
10
 
 
82
  created_at = Column(DateTime(timezone=True), server_default=func.now())
83
 
84
  message = relationship("ChatMessage", back_populates="sources")
85
+
86
+
87
+ class DatabaseClient(Base):
88
+ """User-registered external database connections."""
89
+ __tablename__ = "databases"
90
+
91
+ id = Column(String, primary_key=True, default=lambda: str(uuid4()))
92
+ user_id = Column(String, nullable=False, index=True)
93
+ name = Column(String, nullable=False) # display name, e.g. "Prod DB"
94
+ db_type = Column(String, nullable=False) # postgres|mysql|sqlserver|supabase|bigquery|snowflake
95
+ credentials = Column(JSONB, nullable=False) # per-type JSON; sensitive fields Fernet-encrypted
96
+ status = Column(String, nullable=False, default="active") # active | inactive
97
+ created_at = Column(DateTime(timezone=True), server_default=func.now())
98
+ updated_at = Column(DateTime(timezone=True), onupdate=func.now())
99
+
src/document/document_service.py CHANGED
@@ -1,8 +1,9 @@
1
  """Service for managing documents."""
2
 
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
- from sqlalchemy import select, delete
5
  from src.db.postgres.models import Document
 
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.middlewares.logging import get_logger
8
  from typing import List, Optional
@@ -77,6 +78,21 @@ class DocumentService:
77
  # Delete from blob storage
78
  await blob_storage.delete_file(document.blob_name)
79
 
80
  # Delete from database
81
  await db.execute(
82
  delete(Document).where(Document.id == document_id)
 
1
  """Service for managing documents."""
2
 
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
+ from sqlalchemy import select, delete, text
5
  from src.db.postgres.models import Document
6
+ from src.db.postgres.connection import _pgvector_engine
7
  from src.storage.az_blob.az_blob import blob_storage
8
  from src.middlewares.logging import get_logger
9
  from typing import List, Optional
 
78
  # Delete from blob storage
79
  await blob_storage.delete_file(document.blob_name)
80
 
81
+ # Delete vector embeddings from pgvector (scoped to user + collection to avoid cross-user over-delete)
82
+ async with _pgvector_engine.begin() as conn:
83
+ await conn.execute(
84
+ text("""
85
+ DELETE FROM langchain_pg_embedding
86
+ WHERE cmetadata->>'user_id' = :user_id
87
+ AND cmetadata->>'source_type' = 'document'
88
+ AND cmetadata->'data'->>'document_id' = :doc_id
89
+ AND collection_id = (
90
+ SELECT uuid FROM langchain_pg_collection WHERE name = 'document_embeddings'
91
+ )
92
+ """),
93
+ {"user_id": document.user_id, "doc_id": document_id},
94
+ )
95
+
96
  # Delete from database
97
  await db.execute(
98
  delete(Document).where(Document.id == document_id)
src/knowledge/parquet_service.py ADDED
@@ -0,0 +1,77 @@
1
+ """Parquet service — converts, uploads, downloads, and deletes Parquet files for CSV/XLSX.
2
+
3
+ Parquet files are stored in Azure Blob alongside the original document using
4
+ a deterministic naming convention based on document_id:
5
+
6
+ CSV: {user_id}/{document_id}.parquet
7
+ XLSX sheet: {user_id}/{document_id}__{safe_sheet_name}.parquet
8
+
9
+ This allows tabular.py to construct the correct blob name at retrieval time
10
+ without needing to store it separately, and allows document_pipeline.py to
11
+ delete all Parquet files for a document using a prefix delete.
12
+ """
13
+
14
+ import io
15
+
16
+ import pandas as pd
17
+
18
+ from src.middlewares.logging import get_logger
19
+ from src.storage.az_blob.az_blob import blob_storage
20
+
21
+ logger = get_logger("parquet_service")
22
+
23
+
24
+ def _safe_sheet_name(sheet_name: str) -> str:
25
+ return sheet_name.replace("/", "_").replace(" ", "_").replace("\\", "_")
26
+
27
+
28
+ def parquet_blob_name(user_id: str, document_id: str, sheet_name: str | None = None) -> str:
29
+ """Construct deterministic Parquet blob name."""
30
+ if sheet_name:
31
+ return f"{user_id}/{document_id}__{_safe_sheet_name(sheet_name)}.parquet"
32
+ return f"{user_id}/{document_id}.parquet"
33
+
34
+
35
+ def _to_parquet_bytes(df: pd.DataFrame) -> bytes:
36
+ buf = io.BytesIO()
37
+ df.to_parquet(buf, index=False)
38
+ return buf.getvalue()
39
+
40
+
41
+ async def upload_parquet(
42
+ df: pd.DataFrame,
43
+ user_id: str,
44
+ document_id: str,
45
+ sheet_name: str | None = None,
46
+ ) -> str:
47
+ """Convert DataFrame to Parquet and upload to Azure Blob. Returns blob_name."""
48
+ blob_name = parquet_blob_name(user_id, document_id, sheet_name)
49
+ parquet_bytes = _to_parquet_bytes(df)
50
+ await blob_storage.upload_bytes(parquet_bytes, blob_name)
51
+ logger.info(f"Uploaded Parquet {blob_name} ({len(parquet_bytes)} bytes)")
52
+ return blob_name
53
+
54
+
55
+ async def download_parquet(
56
+ user_id: str,
57
+ document_id: str,
58
+ sheet_name: str | None = None,
59
+ ) -> pd.DataFrame:
60
+ """Download Parquet from Azure Blob and return as DataFrame."""
61
+ blob_name = parquet_blob_name(user_id, document_id, sheet_name)
62
+ content = await blob_storage.download_file(blob_name)
63
+ df = pd.read_parquet(io.BytesIO(content))
64
+ logger.info(f"Downloaded Parquet {blob_name}: {len(df)} rows, {len(df.columns)} columns")
65
+ return df
66
+
67
+
68
+ async def delete_document_parquets(user_id: str, document_id: str) -> int:
69
+ """Delete all Parquet files for a document (CSV = 1 file, XLSX = one per sheet).
70
+
71
+ Uses prefix delete: {user_id}/{document_id} matches all Parquet variants
72
+ for this document without touching the original blob (which uses a random UUID name).
73
+ """
74
+ prefix = f"{user_id}/{document_id}"
75
+ deleted = await blob_storage.delete_blobs_with_prefix(prefix)
76
+ logger.info(f"Deleted {deleted} Parquet file(s) for document {document_id}")
77
+ return deleted
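A quick illustration of the naming convention (IDs below are made up):

```python
from src.knowledge.parquet_service import parquet_blob_name

parquet_blob_name("user-1", "doc-42")
# -> "user-1/doc-42.parquet"                  (CSV: one file per document)
parquet_blob_name("user-1", "doc-42", "Q1 / Revenue")
# -> "user-1/doc-42__Q1___Revenue.parquet"    (XLSX: one file per sheet;
#                                              slashes, spaces and backslashes become '_')
```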
src/knowledge/processing_service.py CHANGED
@@ -5,16 +5,20 @@ from langchain_core.documents import Document as LangChainDocument
5
  from src.db.postgres.vector_store import get_vector_store
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.db.postgres.models import Document as DBDocument
8
- from src.config.settings import settings
9
  from sqlalchemy.ext.asyncio import AsyncSession
10
  from src.middlewares.logging import get_logger
11
- from azure.ai.documentintelligence.aio import DocumentIntelligenceClient
12
- from azure.core.credentials import AzureKeyCredential
13
  from typing import List
14
- import pypdf
 
15
  import docx
 
 
 
16
  from io import BytesIO
17
 
 
 
18
  logger = get_logger("knowledge_processing")
19
 
20
 
@@ -40,6 +44,10 @@ class KnowledgeProcessingService:
40
 
41
  if db_doc.file_type == "pdf":
42
  documents = await self._build_pdf_documents(content, db_doc)
 
 
 
 
43
  else:
44
  text = self._extract_text(content, db_doc.file_type)
45
  if not text.strip():
@@ -49,10 +57,15 @@ class KnowledgeProcessingService:
49
  LangChainDocument(
50
  page_content=chunk,
51
  metadata={
52
- "document_id": db_doc.id,
53
  "user_id": db_doc.user_id,
54
- "filename": db_doc.filename,
55
- "chunk_index": i,
 
 
 
 
 
 
56
  }
57
  )
58
  for i, chunk in enumerate(chunks)
@@ -74,62 +87,138 @@ class KnowledgeProcessingService:
74
  async def _build_pdf_documents(
75
  self, content: bytes, db_doc: DBDocument
76
  ) -> List[LangChainDocument]:
77
- """Build LangChain documents from PDF with page_label metadata.
78
-
79
- Uses Azure Document Intelligence (per-page) when credentials are present,
80
- falls back to pypdf (also per-page) otherwise.
81
- """
82
  documents: List[LangChainDocument] = []
83
 
84
- if settings.azureai_docintel_endpoint and settings.azureai_docintel_key:
85
- async with DocumentIntelligenceClient(
86
- endpoint=settings.azureai_docintel_endpoint,
87
- credential=AzureKeyCredential(settings.azureai_docintel_key),
88
- ) as client:
89
- poller = await client.begin_analyze_document(
90
- model_id="prebuilt-read",
91
- body=BytesIO(content),
92
- content_type="application/pdf",
93
- )
94
- result = await poller.result()
95
- logger.info(f"Azure DI extracted {len(result.pages or [])} pages")
96
-
97
- for page in result.pages or []:
98
- page_text = "\n".join(
99
- line.content for line in (page.lines or [])
100
- )
101
- if not page_text.strip():
102
- continue
103
- for chunk in self.text_splitter.split_text(page_text):
104
- documents.append(LangChainDocument(
105
- page_content=chunk,
106
- metadata={
107
- "document_id": db_doc.id,
108
- "user_id": db_doc.user_id,
109
- "filename": db_doc.filename,
110
- "chunk_index": len(documents),
111
- "page_label": page.page_number,
112
- }
113
- ))
114
- else:
115
- logger.warning("Azure DI not configured, using pypdf")
116
- pdf_reader = pypdf.PdfReader(BytesIO(content))
117
- for page_num, page in enumerate(pdf_reader.pages, start=1):
118
- page_text = page.extract_text() or ""
119
- if not page_text.strip():
120
- continue
121
- for chunk in self.text_splitter.split_text(page_text):
122
- documents.append(LangChainDocument(
123
- page_content=chunk,
124
- metadata={
125
  "document_id": db_doc.id,
126
- "user_id": db_doc.user_id,
127
  "filename": db_doc.filename,
 
128
  "chunk_index": len(documents),
129
  "page_label": page_num,
130
- }
131
- ))
132
 
133
  return documents
134
 
135
  def _extract_text(self, content: bytes, file_type: str) -> str:
 
5
  from src.db.postgres.vector_store import get_vector_store
6
  from src.storage.az_blob.az_blob import blob_storage
7
  from src.db.postgres.models import Document as DBDocument
 
8
  from sqlalchemy.ext.asyncio import AsyncSession
9
  from src.middlewares.logging import get_logger
10
+ from src.knowledge.parquet_service import upload_parquet
 
11
  from typing import List
12
+ from datetime import datetime, timezone, timedelta
13
+ import sys
14
  import docx
15
+ import pandas as pd
16
+ import pytesseract
17
+ from pdf2image import convert_from_bytes
18
  from io import BytesIO
19
 
20
+ _JAKARTA_TZ = timezone(timedelta(hours=7))
21
+
22
  logger = get_logger("knowledge_processing")
23
 
24
 
 
44
 
45
  if db_doc.file_type == "pdf":
46
  documents = await self._build_pdf_documents(content, db_doc)
47
+ elif db_doc.file_type == "csv":
48
+ documents = await self._build_csv_documents(content, db_doc)
49
+ elif db_doc.file_type == "xlsx":
50
+ documents = await self._build_excel_documents(content, db_doc)
51
  else:
52
  text = self._extract_text(content, db_doc.file_type)
53
  if not text.strip():
 
57
  LangChainDocument(
58
  page_content=chunk,
59
  metadata={
 
60
  "user_id": db_doc.user_id,
61
+ "source_type": "document",
62
+ "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
63
+ "data": {
64
+ "document_id": db_doc.id,
65
+ "filename": db_doc.filename,
66
+ "file_type": db_doc.file_type,
67
+ "chunk_index": i,
68
+ },
69
  }
70
  )
71
  for i, chunk in enumerate(chunks)
 
87
  async def _build_pdf_documents(
88
  self, content: bytes, db_doc: DBDocument
89
  ) -> List[LangChainDocument]:
90
+ """Build LangChain documents from PDF with page_label metadata using Tesseract OCR."""
 
 
 
 
91
  documents: List[LangChainDocument] = []
92
 
93
+ poppler_path = None
94
+ if sys.platform == "win32":
95
+ pytesseract.pytesseract.tesseract_cmd = r"./software/Tesseract-OCR/tesseract.exe"
96
+ poppler_path = "./software/poppler-24.08.0/Library/bin"
97
+
98
+ images = convert_from_bytes(content, poppler_path=poppler_path)
99
+ logger.info(f"Tesseract OCR: converting {len(images)} pages")
100
+
101
+ for page_num, image in enumerate(images, start=1):
102
+ page_text = pytesseract.image_to_string(image)
103
+ if not page_text.strip():
104
+ continue
105
+ for chunk in self.text_splitter.split_text(page_text):
106
+ documents.append(LangChainDocument(
107
+ page_content=chunk,
108
+ metadata={
109
+ "user_id": db_doc.user_id,
110
+ "source_type": "document",
111
+ "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
112
+ "data": {
 
113
  "document_id": db_doc.id,
 
114
  "filename": db_doc.filename,
115
+ "file_type": db_doc.file_type,
116
  "chunk_index": len(documents),
117
  "page_label": page_num,
118
+ },
119
+ }
120
+ ))
121
+
122
+ return documents
123
+
124
+ def _profile_dataframe(
125
+ self, df: pd.DataFrame, source_name: str, db_doc: DBDocument
126
+ ) -> List[LangChainDocument]:
127
+ """Profile each column of a dataframe → one chunk per column."""
128
+ documents = []
129
+ row_count = len(df)
130
+
131
+ for col_name in df.columns:
132
+ col = df[col_name]
133
+ is_numeric = pd.api.types.is_numeric_dtype(col)
134
+ null_count = int(col.isnull().sum())
135
+ distinct_count = int(col.nunique())
136
+ distinct_ratio = distinct_count / row_count if row_count > 0 else 0
137
+
138
+ text = f"Source: {source_name} ({row_count} rows)\n"
139
+ text += f"Column: {col_name} ({col.dtype})\n"
140
+ text += f"Null count: {null_count}\n"
141
+ text += f"Distinct count: {distinct_count} ({distinct_ratio:.1%})\n"
142
+
143
+ if is_numeric:
144
+ text += f"Min: {col.min()}, Max: {col.max()}\n"
145
+ text += f"Mean: {col.mean():.4f}, Median: {col.median():.4f}\n"
146
+
147
+ if 0 < distinct_ratio <= 0.05:
148
+ top_values = col.value_counts().head(10)
149
+ top_str = ", ".join(f"{v} ({c})" for v, c in top_values.items())
150
+ text += f"Top values: {top_str}\n"
151
+
152
+ text += f"Sample values: {col.dropna().head(5).tolist()}"
153
+
154
+ documents.append(LangChainDocument(
155
+ page_content=text,
156
+ metadata={
157
+ "user_id": db_doc.user_id,
158
+ "source_type": "document",
159
+ "chunk_level": "column",
160
+ "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
161
+ "data": {
162
+ "document_id": db_doc.id,
163
+ "filename": db_doc.filename,
164
+ "file_type": db_doc.file_type,
165
+ "source": source_name,
166
+ "column_name": col_name,
167
+ "column_type": str(col.dtype),
168
+ }
169
+ }
170
+ ))
171
+ return documents
172
+
173
+ def _to_sheet_document(
174
+ self, df: pd.DataFrame, db_doc: DBDocument, sheet_name: str | None, source_name: str
175
+ ) -> LangChainDocument:
176
+ col_summary = ", ".join(f"{c} ({df[c].dtype})" for c in df.columns)
177
+ text = (
178
+ f"Source: {source_name} ({len(df)} rows)\n"
179
+ f"Columns ({len(df.columns)}): {col_summary}"
180
+ )
181
+ return LangChainDocument(
182
+ page_content=text,
183
+ metadata={
184
+ "user_id": db_doc.user_id,
185
+ "source_type": "document",
186
+ "chunk_level": "sheet",
187
+ "updated_at": datetime.now(_JAKARTA_TZ).isoformat(),
188
+ "data": {
189
+ "document_id": db_doc.id,
190
+ "filename": db_doc.filename,
191
+ "file_type": db_doc.file_type,
192
+ "sheet_name": sheet_name,
193
+ "column_names": list(df.columns),
194
+ "row_count": len(df),
195
+ },
196
+ },
197
+ )
198
 
199
+ async def _build_csv_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
200
+ """Profile each column of a CSV file and upload Parquet to Azure Blob."""
201
+ df = pd.read_csv(BytesIO(content))
202
+ await upload_parquet(df, db_doc.user_id, db_doc.id)
203
+ logger.info(f"Uploaded Parquet for CSV {db_doc.id}")
204
+ docs = self._profile_dataframe(df, db_doc.filename, db_doc)
205
+ docs.append(self._to_sheet_document(df, db_doc, sheet_name=None, source_name=db_doc.filename))
206
+ return docs
207
+
208
+ async def _build_excel_documents(self, content: bytes, db_doc: DBDocument) -> List[LangChainDocument]:
209
+ """Profile each column of every sheet in an Excel file and upload one Parquet per sheet."""
210
+ sheets = pd.read_excel(BytesIO(content), sheet_name=None)
211
+ documents = []
212
+ for sheet_name, df in sheets.items():
213
+ source_name = f"{db_doc.filename} / sheet: {sheet_name}"
214
+ docs = self._profile_dataframe(df, source_name, db_doc)
215
+ for doc in docs:
216
+ doc.metadata["data"]["sheet_name"] = sheet_name
217
+ doc.metadata["chunk_level"] = "column"
218
+ documents.extend(docs)
219
+ documents.append(self._to_sheet_document(df, db_doc, sheet_name, source_name))
220
+ await upload_parquet(df, db_doc.user_id, db_doc.id, sheet_name)
221
+ logger.info(f"Uploaded Parquet for sheet '{sheet_name}' of {db_doc.id}")
222
  return documents
223
 
224
  def _extract_text(self, content: bytes, file_type: str) -> str:
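To make the new tabular chunk shape concrete, here is roughly what one column-level chunk looks like for a toy frame (values are illustrative; the real chunks also carry the nested `data` metadata shown above):

```python
import pandas as pd

df = pd.DataFrame({"status": ["open", "closed", "open", "open"]})

# _profile_dataframe emits one chunk per column, shaped roughly like:
#
#   Source: tickets.csv (4 rows)
#   Column: status (object)
#   Null count: 0
#   Distinct count: 2 (50.0%)
#   Sample values: ['open', 'closed', 'open', 'open']
#
# Top values only appear when the distinct ratio is <= 5%, so they are absent here.
```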
src/models/credentials.py ADDED
@@ -0,0 +1,164 @@
1
+ """Pydantic credential schemas for user-registered external databases.
2
+
3
+ Imported by the `/database-clients` API router (`src/api/v1/db_client.py`) and,
4
+ via `DbType`, by the db pipeline service (`src/pipeline/db_pipeline/db_pipeline_service.py`).
5
+
6
+ Sensitive fields (`password`, `service_account_json`) are Fernet-encrypted by
7
+ the database_client service before being stored in the JSONB column; these
8
+ schemas describe the plaintext wire format, not the stored shape.
9
+ """
10
+
11
+ from typing import Literal, Optional, Union
12
+
13
+ from pydantic import BaseModel, Field
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # Supported DB types
17
+ # ---------------------------------------------------------------------------
18
+
19
+ DbType = Literal["postgres", "mysql", "sqlserver", "supabase", "bigquery", "snowflake"]
20
+
21
+
22
+ # ---------------------------------------------------------------------------
23
+ # Typed credential schemas per DB type
24
+ # ---------------------------------------------------------------------------
25
+
26
+
27
+ class PostgresCredentials(BaseModel):
28
+ """Connection credentials for PostgreSQL."""
29
+
30
+ host: str = Field(..., description="Hostname or IP address of the PostgreSQL server.", examples=["db.example.com"])
31
+ port: int = Field(5432, description="Port number (default: 5432).", examples=[5432])
32
+ database: str = Field(..., description="Name of the target database.", examples=["mydb"])
33
+ username: str = Field(..., description="Database username.", examples=["admin"])
34
+ password: str = Field(..., description="Database password. Will be encrypted at rest.", examples=["s3cr3t!"])
35
+ ssl_mode: Literal["disable", "require", "verify-ca", "verify-full"] = Field(
36
+ "require",
37
+ description="SSL mode for the connection.",
38
+ examples=["require"],
39
+ )
40
+
41
+
42
+ class MysqlCredentials(BaseModel):
43
+ """Connection credentials for MySQL."""
44
+
45
+ host: str = Field(..., description="Hostname or IP address of the MySQL server.", examples=["db.example.com"])
46
+ port: int = Field(3306, description="Port number (default: 3306).", examples=[3306])
47
+ database: str = Field(..., description="Name of the target database.", examples=["mydb"])
48
+ username: str = Field(..., description="Database username.", examples=["admin"])
49
+ password: str = Field(..., description="Database password. Will be encrypted at rest.", examples=["s3cr3t!"])
50
+ ssl: bool = Field(True, description="Enable SSL for the connection.", examples=[True])
51
+
52
+
53
+ class SqlServerCredentials(BaseModel):
54
+ """Connection credentials for Microsoft SQL Server."""
55
+
56
+ host: str = Field(..., description="Hostname or IP address of the SQL Server.", examples=["sqlserver.example.com"])
57
+ port: int = Field(1433, description="Port number (default: 1433).", examples=[1433])
58
+ database: str = Field(..., description="Name of the target database.", examples=["mydb"])
59
+ username: str = Field(..., description="Database username.", examples=["sa"])
60
+ password: str = Field(..., description="Database password. Will be encrypted at rest.", examples=["s3cr3t!"])
61
+ driver: Optional[str] = Field(
62
+ None,
63
+ description="ODBC driver name. Leave empty to use the default driver.",
64
+ examples=["ODBC Driver 17 for SQL Server"],
65
+ )
66
+
67
+
68
+ class SupabaseCredentials(BaseModel):
69
+ """Connection credentials for Supabase (PostgreSQL-based).
70
+
71
+ Use the connection string details from your Supabase project dashboard
72
+ under Settings > Database.
73
+ """
74
+
75
+ host: str = Field(
76
+ ...,
77
+ description="Supabase database host (e.g. db.<project-ref>.supabase.co, or the pooler host).",
78
+ examples=["db.xxxx.supabase.co"],
79
+ )
80
+ port: int = Field(
81
+ 5432,
82
+ description="Port number. Use 5432 for direct connection, 6543 for the connection pooler.",
83
+ examples=[5432],
84
+ )
85
+ database: str = Field("postgres", description="Database name (always 'postgres' for Supabase).", examples=["postgres"])
86
+ username: str = Field(
87
+ ...,
88
+ description="Database user. Use 'postgres' for direct connection, or 'postgres.<project-ref>' for the pooler.",
89
+ examples=["postgres"],
90
+ )
91
+ password: str = Field(..., description="Database password (set in Supabase dashboard). Will be encrypted at rest.", examples=["s3cr3t!"])
92
+ ssl_mode: Literal["require", "verify-ca", "verify-full"] = Field(
93
+ "require",
94
+ description="SSL mode. Supabase always requires SSL.",
95
+ examples=["require"],
96
+ )
97
+
98
+
99
+ class BigQueryCredentials(BaseModel):
100
+ """Connection credentials for Google BigQuery.
101
+
102
+ Requires a GCP Service Account with at least BigQuery Data Viewer
103
+ and BigQuery Job User roles.
104
+ """
105
+
106
+ project_id: str = Field(..., description="GCP project ID where the BigQuery dataset resides.", examples=["my-gcp-project"])
107
+ dataset_id: str = Field(..., description="BigQuery dataset name to connect to.", examples=["my_dataset"])
108
+ location: Optional[str] = Field(
109
+ "US",
110
+ description="Dataset location/region (default: US).",
111
+ examples=["US", "EU", "asia-southeast1"],
112
+ )
113
+ service_account_json: str = Field(
114
+ ...,
115
+ description=(
116
+ "Full content of the GCP Service Account key JSON file as a string. "
117
+ "Will be encrypted at rest."
118
+ ),
119
+ examples=['{"type":"service_account","project_id":"my-gcp-project","private_key_id":"..."}'],
120
+ )
121
+
122
+
123
+ class SnowflakeCredentials(BaseModel):
124
+ """Connection credentials for Snowflake."""
125
+
126
+ account: str = Field(
127
+ ...,
128
+ description="Snowflake account identifier, including region if applicable (e.g. myaccount.us-east-1).",
129
+ examples=["myaccount.us-east-1"],
130
+ )
131
+ warehouse: str = Field(..., description="Name of the virtual warehouse to use for queries.", examples=["COMPUTE_WH"])
132
+ database: str = Field(..., description="Name of the target Snowflake database.", examples=["MY_DB"])
133
+ db_schema: Optional[str] = Field("PUBLIC", alias="schema", description="Schema name (default: PUBLIC).", examples=["PUBLIC"])
134
+ username: str = Field(..., description="Snowflake username.", examples=["admin"])
135
+ password: str = Field(..., description="Snowflake password. Will be encrypted at rest.", examples=["s3cr3t!"])
136
+ role: Optional[str] = Field(None, description="Snowflake role to assume for the session.", examples=["SYSADMIN"])
137
+
138
+
139
+ # Union of all credential shapes — reserved for future typed validation on
140
+ # DatabaseClientCreate.credentials (currently Dict[str, Any]). Kept exported
141
+ # so downstream code can reference it without re-declaring.
142
+ CredentialsUnion = Union[
143
+ PostgresCredentials,
144
+ MysqlCredentials,
145
+ SqlServerCredentials,
146
+ SupabaseCredentials,
147
+ BigQueryCredentials,
148
+ SnowflakeCredentials,
149
+ ]
150
+
151
+
152
+ # Doc-only helper: surfaces per-type credential shapes in the Swagger "Schemas"
153
+ # panel so API consumers can discover the exact field set for each db_type.
154
+ # Not referenced by any endpoint — importing it in db_client.py is enough for
155
+ # FastAPI's OpenAPI generator to pick it up.
156
+ class CredentialSchemas(BaseModel):
157
+ """Reference schemas for `credentials` per `db_type` (Swagger-only, not used by endpoints)."""
158
+
159
+ postgres: PostgresCredentials
160
+ mysql: MysqlCredentials
161
+ sqlserver: SqlServerCredentials
162
+ supabase: SupabaseCredentials
163
+ bigquery: BigQueryCredentials
164
+ snowflake: SnowflakeCredentials
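A minimal example of the wire format these schemas describe (Pydantic v2 API assumed; on v1 use `.dict()` instead of `.model_dump()`):

```python
from src.models.credentials import PostgresCredentials

creds = PostgresCredentials(
    host="db.example.com",
    database="mydb",
    username="admin",
    password="s3cr3t!",
)
assert creds.port == 5432 and creds.ssl_mode == "require"  # defaults applied

# Plaintext dict as it reaches the service; the password is Fernet-encrypted before storage.
payload = creds.model_dump()
```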
src/models/sql_query.py ADDED
@@ -0,0 +1,8 @@
1
+ """Structured output model for LLM-generated SQL queries."""
2
+
3
+ from pydantic import BaseModel, Field
4
+
5
+
6
+ class SQLQuery(BaseModel):
7
+ sql: str = Field(description="A single SQL SELECT statement. No markdown, no explanation inline.")
8
+ reasoning: str = Field(description="One sentence: what this query answers.")
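For reference, the structured object the executor expects back from the LLM (contents are illustrative):

```python
from src.models.sql_query import SQLQuery

SQLQuery(
    sql="SELECT status, COUNT(*) AS order_count FROM orders GROUP BY status LIMIT 100",
    reasoning="Counts orders per status.",
)
```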
src/models/structured_output.py CHANGED
@@ -19,3 +19,7 @@ class IntentClassification(BaseModel):
19
  default="",
20
  description="Direct response if no search needed (for greetings, etc.)"
21
  )
19
  default="",
20
  description="Direct response if no search needed (for greetings, etc.)"
21
  )
22
+ source_hint: str = Field(
23
+ default="both",
24
+ description="Which sources to search: 'document' (PDF/DOCX/TXT), 'schema' (DB/CSV/XLSX), or 'both'"
25
+ )
src/pipeline/db_pipeline/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from src.pipeline.db_pipeline.db_pipeline_service import DbPipelineService, db_pipeline_service
2
+
3
+ __all__ = ["DbPipelineService", "db_pipeline_service"]
src/pipeline/db_pipeline/db_pipeline_service.py ADDED
@@ -0,0 +1,302 @@
1
+ """Service for ingesting a user's external database into the vector store.
2
+
3
+ End-to-end flow: connect -> introspect schema -> profile columns -> build text
4
+ -> embed + store in the shared PGVector collection (tagged with
5
+ `source_type="database"`, retrievable via the same retriever used for docs).
6
+
7
+ Sync DB work (SQLAlchemy inspect, pandas read_sql) runs in a threadpool;
8
+ async vector writes stay on the event loop.
9
+ """
10
+
11
+ import asyncio
12
+ from contextlib import contextmanager
13
+ from datetime import datetime, timezone, timedelta
14
+ from typing import Any, Iterator, Optional
15
+
16
+ from langchain_core.documents import Document as LangChainDocument
17
+ from sqlalchemy import URL, create_engine, text
18
+ from sqlalchemy.engine import Engine
19
+
20
+ from src.db.postgres.connection import _pgvector_engine
21
+ from src.db.postgres.vector_store import get_vector_store
22
+ from src.middlewares.logging import get_logger
23
+ from src.models.credentials import DbType
24
+ from src.pipeline.db_pipeline.extractor import (
25
+ build_table_chunk,
26
+ fetch_sample_row,
27
+ get_row_count,
28
+ get_schema,
29
+ profile_table,
30
+ )
31
+
32
+ logger = get_logger("db_pipeline")
33
+
34
+
35
+ class DbPipelineService:
36
+ """End-to-end DB ingestion: connect -> introspect -> profile -> embed -> store."""
37
+
38
+ def connect(self, db_type: DbType, credentials: dict[str, Any]) -> Engine:
39
+ """Build a SQLAlchemy engine for the user's database.
40
+
41
+ `credentials` is the plaintext dict matching the per-type schema in
42
+ `src/models/credentials.py`. BigQuery/Snowflake auth models differ
43
+ from host/port/user/pass, so every shape flows through one dict.
44
+
45
+ Optional driver imports (snowflake-sqlalchemy, json for BigQuery) are
46
+ done lazily so an env missing one driver doesn't break module import.
47
+ """
48
+ logger.info("connecting to user db", db_type=db_type)
49
+
50
+ if db_type in ("postgres", "supabase"):
51
+ query = (
52
+ {"sslmode": credentials["ssl_mode"]} if credentials.get("ssl_mode") else {}
53
+ )
54
+ url = URL.create(
55
+ drivername="postgresql+psycopg2",
56
+ username=credentials["username"],
57
+ password=credentials["password"],
58
+ host=credentials["host"],
59
+ port=credentials["port"],
60
+ database=credentials["database"],
61
+ query=query,
62
+ )
63
+ return create_engine(url)
64
+
65
+ if db_type == "mysql":
66
+ url = URL.create(
67
+ drivername="mysql+pymysql",
68
+ username=credentials["username"],
69
+ password=credentials["password"],
70
+ host=credentials["host"],
71
+ port=credentials["port"],
72
+ database=credentials["database"],
73
+ )
74
+ # pymysql only activates TLS when the `ssl` dict is truthy
75
+ # (empty dict is falsy and silently disables TLS). Use system-
76
+ # default CAs via certifi + hostname verification — required by
77
+ # managed MySQL providers like TiDB Cloud / PlanetScale / Aiven.
78
+ if credentials.get("ssl", True):
79
+ import certifi
80
+
81
+ connect_args = {
82
+ "ssl": {
83
+ "ca": certifi.where(),
84
+ "check_hostname": True,
85
+ }
86
+ }
87
+ else:
88
+ connect_args = {}
89
+ return create_engine(url, connect_args=connect_args)
90
+
91
+ if db_type == "sqlserver":
92
+ # `driver` applies to pyodbc only; we ship pymssql. Accept-and-ignore
93
+ # keeps the credential schema stable.
94
+ if credentials.get("driver"):
95
+ logger.info(
96
+ "sqlserver driver hint ignored (using pymssql)",
97
+ driver=credentials["driver"],
98
+ )
99
+ url = URL.create(
100
+ drivername="mssql+pymssql",
101
+ username=credentials["username"],
102
+ password=credentials["password"],
103
+ host=credentials["host"],
104
+ port=credentials["port"],
105
+ database=credentials["database"],
106
+ )
107
+ return create_engine(url)
108
+
109
+ if db_type == "bigquery":
110
+ import json
111
+
112
+ sa_info = json.loads(credentials["service_account_json"])
113
+ # sqlalchemy-bigquery URL shape: bigquery://<project>/<dataset>
114
+ url = f"bigquery://{credentials['project_id']}/{credentials['dataset_id']}"
115
+ return create_engine(
116
+ url,
117
+ credentials_info=sa_info,
118
+ location=credentials.get("location", "US"),
119
+ )
120
+
121
+ if db_type == "snowflake":
122
+ from snowflake.sqlalchemy import URL as SnowflakeURL
123
+
124
+ url = SnowflakeURL(
125
+ account=credentials["account"],
126
+ user=credentials["username"],
127
+ password=credentials["password"],
128
+ database=credentials["database"],
129
+ schema=(
130
+ credentials.get("db_schema")
131
+ or credentials.get("schema")
132
+ or "PUBLIC"
133
+ ),
134
+ warehouse=credentials["warehouse"],
135
+ role=credentials.get("role") or "",
136
+ )
137
+ return create_engine(url)
138
+
139
+ raise NotImplementedError(f"Unsupported db_type: {db_type}")
140
+
141
+ @contextmanager
142
+ def engine_scope(
143
+ self, db_type: DbType, credentials: dict[str, Any]
144
+ ) -> Iterator[Engine]:
145
+ """Yield a connected Engine and dispose its pool on exit.
146
+
147
+ API callers should prefer this over raw `connect(...)` so user DB
148
+ connection pools do not leak between pipeline runs.
149
+ """
150
+ engine = self.connect(db_type, credentials)
151
+ try:
152
+ yield engine
153
+ finally:
154
+ engine.dispose()
155
+
156
+ def _to_document(
157
+ self, user_id: str, client_id: str, table_name: str, entry: dict, updated_at: str
158
+ ) -> LangChainDocument:
159
+ col = entry["col"]
160
+ return LangChainDocument(
161
+ page_content=entry["text"],
162
+ metadata={
163
+ "user_id": user_id,
164
+ "source_type": "database",
165
+ "chunk_level": "column",
166
+ "database_client_id": client_id,
167
+ "updated_at": updated_at,
168
+ "data": {
169
+ "table_name": table_name,
170
+ "column_name": col["name"],
171
+ "column_type": col["type"],
172
+ "is_primary_key": col.get("is_primary_key", False),
173
+ "foreign_key": col.get("foreign_key"),
174
+ },
175
+ },
176
+ )
177
+
178
+ def _to_table_document(
179
+ self,
180
+ user_id: str,
181
+ client_id: str,
182
+ table_name: str,
183
+ columns: list[dict],
184
+ row_count: int,
185
+ text: str,
186
+ updated_at: str,
187
+ ) -> LangChainDocument:
188
+ foreign_keys = []
189
+ for c in columns:
190
+ fk = c.get("foreign_key")
191
+ if not fk:
192
+ continue
193
+ target_table, _, target_column = fk.partition(".")
194
+ foreign_keys.append({
195
+ "column": c["name"],
196
+ "target_table": target_table,
197
+ "target_column": target_column,
198
+ })
199
+
200
+ return LangChainDocument(
201
+ page_content=text,
202
+ metadata={
203
+ "user_id": user_id,
204
+ "source_type": "database",
205
+ "chunk_level": "table",
206
+ "database_client_id": client_id,
207
+ "updated_at": updated_at,
208
+ "data": {
209
+ "table_name": table_name,
210
+ "row_count": row_count,
211
+ "primary_key": [c["name"] for c in columns if c.get("is_primary_key")],
212
+ "foreign_keys": foreign_keys,
213
+ "column_names": [c["name"] for c in columns],
214
+ },
215
+ },
216
+ )
217
+
218
+ async def run(
219
+ self,
220
+ user_id: str,
221
+ client_id: str,
222
+ engine: Engine,
223
+ exclude_tables: Optional[frozenset[str]] = None,
224
+ ) -> int:
225
+ """Introspect the user's DB, profile columns, embed descriptions, store in PGVector.
226
+
227
+ Returns:
228
+ Total number of chunks ingested.
229
+ """
230
+ vector_store = get_vector_store()
231
+ logger.info("db pipeline start", user_id=user_id)
232
+
233
+ # Profile first — if this fails, old embeddings are untouched
234
+ schema = await asyncio.to_thread(get_schema, engine, exclude_tables)
235
+
236
+ updated_at = datetime.now(timezone(timedelta(hours=7))).isoformat()
237
+ all_docs: list = []
238
+ for table_name, columns in schema.items():
239
+ logger.info("profiling table", table=table_name, columns=len(columns))
240
+ entries = await asyncio.to_thread(profile_table, engine, table_name, columns)
241
+ docs = [self._to_document(user_id, client_id, table_name, e, updated_at) for e in entries]
242
+ all_docs.extend(docs)
243
+
244
+ # Table-level chunk. Failures here are logged and skipped — column
245
+ # chunks above are already in all_docs and will still be written.
246
+ try:
247
+ row_count = await asyncio.to_thread(get_row_count, engine, table_name)
248
+ sample_row = (
249
+ await asyncio.to_thread(fetch_sample_row, engine, table_name)
250
+ if row_count > 0
251
+ else None
252
+ )
253
+ table_text = build_table_chunk(
254
+ table_name, row_count, columns, entries, sample_row
255
+ )
256
+ all_docs.append(
257
+ self._to_table_document(
258
+ user_id, client_id, table_name, columns, row_count, table_text, updated_at
259
+ )
260
+ )
261
+ except Exception as e:
262
+ logger.error(
263
+ "table chunk generation failed", table=table_name, error=str(e)
264
+ )
265
+
266
+ logger.info("profiled table", table=table_name, count=len(docs))
267
+
268
+ # Insert new chunks first; only delete stale chunks after the insert succeeds.
269
+ # Prevents data loss if aadd_documents fails — old embeddings stay queryable
270
+ # until they're proven replaceable. Stale rows are identified by an older
271
+ # updated_at than this run.
272
+ if not all_docs:
273
+ logger.warning(
274
+ "no docs produced from schema; skipping delete to preserve existing embeddings",
275
+ user_id=user_id,
276
+ client_id=client_id,
277
+ )
278
+ return 0
279
+
280
+ await vector_store.aadd_documents(all_docs)
281
+
282
+ async with _pgvector_engine.begin() as conn:
283
+ result = await conn.execute(
284
+ text(
285
+ "DELETE FROM langchain_pg_embedding "
286
+ "WHERE cmetadata->>'user_id' = :user_id "
287
+ " AND cmetadata->>'source_type' = 'database' "
288
+ " AND cmetadata->>'database_client_id' = :client_id "
289
+ " AND cmetadata->>'updated_at' < :updated_at "
290
+ " AND collection_id = ("
291
+ " SELECT uuid FROM langchain_pg_collection WHERE name = 'document_embeddings'"
292
+ " )"
293
+ ),
294
+ {"user_id": user_id, "client_id": client_id, "updated_at": updated_at},
295
+ )
296
+ logger.info("cleared stale db embeddings", user_id=user_id, deleted=result.rowcount)
297
+
298
+ logger.info("db pipeline complete", user_id=user_id, total=len(all_docs))
299
+ return len(all_docs)
300
+
301
+
302
+ db_pipeline_service = DbPipelineService()
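Sketch of how the API layer is expected to drive a (re)ingestion run. The route and session wiring around it are assumptions; `engine_scope`, `run`, and `decrypt_credentials_dict` are names added in this PR:

```python
from src.pipeline.db_pipeline import db_pipeline_service
from src.utils.db_credential_encryption import decrypt_credentials_dict


async def reingest(client) -> int:
    """`client` is a DatabaseClient row; returns the number of chunks ingested."""
    # decrypt_credentials_dict is assumed to accept the stored JSONB dict and return plaintext.
    credentials = decrypt_credentials_dict(client.credentials)
    # engine_scope disposes the user's connection pool when the block exits.
    with db_pipeline_service.engine_scope(client.db_type, credentials) as engine:
        return await db_pipeline_service.run(
            user_id=client.user_id,
            client_id=client.id,
            engine=engine,
        )
```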
src/pipeline/db_pipeline/extractor.py ADDED
@@ -0,0 +1,283 @@
1
+ """Schema introspection and per-column profiling for a user's database.
2
+
3
+ Identifiers (table/column names) are quoted via the engine's dialect preparer,
4
+ which handles reserved words, mixed case, and embedded quotes correctly across
5
+ dialects. Values used in SQL come from SQLAlchemy inspection of the DB itself,
6
+ not user input.
7
+ """
8
+
9
+ from typing import Optional
10
+
11
+ import pandas as pd
12
+ from sqlalchemy import Float, Integer, Numeric, inspect
13
+ from sqlalchemy.engine import Engine
14
+
15
+ from src.middlewares.logging import get_logger
16
+
17
+ logger = get_logger("db_extractor")
18
+
19
+ TOP_VALUES_THRESHOLD = 0.05 # show top values if distinct_ratio <= 5%
20
+
21
+ # Dialects where PERCENTILE_CONT(...) WITHIN GROUP is supported as an aggregate.
22
+ # MySQL has no percentile aggregate; BigQuery has PERCENTILE_CONT only as an
23
+ # analytic (window) function — both drop median and keep min/max/mean.
24
+ _MEDIAN_DIALECTS = frozenset({"postgresql", "mssql", "snowflake"})
25
+
26
+
27
+ def _supports_median(engine: Engine) -> bool:
28
+ return engine.dialect.name in _MEDIAN_DIALECTS
29
+
30
+
31
+ def _head_query(
32
+ engine: Engine,
33
+ select_clause: str,
34
+ from_clause: str,
35
+ n: int,
36
+ order_by: str = "",
37
+ ) -> str:
38
+ """LIMIT/TOP-equivalent head query for the engine's dialect."""
39
+ if engine.dialect.name == "mssql":
40
+ return f"SELECT TOP {n} {select_clause} FROM {from_clause} {order_by}".strip()
41
+ return f"SELECT {select_clause} FROM {from_clause} {order_by} LIMIT {n}".strip()
42
+
43
+
44
+ def _qi(engine: Engine, name: str) -> str:
45
+ """Dialect-correct identifier quoting (schema.table also handled if dotted)."""
46
+ preparer = engine.dialect.identifier_preparer
47
+ if "." in name:
48
+ schema, _, table = name.partition(".")
49
+ return f"{preparer.quote(schema)}.{preparer.quote(table)}"
50
+ return preparer.quote(name)
51
+
52
+
53
+ def get_schema(
54
+ engine: Engine, exclude_tables: Optional[frozenset[str]] = None
55
+ ) -> dict[str, list[dict]]:
56
+ """Returns {table_name: [{name, type, is_numeric, is_primary_key, foreign_key}, ...]}."""
57
+ exclude = exclude_tables or frozenset()
58
+ inspector = inspect(engine)
59
+ schema = {}
60
+ for table_name in inspector.get_table_names():
61
+ if table_name in exclude:
62
+ continue
63
+
64
+ pk = inspector.get_pk_constraint(table_name)
65
+ pk_cols = set(pk["constrained_columns"]) if pk else set()
66
+
67
+ fk_map = {}
68
+ for fk in inspector.get_foreign_keys(table_name):
69
+ for col, ref_col in zip(fk["constrained_columns"], fk["referred_columns"]):
70
+ fk_map[col] = f"{fk['referred_table']}.{ref_col}"
71
+
72
+ cols = inspector.get_columns(table_name)
73
+ schema[table_name] = [
74
+ {
75
+ "name": c["name"],
76
+ "type": str(c["type"]),
77
+ "is_numeric": isinstance(c["type"], (Integer, Numeric, Float)),
78
+ "is_primary_key": c["name"] in pk_cols,
79
+ "foreign_key": fk_map.get(c["name"]),
80
+ }
81
+ for c in cols
82
+ ]
83
+ logger.info("extracted schema", table_count=len(schema))
84
+ return schema
85
+
86
+
87
+ def get_row_count(engine: Engine, table_name: str) -> int:
88
+ # Cast to plain int — pandas returns numpy.int64 which fails JSONB serialization
89
+ # when the value lands in PGVector cmetadata via the table-level chunk.
90
+ return int(pd.read_sql(f"SELECT COUNT(*) FROM {_qi(engine, table_name)}", engine).iloc[0, 0])
91
+
92
+
93
+ def profile_column(
94
+ engine: Engine,
95
+ table_name: str,
96
+ col_name: str,
97
+ is_numeric: bool,
98
+ row_count: int,
99
+ ) -> dict:
100
+ """Returns null_count, distinct_count, min/max, top values, and sample values."""
101
+ if row_count == 0:
102
+ return {
103
+ "null_count": 0,
104
+ "distinct_count": 0,
105
+ "distinct_ratio": 0.0,
106
+ "sample_values": [],
107
+ }
108
+
109
+ qt = _qi(engine, table_name)
110
+ qc = _qi(engine, col_name)
111
+
112
+ # Combined stats query: null_count, distinct_count, and min/max (if numeric).
113
+ # One round-trip instead of two.
114
+ select_cols = [
115
+ f"COUNT(*) - COUNT({qc}) AS nulls",
116
+ f"COUNT(DISTINCT {qc}) AS distincts",
117
+ ]
118
+ if is_numeric:
119
+ select_cols.append(f"MIN({qc}) AS min_val")
120
+ select_cols.append(f"MAX({qc}) AS max_val")
121
+ select_cols.append(f"AVG({qc}) AS mean_val")
122
+ if _supports_median(engine):
123
+ select_cols.append(
124
+ f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {qc}) AS median_val"
125
+ )
126
+ stats = pd.read_sql(f"SELECT {', '.join(select_cols)} FROM {qt}", engine)
127
+
128
+ null_count = int(stats.iloc[0]["nulls"])
129
+ distinct_count = int(stats.iloc[0]["distincts"])
130
+ distinct_ratio = distinct_count / row_count if row_count > 0 else 0
131
+
132
+ profile = {
133
+ "null_count": null_count,
134
+ "distinct_count": distinct_count,
135
+ "distinct_ratio": round(distinct_ratio, 4),
136
+ }
137
+
138
+ if is_numeric:
139
+ profile["min"] = stats.iloc[0]["min_val"]
140
+ profile["max"] = stats.iloc[0]["max_val"]
141
+ profile["mean"] = stats.iloc[0]["mean_val"]
142
+ if _supports_median(engine):
143
+ profile["median"] = stats.iloc[0]["median_val"]
144
+
145
+ if 0 < distinct_ratio <= TOP_VALUES_THRESHOLD:
146
+ top_sql = _head_query(
147
+ engine,
148
+ select_clause=f"{qc}, COUNT(*) AS cnt",
149
+ from_clause=f"{qt} GROUP BY {qc}",
150
+ n=10,
151
+ order_by="ORDER BY cnt DESC",
152
+ )
153
+ top = pd.read_sql(top_sql, engine)
154
+ profile["top_values"] = list(zip(top.iloc[:, 0].tolist(), top["cnt"].tolist()))
155
+
156
+ sample = pd.read_sql(_head_query(engine, qc, qt, 5), engine)
157
+ profile["sample_values"] = sample.iloc[:, 0].tolist()
158
+
159
+ return profile
160
+
161
+
162
+ def profile_table(engine: Engine, table_name: str, columns: list[dict]) -> list[dict]:
163
+ """Profile every column in a table. Returns [{col, profile, text}, ...].
164
+
165
+ Per-column errors are logged and skipped so one bad column doesn't abort
166
+ the whole table.
167
+ """
168
+ row_count = get_row_count(engine, table_name)
169
+ if row_count == 0:
170
+ logger.info("skipping empty table", table=table_name)
171
+ return []
172
+
173
+ results = []
174
+ for col in columns:
175
+ try:
176
+ profile = profile_column(
177
+ engine, table_name, col["name"], col.get("is_numeric", False), row_count
178
+ )
179
+ text = build_text(table_name, row_count, col, profile)
180
+ results.append({"col": col, "profile": profile, "text": text})
181
+ except Exception as e:
182
+ logger.error(
183
+ "column profiling failed",
184
+ table=table_name,
185
+ column=col["name"],
186
+ error=str(e),
187
+ )
188
+ continue
189
+ return results
190
+
191
+
192
+ def fetch_sample_row(engine: Engine, table_name: str) -> Optional[dict]:
193
+ """First row of the table as a dict, or None if the table is empty.
194
+
195
+ Reuses _qi for dialect-correct quoting and _head_query for TOP/LIMIT.
196
+ """
197
+ qt = _qi(engine, table_name)
198
+ sql = _head_query(engine, "*", qt, 1)
199
+ df = pd.read_sql(sql, engine)
200
+ if df.empty:
201
+ return None
202
+ return df.iloc[0].to_dict()
203
+
204
+
205
+ def build_table_chunk(
206
+ table_name: str,
207
+ row_count: int,
208
+ columns: list[dict],
209
+ column_profiles: list[dict],
210
+ sample_row: Optional[dict],
211
+ ) -> str:
212
+ """Build the table-level chunk text.
213
+
214
+ Format (lines omitted when not applicable):
215
+ Table: {name} ({row_count} rows)
216
+ Primary key: {pk_cols}
217
+ Foreign keys: {col} -> {target_table}.{target_col}, ...
218
+ Columns ({n}): {col1}, {col2}, ...
219
+ Numeric ranges: {col} [{min}-{max}], ...
220
+ Sample row: {dict}
221
+
222
+ Pure formatter — no DB I/O. column_profiles is the output of profile_table
223
+ and is reused so we don't re-introspect.
224
+ """
225
+ lines = [f"Table: {table_name} ({row_count} rows)"]
226
+
227
+ pk_cols = [c["name"] for c in columns if c.get("is_primary_key")]
228
+ if pk_cols:
229
+ lines.append(f"Primary key: {', '.join(pk_cols)}")
230
+
231
+ fk_parts = [
232
+ f"{c['name']} -> {c['foreign_key']}" for c in columns if c.get("foreign_key")
233
+ ]
234
+ if fk_parts:
235
+ lines.append(f"Foreign keys: {', '.join(fk_parts)}")
236
+
237
+ col_names = [c["name"] for c in columns]
238
+ lines.append(f"Columns ({len(col_names)}): {', '.join(col_names)}")
239
+
240
+ range_parts = []
241
+ for entry in column_profiles:
242
+ col = entry["col"]
243
+ profile = entry["profile"]
244
+ if not col.get("is_numeric"):
245
+ continue
246
+ mn = profile.get("min")
247
+ mx = profile.get("max")
248
+ if mn is None or mx is None:
249
+ continue
250
+ range_parts.append(f"{col['name']} [{mn}-{mx}]")
251
+ if range_parts:
252
+ lines.append(f"Numeric ranges: {', '.join(range_parts)}")
253
+
254
+ if sample_row is not None:
255
+ lines.append(f"Sample row: {sample_row}")
256
+
257
+ return "\n".join(lines)
258
+
259
+
260
+ def build_text(table_name: str, row_count: int, col: dict, profile: dict) -> str:
261
+ col_name = col["name"]
262
+ col_type = col["type"]
263
+
264
+ key_label = ""
265
+ if col.get("is_primary_key"):
266
+ key_label = " [PRIMARY KEY]"
267
+ elif col.get("foreign_key"):
268
+ key_label = f" [FK -> {col['foreign_key']}]"
269
+
270
+ text = f"Table: {table_name} ({row_count} rows)\n"
271
+ text += f"Column: {col_name} ({col_type}){key_label}\n"
272
+ text += f"Null count: {profile['null_count']}\n"
273
+ text += f"Distinct count: {profile['distinct_count']} ({profile['distinct_ratio']:.1%})\n"
274
+ if "min" in profile:
275
+ text += f"Min: {profile['min']}, Max: {profile['max']}\n"
276
+ text += f"Mean: {profile['mean']}\n"
277
+ if profile.get("median") is not None:
278
+ text += f"Median: {profile['median']}\n"
279
+ if "top_values" in profile:
280
+ top_str = ", ".join(f"{v} ({c})" for v, c in profile["top_values"])
281
+ text += f"Top values: {top_str}\n"
282
+ text += f"Sample values: {profile['sample_values']}"
283
+ return text
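To show what ends up embedded, here is the column-level chunk text `build_text` produces for made-up inputs:

```python
from src.pipeline.db_pipeline.extractor import build_text

col = {"name": "status", "type": "VARCHAR", "is_primary_key": False, "foreign_key": None}
profile = {
    "null_count": 0,
    "distinct_count": 3,
    "distinct_ratio": 0.0003,
    "top_values": [("shipped", 5200), ("pending", 3100), ("cancelled", 1700)],
    "sample_values": ["shipped", "pending", "shipped", "shipped", "cancelled"],
}
print(build_text("orders", 10000, col, profile))
# Table: orders (10000 rows)
# Column: status (VARCHAR)
# Null count: 0
# Distinct count: 3 (0.0%)
# Top values: shipped (5200), pending (3100), cancelled (1700)
# Sample values: ['shipped', 'pending', 'shipped', 'shipped', 'cancelled']
```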
src/pipeline/document_pipeline/__init__.py ADDED
File without changes
src/pipeline/document_pipeline/document_pipeline.py ADDED
@@ -0,0 +1,94 @@
1
+ """Document upload and processing pipeline."""
2
+
3
+ from fastapi import HTTPException, UploadFile
4
+ from sqlalchemy.ext.asyncio import AsyncSession
5
+
6
+ from src.document.document_service import document_service
7
+ from src.knowledge.processing_service import knowledge_processor
8
+ from src.knowledge.parquet_service import delete_document_parquets
9
+ from src.middlewares.logging import get_logger
10
+ from src.storage.az_blob.az_blob import blob_storage
11
+
12
+ logger = get_logger("document_pipeline")
13
+
14
+ # NOTE: Keep in sync with _DOC_TYPES in src/api/v1/document.py
15
+ SUPPORTED_FILE_TYPES = ["pdf", "docx", "txt", "csv", "xlsx"]
16
+ MAX_FILE_SIZE_BYTES = 10 * 1024 * 1024 # 10 MB
17
+
18
+
19
+ class DocumentPipeline:
20
+ """Orchestrates the full document upload, process, and delete flows."""
21
+
22
+ async def upload(self, file: UploadFile, user_id: str, db: AsyncSession) -> dict:
23
+ """Validate → upload to blob → save to DB."""
24
+ content = await file.read()
25
+ if not file.filename:
26
+ raise HTTPException(status_code=400, detail="Filename is required.")
27
+ file_type = file.filename.split(".")[-1].lower() if "." in file.filename else "txt"
28
+
29
+ if len(content) > MAX_FILE_SIZE_BYTES:
30
+ raise HTTPException(
31
+ status_code=400,
32
+ detail="File size exceeds maximum allowed size of 10 MB.",
33
+ )
34
+
35
+ if file_type not in SUPPORTED_FILE_TYPES:
36
+ raise HTTPException(
37
+ status_code=400,
38
+ detail=f"Unsupported file type. Supported: {', '.join(SUPPORTED_FILE_TYPES)}",
39
+ )
40
+
41
+ blob_name = await blob_storage.upload_file(content, file.filename, user_id)
42
+ document = await document_service.create_document(
43
+ db=db,
44
+ user_id=user_id,
45
+ filename=file.filename,
46
+ blob_name=blob_name,
47
+ file_size=len(content),
48
+ file_type=file_type,
49
+ )
50
+
51
+ logger.info(f"Uploaded document {document.id} for user {user_id}")
52
+ return {"id": document.id, "filename": document.filename, "status": document.status}
53
+
54
+ async def process(self, document_id: str, user_id: str, db: AsyncSession) -> dict:
55
+ """Validate ownership → extract text → chunk → ingest to vector store."""
56
+ document = await document_service.get_document(db, document_id)
57
+
58
+ if not document:
59
+ raise HTTPException(status_code=404, detail="Document not found")
60
+ if document.user_id != user_id:
61
+ raise HTTPException(status_code=403, detail="Access denied")
62
+
63
+ try:
64
+ await document_service.update_document_status(db, document_id, "processing")
65
+ chunks_count = await knowledge_processor.process_document(document, db)
66
+ await document_service.update_document_status(db, document_id, "completed")
67
+
68
+ logger.info(f"Processed document {document_id}: {chunks_count} chunks")
69
+ return {"document_id": document_id, "chunks_processed": chunks_count}
70
+
71
+ except Exception as e:
72
+ logger.error(f"Processing failed for document {document_id}", error=str(e))
73
+ await document_service.update_document_status(db, document_id, "failed", str(e))
74
+ raise HTTPException(status_code=500, detail=f"Processing failed: {str(e)}")
75
+
76
+ async def delete(self, document_id: str, user_id: str, db: AsyncSession) -> dict:
77
+ """Validate ownership → delete from blob and DB."""
78
+ document = await document_service.get_document(db, document_id)
79
+
80
+ if not document:
81
+ raise HTTPException(status_code=404, detail="Document not found")
82
+ if document.user_id != user_id:
83
+ raise HTTPException(status_code=403, detail="Access denied")
84
+
85
+ await document_service.delete_document(db, document_id)
86
+
87
+ if document.file_type in ("csv", "xlsx"):
88
+ await delete_document_parquets(user_id, document_id)
89
+
90
+ logger.info(f"Deleted document {document_id} for user {user_id}")
91
+ return {"document_id": document_id}
92
+
93
+
94
+ document_pipeline = DocumentPipeline()
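Calling-order sketch for the routes in `src/api/v1/document.py` (not shown in this hunk); `file` and `db` come from FastAPI in the real routes:

```python
from src.pipeline.document_pipeline.document_pipeline import document_pipeline


async def upload_then_process(file, user_id: str, db) -> dict:
    uploaded = await document_pipeline.upload(file, user_id, db)          # validate -> blob -> DB row
    return await document_pipeline.process(uploaded["id"], user_id, db)   # extract/profile -> embed
```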
src/query/__init__.py ADDED
File without changes
src/query/base.py ADDED
@@ -0,0 +1,32 @@
1
+ """Shared contract for query executors."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass, field
5
+
6
+ from sqlalchemy.ext.asyncio import AsyncSession
7
+
8
+ from src.rag.base import RetrievalResult
9
+
10
+
11
+ @dataclass
12
+ class QueryResult:
13
+ source_type: str # "database" or "document"
14
+ source_id: str # database_client_id or document_id
15
+ table_or_file: str
16
+ columns: list[str]
17
+ rows: list[dict]
18
+ row_count: int
19
+ metadata: dict = field(default_factory=dict)
20
+ # metadata should include "column_types": {"col_name": "dtype"} when available
21
+
22
+
23
+ class BaseExecutor(ABC):
24
+ @abstractmethod
25
+ async def execute(
26
+ self,
27
+ results: list[RetrievalResult],
28
+ user_id: str,
29
+ db: AsyncSession,
30
+ question: str,
31
+ limit: int = 100,
32
+ ) -> list[QueryResult]: ...
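A minimal stub that satisfies the executor contract, only to show the expected `QueryResult` shape (the real executors live under `src/query/executors/`):

```python
from sqlalchemy.ext.asyncio import AsyncSession

from src.query.base import BaseExecutor, QueryResult
from src.rag.base import RetrievalResult


class EchoExecutor(BaseExecutor):
    """Toy executor: returns the question itself as a one-row result."""

    async def execute(
        self,
        results: list[RetrievalResult],
        user_id: str,
        db: AsyncSession,
        question: str,
        limit: int = 100,
    ) -> list[QueryResult]:
        rows = [{"question": question}]
        return [
            QueryResult(
                source_type="document",
                source_id="example-doc-id",
                table_or_file="example.csv",
                columns=["question"],
                rows=rows,
                row_count=len(rows),
                metadata={"column_types": {"question": "object"}},
            )
        ]
```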
src/query/executors/__init__.py ADDED
File without changes
src/query/executors/db_executor.py ADDED
@@ -0,0 +1,648 @@
1
+ """Executor for registered database sources (source_type="database").
2
+
3
+ Flow per (client_id, question):
4
+ 1. Collect all relevant (table_name, column_name) pairs from retrieval results.
5
+ 2. Fetch the FULL schema for those tables from PGVector (not just top-k columns).
6
+ 3. Build a schema context string and send to LLM → structured SQLQuery output.
7
+ 4. Validate via sqlglot: SELECT-only, schema-grounded, LIMIT enforced.
8
+ 5. Execute on the user's DB via engine_scope + asyncio.to_thread.
9
+ 6. Return QueryResult per client_id (may span multiple tables via JOINs).
10
+
11
+ Supported db_types: postgres, supabase, mysql.
12
+ Other types are skipped with a warning — they do not raise.
13
+ """
14
+
15
+ import asyncio
16
+ from collections import defaultdict
17
+ from typing import Any
18
+
19
+ import sqlglot
20
+ import sqlglot.expressions as exp
21
+ import tiktoken
22
+ from langchain_core.prompts import ChatPromptTemplate
23
+ from langchain_openai import AzureChatOpenAI
24
+ from sqlalchemy import text
25
+ from sqlalchemy.ext.asyncio import AsyncSession
26
+
27
+ from src.config.settings import settings
28
+ from src.database_client.database_client_service import database_client_service
29
+ from src.db.postgres.connection import _pgvector_engine
30
+ from src.middlewares.logging import get_logger
31
+ from src.models.sql_query import SQLQuery
32
+ from src.pipeline.db_pipeline import db_pipeline_service
33
+ from src.query.base import BaseExecutor, QueryResult
34
+ from src.rag.base import RetrievalResult
35
+ from src.utils.db_credential_encryption import decrypt_credentials_dict
36
+
37
+ logger = get_logger("db_executor")
38
+
39
+ _enc = tiktoken.get_encoding("cl100k_base")
40
+
41
+ _SUPPORTED_DB_TYPES = {"postgres", "supabase", "mysql"}
42
+ _MAX_RETRIES = 3
43
+ _MAX_LIMIT = 500
44
+ _FK_EXPANSION_MAX_TABLES = 5
45
+
46
+ _SQL_SYSTEM_PROMPT = """\
47
+ You are a SQL data analyst working with a user's database.
48
+ Generate a single SQL SELECT statement that answers the user's question.
49
+
50
+ Database dialect: {dialect}
51
+
52
+ Rules:
53
+ - ONLY reference tables and columns listed in the schema below. Do not invent names.
54
+ - Always include a LIMIT clause (max {limit}).
55
+ - Do not use DELETE, UPDATE, INSERT, DROP, TRUNCATE, ALTER, CREATE, or any DDL.
56
+ - Prefer explicit JOINs over subqueries when combining tables.
57
+ - For aggregations, always alias the result column (e.g. COUNT(*) AS order_count).
58
+ - For date filtering, use dialect-appropriate functions ({dialect} syntax).
59
+
60
+ Schema:
61
+ {schema}
62
+
63
+ {error_section}"""
64
+
65
+
66
+ class DbExecutor(BaseExecutor):
67
+ def __init__(self) -> None:
68
+ self._llm = AzureChatOpenAI(
69
+ azure_deployment=settings.azureai_deployment_name_4o,
70
+ openai_api_version=settings.azureai_api_version_4o,
71
+ azure_endpoint=settings.azureai_endpoint_url_4o,
72
+ api_key=settings.azureai_api_key_4o,
73
+ temperature=0,
74
+ )
75
+ self._prompt = ChatPromptTemplate.from_messages([
76
+ ("system", _SQL_SYSTEM_PROMPT),
77
+ ("human", "{question}"),
78
+ ])
79
+ self._chain = self._prompt | self._llm.with_structured_output(SQLQuery)
80
+
81
+ # ------------------------------------------------------------------
82
+ # Public interface
83
+ # ------------------------------------------------------------------
84
+
85
+ async def execute(
86
+ self,
87
+ results: list[RetrievalResult],
88
+ user_id: str,
89
+ db: AsyncSession,
90
+ question: str,
91
+ limit: int = 100,
92
+ ) -> list[QueryResult]:
93
+ db_results = [r for r in results if r.source_type == "database"]
94
+ if not db_results:
95
+ return []
96
+
97
+ # Group by client_id — one SQL generation + execution pass per client
98
+ by_client: dict[str, list[RetrievalResult]] = defaultdict(list)
99
+ for r in db_results:
100
+ client_id = r.metadata.get("database_client_id", "")
101
+ if client_id:
102
+ by_client[client_id].append(r)
103
+ else:
104
+ logger.warning("db result missing database_client_id, skipping")
105
+
106
+ query_results: list[QueryResult] = []
107
+ for client_id, client_results in by_client.items():
108
+ try:
109
+ qr = await self._execute_for_client(client_id, client_results, user_id, db, question, limit)
110
+ if qr:
111
+ query_results.append(qr)
112
+ except Exception as e:
113
+ logger.error("db executor failed for client", client_id=client_id, error=str(e))
114
+
115
+ return query_results
116
+
117
+ # ------------------------------------------------------------------
118
+ # Per-client execution
119
+ # ------------------------------------------------------------------
120
+
121
+ async def _execute_for_client(
122
+ self,
123
+ client_id: str,
124
+ results: list[RetrievalResult],
125
+ user_id: str,
126
+ db: AsyncSession,
127
+ question: str,
128
+ limit: int,
129
+ ) -> QueryResult | None:
130
+ client = await database_client_service.get(db, client_id)
131
+ if not client:
132
+ logger.warning("database client not found", client_id=client_id)
133
+ return None
134
+ if client.user_id != user_id:
135
+ logger.warning("client ownership mismatch", client_id=client_id)
136
+ return None
137
+ if client.db_type not in _SUPPORTED_DB_TYPES:
138
+ logger.warning("unsupported db_type for query execution", db_type=client.db_type)
139
+ return None
140
+
141
+ # Hit tables = tables retrieval pointed at directly. Get full per-column
142
+ # schema for these. Related tables (one FK hop away, both directions) are
143
+ # fetched separately in abbreviated form to give the LLM enough context
144
+ # to JOIN without paying the per-column profile token cost.
145
+ hit_tables = list({
146
+ r.metadata.get("data", {}).get("table_name")
147
+ for r in results
148
+ if r.metadata.get("data", {}).get("table_name")
149
+ })
150
+ if not hit_tables:
151
+ logger.warning("no table_name on any retrieval result", client_id=client_id)
152
+ return None
153
+
154
+ full_schema = await self._fetch_full_schema(client_id, hit_tables, user_id)
155
+ if not full_schema:
156
+ logger.warning("no schema found in vector store", client_id=client_id, tables=hit_tables)
157
+ return None
158
+
159
+ related_tables = await self._find_related_tables(client_id, user_id, hit_tables)
160
+ related_schema = (
161
+ await self._fetch_abbreviated_schema(client_id, user_id, related_tables)
162
+ if related_tables else {}
163
+ )
164
+
165
+ schema_ctx = self._build_schema_context(full_schema, related_schema)
166
+ capped_limit = min(limit, _MAX_LIMIT)
167
+ dialect = client.db_type
168
+
169
+ # SQL generation with retry
170
+ validated_sql: str | None = None
171
+ prev_error: str = ""
172
+ prev_reasoning: str = ""
173
+ for attempt in range(_MAX_RETRIES):
174
+ if prev_error:
175
+ error_section = (
176
+ f"Previous attempt reasoning: {prev_reasoning}\n"
177
+ f"Previous attempt failed: {prev_error}\n"
178
+ "Fix the issue above."
179
+ )
180
+ else:
181
+ error_section = ""
182
+ try:
183
+ prompt_text = schema_ctx + error_section + question
184
+ input_tokens = len(_enc.encode(prompt_text))
185
+ logger.info("sql generation input tokens", attempt=attempt + 1, tokens=input_tokens)
186
+
187
+ result: SQLQuery = await self._chain.ainvoke({
188
+ "schema": schema_ctx,
189
+ "dialect": dialect,
190
+ "limit": capped_limit,
191
+ "error_section": error_section,
192
+ "question": question,
193
+ })
194
+ sql = result.sql.strip()
195
+ allowed_tables = set(full_schema) | set(related_schema)
196
+ column_map: dict[str, set[str]] = {
197
+ t: {c["name"] for c in cols} for t, cols in full_schema.items()
198
+ }
199
+ for t, info in related_schema.items():
200
+ column_map[t] = set(info.get("column_names") or [])
201
+ validation_error = self._validate(sql, allowed_tables, capped_limit, column_map)
202
+ if validation_error:
203
+ prev_error = validation_error
204
+ prev_reasoning = result.reasoning
205
+ logger.warning("sql validation failed", attempt=attempt + 1, error=validation_error)
206
+ continue
207
+ validated_sql = self._enforce_limit(sql, capped_limit)
208
+ output_tokens = len(_enc.encode(result.sql)) + len(_enc.encode(result.reasoning))
209
+ logger.info(
210
+ "sql generated",
211
+ attempt=attempt + 1,
212
+ input_tokens=input_tokens,
213
+ output_tokens=output_tokens,
214
+ total_tokens=input_tokens + output_tokens,
215
+ reasoning=result.reasoning,
216
+ )
217
+ break
218
+ except Exception as e:
219
+ prev_error = str(e)
220
+ logger.warning("sql generation error", attempt=attempt + 1, error=prev_error)
221
+
222
+ if not validated_sql:
223
+ logger.error("sql generation failed after retries", client_id=client_id)
224
+ return None
225
+
226
+ # Execute on user's DB
227
+ creds = decrypt_credentials_dict(client.credentials)
228
+ with db_pipeline_service.engine_scope(client.db_type, creds) as engine:
229
+ rows = await asyncio.to_thread(self._run_sql, engine, validated_sql)
230
+
231
+ column_types = {
232
+ col["name"]: col["type"]
233
+ for cols in full_schema.values()
234
+ for col in cols
235
+ }
236
+ columns = list(rows[0].keys()) if rows else []
237
+
238
+ return QueryResult(
239
+ source_type="database",
240
+ source_id=client_id,
241
+ table_or_file=", ".join(hit_tables),
242
+ columns=columns,
243
+ rows=rows,
244
+ row_count=len(rows),
245
+ metadata={
246
+ "db_type": client.db_type,
247
+ "client_name": client.name,
248
+ "sql": validated_sql,
249
+ "column_types": {c: column_types.get(c, "unknown") for c in columns},
250
+ },
251
+ )
252
+
253
+ # ------------------------------------------------------------------
254
+ # Schema helpers
255
+ # ------------------------------------------------------------------
256
+
257
+ async def _find_related_tables(
258
+ self,
259
+ client_id: str,
260
+ user_id: str,
261
+ hit_tables: list[str],
262
+ ) -> list[str]:
263
+ """One-hop FK neighbours of `hit_tables`, both directions, excluding hits.
264
+
265
+ Prefers chunk_level='table' rows; if none exist for the client (legacy
266
+ ingest predating Phase 1), falls back to aggregating from column-chunk
267
+ metadata. Returns [] when no FK metadata is available.
268
+
269
+ Capped at _FK_EXPANSION_MAX_TABLES, ranked by edge count desc then
270
+ table name asc. A warning is logged when the cap kicks in.
271
+ """
272
+ if not hit_tables:
273
+ return []
274
+
275
+ hit_set = set(hit_tables)
276
+ # edge_counts[related_table] = number of FK edges connecting it to the hit set
277
+ edge_counts: dict[str, int] = defaultdict(int)
278
+
279
+ # ---- Primary path: table-level chunks ----
280
+ sql = text("""
281
+ SELECT lpe.cmetadata
282
+ FROM langchain_pg_embedding lpe
283
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
284
+ WHERE lpc.name = 'document_embeddings'
285
+ AND lpe.cmetadata->>'user_id' = :user_id
286
+ AND lpe.cmetadata->>'source_type' = 'database'
287
+ AND lpe.cmetadata->>'database_client_id' = :client_id
288
+ AND lpe.cmetadata->>'chunk_level' = 'table'
289
+ """)
290
+ async with _pgvector_engine.connect() as conn:
291
+ result = await conn.execute(sql, {"user_id": user_id, "client_id": client_id})
292
+ table_rows = result.fetchall()
293
+
294
+ if table_rows:
295
+ for row in table_rows:
296
+ data = row.cmetadata.get("data", {})
297
+ table = data.get("table_name")
298
+ fks = data.get("foreign_keys") or []
299
+ if not table:
300
+ continue
301
+ if table in hit_set:
302
+ # Outgoing: this hit's FKs point at related tables
303
+ for fk in fks:
304
+ target = fk.get("target_table")
305
+ if target and target not in hit_set:
306
+ edge_counts[target] += 1
307
+ else:
308
+ # Incoming: this non-hit table's FKs point into the hit set
309
+ for fk in fks:
310
+ target = fk.get("target_table")
311
+ if target in hit_set:
312
+ edge_counts[table] += 1
313
+ else:
314
+ # ---- Fallback: aggregate from column chunks ----
315
+ sql = text("""
316
+ SELECT lpe.cmetadata->'data'->>'table_name' AS src_table,
317
+ lpe.cmetadata->'data'->>'foreign_key' AS fk
318
+ FROM langchain_pg_embedding lpe
319
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
320
+ WHERE lpc.name = 'document_embeddings'
321
+ AND lpe.cmetadata->>'user_id' = :user_id
322
+ AND lpe.cmetadata->>'source_type' = 'database'
323
+ AND lpe.cmetadata->>'database_client_id' = :client_id
324
+ AND lpe.cmetadata->>'chunk_level' = 'column'
325
+ AND lpe.cmetadata->'data'->>'foreign_key' IS NOT NULL
326
+ """)
327
+ async with _pgvector_engine.connect() as conn:
328
+ result = await conn.execute(sql, {"user_id": user_id, "client_id": client_id})
329
+ col_rows = result.fetchall()
330
+
331
+ for row in col_rows:
332
+ src = row.src_table
333
+ fk = row.fk
334
+ if not src or not fk:
335
+ continue
336
+ target = fk.split(".", 1)[0]
337
+ if src in hit_set and target and target not in hit_set:
338
+ edge_counts[target] += 1
339
+ elif src not in hit_set and target in hit_set:
340
+ edge_counts[src] += 1
341
+
342
+ if not edge_counts:
343
+ return []
344
+
345
+ ranked = sorted(edge_counts.items(), key=lambda kv: (-kv[1], kv[0]))
346
+ if len(ranked) > _FK_EXPANSION_MAX_TABLES:
347
+ logger.warning(
348
+ "fk expansion cap hit",
349
+ client_id=client_id,
350
+ total=len(ranked),
351
+ cap=_FK_EXPANSION_MAX_TABLES,
352
+ dropped=[t for t, _ in ranked[_FK_EXPANSION_MAX_TABLES:]],
353
+ )
354
+ ranked = ranked[:_FK_EXPANSION_MAX_TABLES]
355
+
356
+ related = [t for t, _ in ranked]
357
+ logger.info("fk-related tables", hit=sorted(hit_set), related=related)
358
+ return related
359
+
360
+ async def _fetch_abbreviated_schema(
361
+ self,
362
+ client_id: str,
363
+ user_id: str,
364
+ table_names: list[str],
365
+ ) -> dict[str, dict[str, Any]]:
366
+ """Abbreviated schema: name, row_count, PK, FKs, column names — no profiles.
367
+
368
+ Prefers chunk_level='table' rows. Falls back to aggregating column-chunk
369
+ metadata when table chunks are missing for a given table_name.
370
+
371
+ Returns {table_name: {"row_count": int|None, "primary_key": [str],
372
+ "foreign_keys": [{column, target_table, target_column}],
373
+ "column_names": [str]}}.
374
+ """
375
+ if not table_names:
376
+ return {}
377
+
378
+ placeholders = ", ".join(f":t{i}" for i in range(len(table_names)))
379
+ params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
380
+ for i, name in enumerate(table_names):
381
+ params[f"t{i}"] = name
382
+
383
+ # Primary path: one row per table from chunk_level='table'
384
+ sql_table = text(f"""
385
+ SELECT lpe.cmetadata
386
+ FROM langchain_pg_embedding lpe
387
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
388
+ WHERE lpc.name = 'document_embeddings'
389
+ AND lpe.cmetadata->>'user_id' = :user_id
390
+ AND lpe.cmetadata->>'source_type' = 'database'
391
+ AND lpe.cmetadata->>'database_client_id' = :client_id
392
+ AND lpe.cmetadata->>'chunk_level' = 'table'
393
+ AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
394
+ """)
395
+ async with _pgvector_engine.connect() as conn:
396
+ result = await conn.execute(sql_table, params)
397
+ t_rows = result.fetchall()
398
+
399
+ out: dict[str, dict[str, Any]] = {}
400
+ for row in t_rows:
401
+ data = row.cmetadata.get("data", {})
402
+ tname = data.get("table_name")
403
+ if not tname:
404
+ continue
405
+ out[tname] = {
406
+ "row_count": data.get("row_count"),
407
+ "primary_key": list(data.get("primary_key") or []),
408
+ "foreign_keys": list(data.get("foreign_keys") or []),
409
+ "column_names": list(data.get("column_names") or []),
410
+ }
411
+
412
+ # Fallback for tables with no table-chunk: aggregate column chunks
413
+ missing = [t for t in table_names if t not in out]
414
+ if missing:
415
+ placeholders_m = ", ".join(f":m{i}" for i in range(len(missing)))
416
+ params_m: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
417
+ for i, name in enumerate(missing):
418
+ params_m[f"m{i}"] = name
419
+ sql_col = text(f"""
420
+ SELECT lpe.cmetadata
421
+ FROM langchain_pg_embedding lpe
422
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
423
+ WHERE lpc.name = 'document_embeddings'
424
+ AND lpe.cmetadata->>'user_id' = :user_id
425
+ AND lpe.cmetadata->>'source_type' = 'database'
426
+ AND lpe.cmetadata->>'database_client_id' = :client_id
427
+ AND lpe.cmetadata->>'chunk_level' = 'column'
428
+ AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders_m})
429
+ ORDER BY lpe.cmetadata->'data'->>'table_name', lpe.cmetadata->'data'->>'column_name'
430
+ """)
431
+ async with _pgvector_engine.connect() as conn:
432
+ result = await conn.execute(sql_col, params_m)
433
+ c_rows = result.fetchall()
434
+
435
+ agg: dict[str, dict[str, Any]] = {
436
+ t: {"row_count": None, "primary_key": [], "foreign_keys": [], "column_names": []}
437
+ for t in missing
438
+ }
439
+ for row in c_rows:
440
+ data = row.cmetadata.get("data", {})
441
+ tname = data.get("table_name")
442
+ cname = data.get("column_name")
443
+ if not tname or tname not in agg or not cname:
444
+ continue
445
+ bucket = agg[tname]
446
+ bucket["column_names"].append(cname)
447
+ if data.get("is_primary_key"):
448
+ bucket["primary_key"].append(cname)
449
+ fk = data.get("foreign_key")
450
+ if fk:
451
+ target_table, _, target_col = fk.partition(".")
452
+ bucket["foreign_keys"].append({
453
+ "column": cname,
454
+ "target_table": target_table,
455
+ "target_column": target_col,
456
+ })
457
+ for t, v in agg.items():
458
+ if v["column_names"]:
459
+ out[t] = v
460
+
461
+ return out
462
+
463
+ async def _fetch_full_schema(
464
+ self,
465
+ client_id: str,
466
+ table_names: list[str],
467
+ user_id: str,
468
+ ) -> dict[str, list[dict[str, Any]]]:
469
+ """Fetch ALL column chunks for the given tables from PGVector.
470
+
471
+ Returns {table_name: [{"name": ..., "type": ..., "is_primary_key": ...,
472
+ "foreign_key": ..., "content": ...}]}
473
+ """
474
+ placeholders = ", ".join(f":t{i}" for i in range(len(table_names)))
475
+ sql = text(f"""
476
+ SELECT lpe.cmetadata, lpe.document
477
+ FROM langchain_pg_embedding lpe
478
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
479
+ WHERE lpc.name = 'document_embeddings'
480
+ AND lpe.cmetadata->>'user_id' = :user_id
481
+ AND lpe.cmetadata->>'source_type' = 'database'
482
+ AND lpe.cmetadata->>'chunk_level' = 'column'
483
+ AND lpe.cmetadata->>'database_client_id' = :client_id
484
+ AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
485
+ ORDER BY lpe.cmetadata->'data'->>'table_name', lpe.cmetadata->'data'->>'column_name'
486
+ """)
487
+
488
+ params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
489
+ for i, name in enumerate(table_names):
490
+ params[f"t{i}"] = name
491
+
492
+ async with _pgvector_engine.connect() as conn:
493
+ result = await conn.execute(sql, params)
494
+ rows = result.fetchall()
495
+
496
+ schema: dict[str, list[dict[str, Any]]] = defaultdict(list)
497
+ for row in rows:
498
+ data = row.cmetadata.get("data", {})
499
+ table = data.get("table_name")
500
+ if table:
501
+ schema[table].append({
502
+ "name": data.get("column_name", ""),
503
+ "type": data.get("column_type", ""),
504
+ "is_primary_key": data.get("is_primary_key", False),
505
+ "foreign_key": data.get("foreign_key"),
506
+ "content": row.document, # chunk text includes top values / samples
507
+ })
508
+ return dict(schema)
509
+
510
+ def _build_schema_context(
511
+ self,
512
+ schema: dict[str, list[dict[str, Any]]],
513
+ related_schema: dict[str, dict[str, Any]] | None = None,
514
+ ) -> str:
515
+ lines: list[str] = []
516
+ for table, columns in schema.items():
517
+ lines.append(f"Table: {table}")
518
+ for col in columns:
519
+ flags = []
520
+ if col["is_primary_key"]:
521
+ flags.append("PRIMARY KEY")
522
+ if col["foreign_key"]:
523
+ flags.append(f"FK -> {col['foreign_key']}")
524
+ flag_str = f" [{', '.join(flags)}]" if flags else ""
525
+ lines.append(f" - {col['name']} {col['type']}{flag_str}")
526
+ # Include sample/top-values line from chunk content if present
527
+ for line in col["content"].splitlines():
528
+ if line.startswith(("Top values:", "Sample values:")):
529
+ lines.append(f" {line}")
530
+ break
531
+ lines.append("")
532
+
533
+ related_block = self._build_related_schema_block(related_schema or {})
534
+ if related_block:
535
+ lines.append(related_block)
536
+
537
+ return "\n".join(lines).strip()
538
+
539
+ def _build_related_schema_block(self, related_schema: dict[str, dict[str, Any]]) -> str:
540
+ """Format the abbreviated FK-related-tables section. Empty string when no related."""
541
+ if not related_schema:
542
+ return ""
543
+ lines: list[str] = ["Related tables (one hop via FK, abbreviated — use for JOINs only):"]
544
+ for table, info in related_schema.items():
545
+ row_count = info.get("row_count")
546
+ header = f"- {table} ({row_count} rows)" if row_count is not None else f"- {table}"
547
+ lines.append(header)
548
+ pk = info.get("primary_key") or []
549
+ lines.append(f" Primary key: {', '.join(pk) if pk else '(none)'}")
550
+ fks = info.get("foreign_keys") or []
551
+ if fks:
552
+ fk_strs = [
553
+ f"{fk.get('column')} -> {fk.get('target_table')}.{fk.get('target_column')}"
554
+ for fk in fks
555
+ ]
556
+ lines.append(f" Foreign keys: {', '.join(fk_strs)}")
557
+ else:
558
+ lines.append(" Foreign keys: (none)")
559
+ cols = info.get("column_names") or []
560
+ lines.append(f" Columns: {', '.join(cols)}")
561
+ return "\n".join(lines)
562
+
563
+ # ------------------------------------------------------------------
564
+ # Guardrails
565
+ # ------------------------------------------------------------------
566
+
567
+ def _validate(
568
+ self,
569
+ sql: str,
570
+ allowed_tables: set[str],
571
+ limit: int,
572
+ column_map: dict[str, set[str]] | None = None,
573
+ ) -> str:
574
+ """Return an error string if validation fails, empty string if OK.
575
+
576
+ `allowed_tables` is the union of hit-table names and FK-related table
577
+ names — both are legal targets for SELECT/JOIN.
578
+
579
+ `column_map` maps table_name → set of valid column names. When provided,
580
+ any qualified table.column reference not found in the map triggers a retry
581
+ with an informative error so the LLM can self-correct without hallucinating.
582
+ """
583
+ # Layer 1: sqlglot parse + SELECT-only check
584
+ try:
585
+ parsed = sqlglot.parse_one(sql)
586
+ except sqlglot.errors.ParseError as e:
587
+ return f"SQL parse error: {e}"
588
+
589
+ if not isinstance(parsed, exp.Select):
590
+ return f"Only SELECT statements are allowed. Got: {type(parsed).__name__}"
591
+
592
+ # Check for DML anywhere in the AST (including writeable CTEs)
593
+ for node in parsed.find_all((exp.Insert, exp.Update, exp.Delete)):
594
+ return f"DML ({type(node).__name__}) is not allowed."
595
+
596
+ # Layer 2: schema grounding — table names
597
+ known_tables = {t.lower() for t in allowed_tables}
598
+ alias_to_table: dict[str, str] = {}
599
+ for tbl in parsed.find_all(exp.Table):
600
+ name = tbl.name.lower()
601
+ if name and name not in known_tables:
602
+ return f"Unknown table '{tbl.name}'. Only use tables from the schema."
603
+ alias = (tbl.alias or tbl.name).lower()
604
+ alias_to_table[alias] = name
605
+
606
+ # Layer 3: column grounding — qualified references only (table.column)
607
+ if column_map:
608
+ normalized_map = {t.lower(): {c.lower() for c in cols} for t, cols in column_map.items()}
609
+ for col_node in parsed.find_all(exp.Column):
610
+ tbl_ref = col_node.table
611
+ if not tbl_ref:
612
+ continue # unqualified — skip, can't resolve without full alias tracking
613
+ tbl_name = alias_to_table.get(tbl_ref.lower(), tbl_ref.lower())
614
+ col_name = col_node.name.lower()
615
+ if tbl_name in normalized_map and col_name not in normalized_map[tbl_name]:
616
+ available = ", ".join(sorted(normalized_map[tbl_name]))
617
+ return (
618
+ f"Column '{col_node.name}' does not exist on table '{tbl_name}'. "
619
+ f"Available columns: {available}."
620
+ )
621
+
622
+ # Layer 4: LIMIT enforcement (inject if missing — done before execution)
623
+ return ""
624
+
625
+ # ------------------------------------------------------------------
626
+ # SQL execution
627
+ # ------------------------------------------------------------------
628
+
629
+ def _enforce_limit(self, sql: str, limit: int) -> str:
630
+ """Inject or cap LIMIT using sqlglot AST manipulation."""
631
+ parsed = sqlglot.parse_one(sql)
632
+ existing = parsed.find(exp.Limit)
633
+ if existing:
634
+ current = int(existing.expression.this)
635
+ if current > limit:
636
+ return parsed.limit(limit).sql()
637
+ else:
638
+ return parsed.limit(limit).sql()
639
+ return parsed.sql()
640
+
641
+ def _run_sql(self, engine: Any, sql: str) -> list[dict]:
642
+ # Ensure the user DB connection is a read-only credential — sqlglot validation alone is not sufficient.
643
+ with engine.connect() as conn:
644
+ result = conn.execute(text(sql))
645
+ return [dict(row) for row in result.mappings()]
646
+
647
+
648
+ db_executor = DbExecutor()
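Reviewer note: to make the guardrail layers easier to check in isolation, here is a runnable sketch of the same sqlglot pattern (SELECT-only parse, table grounding, LIMIT capping) outside the class. The `check_select_only`/`cap_limit` helper names and the `orders` table are illustrative, not part of the PR.

```python
"""Standalone sketch of the sqlglot guardrail pattern used by DbExecutor (illustrative names)."""
import sqlglot
import sqlglot.expressions as exp


def check_select_only(sql: str, allowed_tables: set[str]) -> str:
    """Return an error message, or "" when the statement passes."""
    try:
        parsed = sqlglot.parse_one(sql)
    except sqlglot.errors.ParseError as e:
        return f"SQL parse error: {e}"
    if not isinstance(parsed, exp.Select):
        return f"Only SELECT statements are allowed. Got: {type(parsed).__name__}"
    for tbl in parsed.find_all(exp.Table):
        if tbl.name.lower() not in allowed_tables:
            return f"Unknown table '{tbl.name}'."
    return ""


def cap_limit(sql: str, limit: int) -> str:
    """Inject or tighten the LIMIT clause via the AST, mirroring _enforce_limit."""
    parsed = sqlglot.parse_one(sql)
    existing = parsed.find(exp.Limit)
    if existing is None or int(existing.expression.this) > limit:
        return parsed.limit(limit).sql()
    return parsed.sql()


print(check_select_only("SELECT id FROM orders", {"orders"}))  # passes -> ""
print(check_select_only("DELETE FROM orders", {"orders"}))     # rejected: not a SELECT
print(cap_limit("SELECT id FROM orders", 100))                  # LIMIT 100 injected
```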
src/query/executors/tabular.py ADDED
@@ -0,0 +1,287 @@
1
+ """Executor for tabular document sources (source_type="document", file_type csv/xlsx).
2
+
3
+ Flow:
4
+ 1. Group RetrievalResult chunks by (document_id, sheet_name).
5
+ 2. Per group: download Parquet from Azure Blob → pandas DataFrame.
6
+ 3. Build schema context from DataFrame columns + sample values.
7
+ 4. LLM decides operation (groupby_sum, filter, top_n, etc.) via structured output.
8
+ 5. Pandas runs the operation; retry up to 3x on error with feedback to LLM.
9
+ 6. Fall back to raw rows if all retries fail.
10
+ 7. Return QueryResult per group.
11
+ """
12
+ import asyncio
13
+ from typing import Literal, TypedDict
14
+
15
+ import pandas as pd
16
+ from langchain_core.prompts import ChatPromptTemplate
17
+ from langchain_openai import AzureChatOpenAI
18
+ from pydantic import BaseModel
19
+ from sqlalchemy.ext.asyncio import AsyncSession
20
+
21
+ from src.config.settings import settings
22
+ from src.knowledge.parquet_service import download_parquet
23
+ from src.middlewares.logging import get_logger
24
+ from src.query.base import BaseExecutor, QueryResult
25
+ from src.rag.base import RetrievalResult
26
+
27
+ logger = get_logger("tabular_executor")
28
+
29
+
30
+ class _GroupInfo(TypedDict):
31
+ filename: str
32
+ file_type: str
33
+
34
+
35
+ _TABULAR_FILE_TYPES = ("csv", "xlsx")
36
+ _MAX_RETRIES = 3
37
+
38
+ _SYSTEM_PROMPT = """\
39
+ You are a data analyst. Given a DataFrame schema and a user question, \
40
+ decide which pandas operation to perform.
41
+
42
+ IMPORTANT rules:
43
+ - Use ONLY the exact column names as written in the schema below. Never translate or rename them.
44
+ - For top_n: always set value_col to the column to sort by. Do NOT use sort_col for top_n.
45
+ - For sort: use sort_col for the column to sort by.
46
+ - For filter with comparison (>, <, >=, <=, !=): set filter_operator accordingly (gt, lt, gte, lte, ne). Default is eq (==).
47
+ - For multi-condition filters (AND logic), use the filters field as a list of {{"col", "value", "op"}} dicts instead of filter_col/filter_value.
48
+ Example: status=SUCCESS AND amount_paid>200000 → filters=[{{"col":"status","value":"SUCCESS","op":"eq"}},{{"col":"amount_paid","value":"200000","op":"gt"}}]
49
+ - For OR conditions on a column (e.g. value is A or B), use or_filters. Combine with filters for mixed AND+OR logic.
50
+ Example: (status=FAILED OR status=REVERSED) AND payment_channel=X → or_filters=[{{"col":"status","value":"FAILED","op":"eq"}},{{"col":"status","value":"REVERSED","op":"eq"}}], filters=[{{"col":"payment_channel","value":"X","op":"eq"}}]
51
+ - For groupby with a pre-filter (e.g. count SUCCESS per channel): use filters or or_filters to narrow rows first, then use groupby_count/groupby_sum/groupby_avg on the filtered data by setting both filters and group_col.
52
+
53
+ Schema:
54
+ {schema}
55
+
56
+ {error_section}"""
57
+
58
+
59
+ class TabularOperation(BaseModel):
60
+ operation: Literal[
61
+ "filter", "groupby_sum", "groupby_avg", "groupby_count",
62
+ "top_n", "sort", "aggregate", "raw"
63
+ ]
64
+ group_col: str | None = None # for groupby_*
65
+ value_col: str | None = None # for groupby_*, top_n, aggregate
66
+ filter_col: str | None = None # for single filter
67
+ filter_value: str | None = None # for single filter
68
+ filter_operator: Literal["eq", "ne", "gt", "gte", "lt", "lte"] = "eq" # for single filter
69
+ filters: list[dict] | None = None # for multi-condition AND: [{"col": ..., "value": ..., "op": ...}]
70
+ or_filters: list[dict] | None = None # for OR conditions, applied before AND filters
71
+ sort_col: str | None = None # for sort
72
+ ascending: bool = True # for sort
73
+ n: int | None = None # for top_n
74
+ agg_func: Literal["sum", "avg", "min", "max", "count"] | None = None # for aggregate
75
+ reasoning: str
76
+
77
+
78
+ def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.Series:
79
+ numeric = pd.to_numeric(df[col], errors="coerce")
80
+ if operator == "eq":
81
+ return df[col].astype(str) == str(value)
82
+ elif operator == "ne":
83
+ return df[col].astype(str) != str(value)
84
+ elif operator == "gt":
85
+ return numeric > float(value)
86
+ elif operator == "gte":
87
+ return numeric >= float(value)
88
+ elif operator == "lt":
89
+ return numeric < float(value)
90
+ elif operator == "lte":
91
+ return numeric <= float(value)
92
+ raise ValueError(f"Unknown operator: {operator}")
93
+
94
+
95
+ def _apply_single_filter(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.DataFrame:
96
+ return df[_get_filter_mask(df, col, value, operator)]
97
+
98
+
99
+ def _build_schema_context(df: pd.DataFrame) -> str:
100
+ lines = []
101
+ for col in df.columns:
102
+ sample = df[col].dropna().head(3).tolist()
103
+ lines.append(f"- {col} ({df[col].dtype}): sample values: {sample}")
104
+ return "\n".join(lines)
105
+
106
+
107
+ def _apply_operation(df: pd.DataFrame, op: TabularOperation, limit: int) -> pd.DataFrame:
108
+ if op.operation == "groupby_sum":
109
+ if not op.group_col or not op.value_col:
110
+ raise ValueError(f"groupby_sum requires group_col and value_col, got {op}")
111
+ return df.groupby(op.group_col)[op.value_col].sum().reset_index().nlargest(limit, op.value_col)
112
+ elif op.operation == "groupby_avg":
113
+ if not op.group_col or not op.value_col:
114
+ raise ValueError(f"groupby_avg requires group_col and value_col, got {op}")
115
+ return df.groupby(op.group_col)[op.value_col].mean().reset_index().nlargest(limit, op.value_col)
116
+ elif op.operation == "groupby_count":
117
+ if not op.group_col:
118
+ raise ValueError(f"groupby_count requires group_col, got {op}")
119
+ df_filtered = df.copy()
120
+ if op.or_filters:
121
+ or_mask = pd.Series([False] * len(df_filtered), index=df_filtered.index)
122
+ for f in op.or_filters:
123
+ or_mask = or_mask | _get_filter_mask(df_filtered, f["col"], f["value"], f.get("op", "eq"))
124
+ df_filtered = df_filtered[or_mask]
125
+ if op.filters:
126
+ for f in op.filters:
127
+ df_filtered = _apply_single_filter(df_filtered, f["col"], f["value"], f.get("op", "eq"))
128
+ elif op.filter_col and op.filter_value is not None:
129
+ df_filtered = _apply_single_filter(df_filtered, op.filter_col, op.filter_value, op.filter_operator)
130
+ return df_filtered.groupby(op.group_col).size().reset_index(name="count").nlargest(limit, "count")
131
+ elif op.operation == "filter":
132
+ result = df.copy()
133
+ if op.or_filters:
134
+ or_mask = pd.Series([False] * len(result), index=result.index)
135
+ for f in op.or_filters:
136
+ or_mask = or_mask | _get_filter_mask(result, f["col"], f["value"], f.get("op", "eq"))
137
+ result = result[or_mask]
138
+ if op.filters:
139
+ for f in op.filters:
140
+ result = _apply_single_filter(result, f["col"], f["value"], f.get("op", "eq"))
141
+ elif op.filter_col and op.filter_value is not None and not op.or_filters:
142
+ result = _apply_single_filter(result, op.filter_col, op.filter_value, op.filter_operator)
143
+ elif not op.or_filters and not op.filters and (not op.filter_col or op.filter_value is None):
144
+ raise ValueError(f"filter requires filter_col/filter_value or filters or or_filters, got {op}")
145
+ return result.head(limit)
146
+ elif op.operation == "top_n":
147
+ col = op.value_col
148
+ if not col:
149
+ raise ValueError(f"top_n requires value_col, got {op}")
150
+ n = op.n or limit
151
+ return df.nlargest(n, col)
152
+ elif op.operation == "sort":
153
+ if not op.sort_col:
154
+ raise ValueError(f"sort requires sort_col, got {op}")
155
+ return df.sort_values(op.sort_col, ascending=op.ascending).head(limit)
156
+ elif op.operation == "aggregate":
157
+ if not op.value_col or not op.agg_func:
158
+ raise ValueError(f"aggregate requires value_col and agg_func, got {op}")
159
+ funcs = {"sum": "sum", "avg": "mean", "min": "min", "max": "max", "count": "count"}
160
+ value = getattr(df[op.value_col], funcs[op.agg_func])()
161
+ return pd.DataFrame([{op.value_col: value, "operation": op.agg_func}])
162
+ else: # "raw"
163
+ return df.head(limit)
164
+
165
+
166
+ class TabularExecutor(BaseExecutor):
167
+ def __init__(self) -> None:
168
+ self._llm = AzureChatOpenAI(
169
+ azure_deployment=settings.azureai_deployment_name_4o,
170
+ openai_api_version=settings.azureai_api_version_4o,
171
+ azure_endpoint=settings.azureai_endpoint_url_4o,
172
+ api_key=settings.azureai_api_key_4o,
173
+ temperature=0,
174
+ )
175
+ self._prompt = ChatPromptTemplate.from_messages([
176
+ ("system", _SYSTEM_PROMPT),
177
+ ("human", "{question}"),
178
+ ])
179
+ self._chain = self._prompt | self._llm.with_structured_output(TabularOperation)
180
+
181
+ async def execute(
182
+ self,
183
+ results: list[RetrievalResult],
184
+ user_id: str,
185
+ _db: AsyncSession,
186
+ question: str,
187
+ limit: int = 100,
188
+ ) -> list[QueryResult]:
189
+ tabular = [
190
+ r for r in results
191
+ if r.source_type == "document"
192
+ and r.metadata.get("data", {}).get("file_type") in _TABULAR_FILE_TYPES
193
+ ]
194
+
195
+ if not tabular:
196
+ return []
197
+
198
+ # Group by (document_id, sheet_name) — one parquet download per group
199
+ groups: dict[tuple[str, str | None], _GroupInfo] = {}
200
+ for r in tabular:
201
+ data = r.metadata.get("data", {})
202
+ doc_id = data.get("document_id")
203
+ if not doc_id:
204
+ continue
205
+ sheet_name = data.get("sheet_name") # None for CSV
206
+ key = (doc_id, sheet_name)
207
+ if key not in groups:
208
+ groups[key] = {
209
+ "filename": data.get("filename", ""),
210
+ "file_type": data.get("file_type", ""),
211
+ }
212
+
213
+ async def _process_group(
214
+ doc_id: str, sheet_name: str | None, info: _GroupInfo
215
+ ) -> QueryResult | None:
216
+ try:
217
+ df = await download_parquet(user_id, doc_id, sheet_name)
218
+ df_result = await self._query_with_agent(df, question, limit)
219
+
220
+ table_label = info["filename"]
221
+ if sheet_name:
222
+ table_label += f" / sheet: {sheet_name}"
223
+
224
+ logger.info(
225
+ "tabular query complete",
226
+ document_id=doc_id,
227
+ sheet=sheet_name,
228
+ file_type=info["file_type"],
229
+ rows=len(df_result),
230
+ columns=len(df_result.columns),
231
+ )
232
+ return QueryResult(
233
+ source_type="document",
234
+ source_id=doc_id,
235
+ table_or_file=table_label,
236
+ columns=list(df_result.columns),
237
+ rows=df_result.to_dict(orient="records"),
238
+ row_count=len(df_result),
239
+ )
240
+ except Exception as e:
241
+ logger.error(
242
+ "tabular query failed",
243
+ document_id=doc_id,
244
+ sheet=sheet_name,
245
+ error=str(e),
246
+ )
247
+ return None
248
+
249
+ gathered = await asyncio.gather(*[
250
+ _process_group(doc_id, sheet_name, info)
251
+ for (doc_id, sheet_name), info in groups.items()
252
+ ])
253
+ return [r for r in gathered if r is not None]
254
+
255
+ async def _query_with_agent(
256
+ self, df: pd.DataFrame, question: str, limit: int
257
+ ) -> pd.DataFrame:
258
+ schema_ctx = _build_schema_context(df)
259
+ prev_error = ""
260
+
261
+ for attempt in range(_MAX_RETRIES):
262
+ error_section = (
263
+ f"Previous attempt failed: {prev_error}\nFix the issue."
264
+ if prev_error else ""
265
+ )
266
+ try:
267
+ op: TabularOperation = await self._chain.ainvoke({
268
+ "schema": schema_ctx,
269
+ "error_section": error_section,
270
+ "question": question,
271
+ })
272
+ logger.info(
273
+ "tabular operation decided",
274
+ operation=op.operation,
275
+ reasoning=op.reasoning,
276
+ )
277
+ return _apply_operation(df, op, limit)
278
+ except Exception as e:
279
+ prev_error = str(e)
280
+ logger.warning("tabular agent error", attempt=attempt + 1, error=prev_error)
281
+
282
+ # Fallback: return raw rows
283
+ logger.warning("tabular agent failed after retries, returning raw rows")
284
+ return df.head(limit)
285
+
286
+
287
+ tabular_executor = TabularExecutor()
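Reviewer note: the structured-output rules above are easier to follow with a concrete case. A small sketch of what the filters-plus-groupby_count branch of `_apply_operation` computes, written as plain pandas; the sample data is invented.

```python
"""Illustration of the prefilter + groupby_count path TabularExecutor asks the LLM for."""
import pandas as pd

df = pd.DataFrame({
    "status": ["SUCCESS", "SUCCESS", "FAILED", "REVERSED", "SUCCESS"],
    "payment_channel": ["card", "bank", "card", "card", "bank"],
    "amount_paid": [250_000, 120_000, 80_000, 300_000, 410_000],
})

# Equivalent of filters=[{"col": "status", "value": "SUCCESS", "op": "eq"}]
# followed by groupby_count on payment_channel, capped by the limit:
mask = df["status"].astype(str) == "SUCCESS"
result = (
    df[mask]
    .groupby("payment_channel")
    .size()
    .reset_index(name="count")
    .nlargest(10, "count")
)
print(result)  # bank: 2, card: 1
```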
src/query/query_executor.py ADDED
@@ -0,0 +1,42 @@
1
+ """QueryExecutor — dispatches retrieval results to the appropriate executor by source_type."""
2
+
3
+ import asyncio
4
+
5
+ from sqlalchemy.ext.asyncio import AsyncSession
6
+
7
+ from src.middlewares.logging import get_logger
8
+ from src.query.base import QueryResult
9
+ from src.query.executors.db_executor import db_executor
10
+ from src.query.executors.tabular import tabular_executor
11
+ from src.rag.base import RetrievalResult
12
+
13
+ logger = get_logger("query_executor")
14
+
15
+
16
+ class QueryExecutor:
17
+ async def execute(
18
+ self,
19
+ results: list[RetrievalResult],
20
+ user_id: str,
21
+ db: AsyncSession,
22
+ question: str,
23
+ limit: int = 100,
24
+ ) -> list[QueryResult]:
25
+ batches = await asyncio.gather(
26
+ db_executor.execute(results, user_id, db, question, limit),
27
+ tabular_executor.execute(results, user_id, db, question, limit),
28
+ return_exceptions=True,
29
+ )
30
+
31
+ query_results: list[QueryResult] = []
32
+ for batch in batches:
33
+ if isinstance(batch, Exception):
34
+ logger.error("executor failed", error=str(batch))
35
+ continue
36
+ query_results.extend(batch)
37
+
38
+ logger.info("query execution complete", total=len(query_results))
39
+ return query_results
40
+
41
+
42
+ query_executor = QueryExecutor()
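Reviewer note: a sketch of the call shape this dispatcher expects. The `answer` helper is illustrative; the real wiring lives in the chat flow, this only shows how retrieval feeds the executor.

```python
"""Hypothetical call site for the dispatcher (illustrative helper name)."""
from sqlalchemy.ext.asyncio import AsyncSession

from src.query.base import QueryResult
from src.query.query_executor import query_executor
from src.rag.retriever import RetrieverService


async def answer(question: str, user_id: str, db: AsyncSession) -> list[QueryResult]:
    retriever = RetrieverService()
    hits = await retriever.retrieve(question, user_id, db, k=5)
    # Both executors run concurrently; one failing does not drop the other's results.
    return await query_executor.execute(hits, user_id, db, question, limit=100)
```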
src/rag/base.py ADDED
@@ -0,0 +1,20 @@
1
+ """Shared contract for all retriever implementations."""
2
+
3
+ from abc import ABC, abstractmethod
4
+ from dataclasses import dataclass
5
+ from typing import Any
6
+
7
+
8
+ @dataclass
9
+ class RetrievalResult:
10
+ content: str
11
+ metadata: dict[str, Any]
12
+ score: float
13
+ source_type: str # "document" | "database"
14
+
15
+
16
+ class BaseRetriever(ABC):
17
+ @abstractmethod
18
+ async def retrieve(
19
+ self, query: str, user_id: str, k: int = 5
20
+ ) -> list[RetrievalResult]: ...
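Reviewer note: a sketch of the metadata shape a database-schema hit carries when the executors consume it, matching the cmetadata fields queried elsewhere in this PR (user_id, source_type, database_client_id, chunk_level, data.table_name). All field values below are invented.

```python
"""Shape of a database-schema RetrievalResult as the executors read it (values invented)."""
from src.rag.base import RetrievalResult

hit = RetrievalResult(
    content="Column orders.total_amount: numeric. Top values: 120000, 54000",
    metadata={
        "user_id": "user-123",
        "source_type": "database",
        "database_client_id": "client-abc",
        "chunk_level": "column",
        "data": {"table_name": "orders", "column_name": "total_amount"},
    },
    score=0.83,
    source_type="database",
)
```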
src/rag/retriever.py CHANGED
@@ -1,69 +1,45 @@
1
- """Service for retrieving relevant documents from vector store."""
2
 
3
- import hashlib
4
- import json
5
- from src.db.postgres.vector_store import get_vector_store
6
- from src.db.redis.connection import get_redis
7
  from sqlalchemy.ext.asyncio import AsyncSession
 
8
  from src.middlewares.logging import get_logger
9
- from typing import List, Dict, Any
 
 
 
10
 
11
  logger = get_logger("retriever")
12
 
13
- _RETRIEVAL_CACHE_TTL = 3600 # 1 hour
14
-
15
 
16
  class RetrieverService:
17
- """Service for retrieving relevant documents."""
 
 
 
 
 
 
 
 
18
 
19
  def __init__(self):
20
- self.vector_store = get_vector_store()
 
 
 
21
 
22
  async def retrieve(
23
  self,
24
  query: str,
25
  user_id: str,
26
  db: AsyncSession,
27
- k: int = 5
28
- ) -> List[Dict[str, Any]]:
29
- """Retrieve relevant chunks for a query, scoped to the user's documents.
30
-
31
- Returns:
32
- List of dicts with keys: content, metadata
33
- metadata includes: document_id, user_id, filename, chunk_index, page_label (if PDF)
34
- """
35
  try:
36
- redis = await get_redis()
37
- query_hash = hashlib.md5(query.encode()).hexdigest()
38
- cache_key = f"retrieval:{user_id}:{query_hash}:{k}"
39
-
40
- cached = await redis.get(cache_key)
41
- if cached:
42
- logger.info("Returning cached retrieval results")
43
- return json.loads(cached)
44
-
45
- logger.info(f"Retrieving for user {user_id}, query: {query[:50]}...")
46
-
47
- docs = await self.vector_store.asimilarity_search(
48
- query=query,
49
- k=k,
50
- filter={"user_id": user_id}
51
- )
52
-
53
- results = [
54
- {
55
- "content": doc.page_content,
56
- "metadata": doc.metadata,
57
- }
58
- for doc in docs
59
- ]
60
-
61
- logger.info(f"Retrieved {len(results)} chunks")
62
- await redis.setex(cache_key, _RETRIEVAL_CACHE_TTL, json.dumps(results))
63
- return results
64
-
65
  except Exception as e:
66
- logger.error("Retrieval failed", error=str(e))
67
  return []
68
 
69
 
 
1
+ """Public retrieval API thin wrapper around RetrievalRouter."""
2
 
 
 
 
 
3
  from sqlalchemy.ext.asyncio import AsyncSession
4
+
5
  from src.middlewares.logging import get_logger
6
+ from src.rag.base import RetrievalResult
7
+ from src.rag.retrievers.document import document_retriever
8
+ from src.rag.retrievers.schema import schema_retriever
9
+ from src.rag.router import RetrievalRouter, SourceHint
10
 
11
  logger = get_logger("retriever")
12
 
 
 
13
 
14
  class RetrieverService:
15
+ """Public retrieval service used by chat.py and search tools.
16
+
17
+ Delegates to RetrievalRouter which dispatches based on source_hint.
18
+ Returns RetrievalResult objects directly so downstream consumers
19
+ (db_executor, tabular_executor) can be fed without lossy dict
20
+ conversion. The `db` parameter is accepted for call-site compatibility
21
+ but currently unused — retrieval reads PGVector via _pgvector_engine
22
+ inside each retriever.
23
+ """
24
 
25
  def __init__(self):
26
+ self._router = RetrievalRouter(
27
+ schema_retriever=schema_retriever,
28
+ document_retriever=document_retriever,
29
+ )
30
 
31
  async def retrieve(
32
  self,
33
  query: str,
34
  user_id: str,
35
  db: AsyncSession,
36
+ k: int = 5,
37
+ source_hint: SourceHint = "both",
38
+ ) -> list[RetrievalResult]:
 
 
 
 
 
39
  try:
40
+ return await self._router.retrieve(query, user_id, source_hint, k)
41
  except Exception as e:
42
+ logger.error("retrieval failed", error=str(e))
43
  return []
44
 
45
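Reviewer note: a sketch of the new call shape with `source_hint`. The `"database"` literal is an assumption about the `SourceHint` values defined in src/rag/router.py (not shown in this file); only `"both"` is confirmed by the default above.

```python
"""Illustrative usage of the updated RetrieverService signature."""
from sqlalchemy.ext.asyncio import AsyncSession

from src.rag.retriever import RetrieverService


async def demo(db: AsyncSession) -> None:
    retriever = RetrieverService()
    # Narrow to schema retrieval for a SQL-style question.
    db_hits = await retriever.retrieve(
        "total revenue per region", user_id="user-123", db=db, k=5,
        source_hint="database",  # assumed SourceHint literal; see src/rag/router.py
    )
    # Default hint "both" fans out to document and schema retrievers.
    all_hits = await retriever.retrieve(
        "summarise the onboarding policy", user_id="user-123", db=db, k=5
    )
    print(len(db_hits), len(all_hits))
```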
 
src/rag/retrievers/__init__.py ADDED
File without changes
src/rag/retrievers/baseline.py ADDED
@@ -0,0 +1,76 @@
1
+ """Service for retrieving relevant documents from vector store."""
2
+
3
+ import hashlib
4
+ import json
5
+ from src.db.postgres.vector_store import get_vector_store
6
+ from src.db.redis.connection import get_redis
7
+ from sqlalchemy.ext.asyncio import AsyncSession
8
+ from src.middlewares.logging import get_logger
9
+ from typing import List, Dict, Any
10
+
11
+ logger = get_logger("retriever")
12
+
13
+ _RETRIEVAL_CACHE_TTL = 3600 # 1 hour
14
+
15
+
16
+ class BaselineRetrieverService:
17
+ """Baseline (pre-Phase-1) retriever — preserved for benchmark comparison.
18
+
19
+ Renamed from RetrieverService so it doesn't shadow the production wrapper
20
+ at src/rag/retriever.py. Production code imports from src.rag.retriever;
21
+ benchmark scripts that want this baseline must import explicitly from
22
+ src.rag.retrievers.baseline.
23
+ """
24
+
25
+ def __init__(self):
26
+ self.vector_store = get_vector_store()
27
+
28
+ async def retrieve(
29
+ self,
30
+ query: str,
31
+ user_id: str,
32
+ db: AsyncSession,
33
+ k: int = 5
34
+ ) -> List[Dict[str, Any]]:
35
+ """Retrieve relevant chunks for a query, scoped to the user's documents.
36
+
37
+ Returns:
38
+ List of dicts with keys: content, metadata
39
+ metadata includes: document_id, user_id, filename, chunk_index, page_label (if PDF)
40
+ """
41
+ try:
42
+ redis = await get_redis()
43
+ query_hash = hashlib.md5(query.encode()).hexdigest()
44
+ cache_key = f"retrieval:{user_id}:{query_hash}:{k}"
45
+
46
+ cached = await redis.get(cache_key)
47
+ if cached:
48
+ logger.info("Returning cached retrieval results")
49
+ return json.loads(cached)
50
+
51
+ logger.info(f"Retrieving for user {user_id}, query: {query[:50]}...")
52
+
53
+ docs = await self.vector_store.asimilarity_search(
54
+ query=query,
55
+ k=k,
56
+ filter={"user_id": user_id}
57
+ )
58
+
59
+ results = [
60
+ {
61
+ "content": doc.page_content,
62
+ "metadata": doc.metadata,
63
+ }
64
+ for doc in docs
65
+ ]
66
+
67
+ logger.info(f"Retrieved {len(results)} chunks")
68
+ await redis.setex(cache_key, _RETRIEVAL_CACHE_TTL, json.dumps(results))
69
+ return results
70
+
71
+ except Exception as e:
72
+ logger.error("Retrieval failed", error=str(e))
73
+ return []
74
+
75
+
76
+ baseline_retriever = BaselineRetrieverService()
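Reviewer note: per the docstring above, benchmark code has to import the baseline explicitly and reconcile the two return shapes. A minimal sketch, with the `compare` helper name invented for illustration:

```python
"""Sketch of a benchmark comparing the baseline and router-backed retrieval paths."""
from sqlalchemy.ext.asyncio import AsyncSession

from src.rag.retriever import RetrieverService              # production, returns RetrievalResult
from src.rag.retrievers.baseline import baseline_retriever  # legacy, returns list[dict]


async def compare(query: str, user_id: str, db: AsyncSession) -> tuple[int, int]:
    """Count hits from each path; note the shapes differ (dicts vs RetrievalResult)."""
    old = await baseline_retriever.retrieve(query, user_id, db, k=5)
    new = await RetrieverService().retrieve(query, user_id, db, k=5)
    return len(old), len(new)
```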
src/rag/retrievers/document.py ADDED
@@ -0,0 +1,158 @@
1
+ """Document retriever — handles PDF, DOCX, TXT chunks (source_type="document", non-tabular)."""
2
+
3
+ import math
4
+
5
+ from langchain_postgres import PGVector
6
+ from langchain_postgres.vectorstores import DistanceStrategy
7
+ from langchain_openai import AzureOpenAIEmbeddings
8
+ from sqlalchemy import text
9
+
10
+ from src.config.settings import settings
11
+ from src.db.postgres.connection import _pgvector_engine
12
+ from src.db.postgres.vector_store import get_vector_store
13
+ from src.middlewares.logging import get_logger
14
+ from src.rag.base import BaseRetriever, RetrievalResult
15
+
16
+ logger = get_logger("document_retriever")
17
+
18
+ # Change this one line to switch retrieval method
19
+ # Options: "mmr" | "cosine" | "euclidean" | "inner_product" | "manhattan"
20
+ _RETRIEVAL_METHOD = "mmr"
21
+
22
+ _TABULAR_TYPES = {"csv", "xlsx"}
23
+ _FETCH_K = 20
24
+ _LAMBDA_MULT = 0.5
25
+ _COLLECTION_NAME = "document_embeddings"
26
+
27
+ _embeddings = AzureOpenAIEmbeddings(
28
+ azure_deployment=settings.azureai_deployment_name_embedding,
29
+ openai_api_version=settings.azureai_api_version_embedding,
30
+ azure_endpoint=settings.azureai_endpoint_url_embedding,
31
+ api_key=settings.azureai_api_key_embedding,
32
+ )
33
+
34
+ _euclidean_store = PGVector(
35
+ embeddings=_embeddings,
36
+ connection=_pgvector_engine,
37
+ collection_name=_COLLECTION_NAME,
38
+ distance_strategy=DistanceStrategy.EUCLIDEAN,
39
+ use_jsonb=True,
40
+ async_mode=True,
41
+ create_extension=False,
42
+ )
43
+
44
+ _ip_store = PGVector(
45
+ embeddings=_embeddings,
46
+ connection=_pgvector_engine,
47
+ collection_name=_COLLECTION_NAME,
48
+ distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
49
+ use_jsonb=True,
50
+ async_mode=True,
51
+ create_extension=False,
52
+ )
53
+
54
+ _MANHATTAN_SQL = text("""
55
+ SELECT
56
+ lpe.document,
57
+ lpe.cmetadata,
58
+ lpe.embedding <+> CAST(:embedding AS vector) AS distance
59
+ FROM langchain_pg_embedding lpe
60
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
61
+ WHERE lpc.name = :collection
62
+ AND lpe.cmetadata->>'user_id' = :user_id
63
+ AND lpe.cmetadata->>'source_type' = 'document'
64
+ ORDER BY distance ASC
65
+ LIMIT :k
66
+ """)
67
+
68
+
69
+ class DocumentRetriever(BaseRetriever):
70
+ def __init__(self) -> None:
71
+ self.vector_store = get_vector_store()
72
+
73
+ async def retrieve(
74
+ self, query: str, user_id: str, k: int = 5
75
+ ) -> list[RetrievalResult]:
76
+ filter_ = {"user_id": user_id, "source_type": "document"}
77
+ fetch_k = k + len(_TABULAR_TYPES)
78
+
79
+ if _RETRIEVAL_METHOD == "manhattan":
80
+ return await self._retrieve_manhattan(query, user_id, k, fetch_k)
81
+
82
+ if _RETRIEVAL_METHOD == "mmr":
83
+ docs = await self.vector_store.amax_marginal_relevance_search(
84
+ query=query,
85
+ k=fetch_k,
86
+ fetch_k=_FETCH_K,
87
+ lambda_mult=_LAMBDA_MULT,
88
+ filter=filter_,
89
+ )
90
+ cosine = await self.vector_store.asimilarity_search_with_score(
91
+ query=query, k=fetch_k, filter=filter_,
92
+ )
93
+ score_map = {doc.page_content: score for doc, score in cosine}
94
+ docs_with_scores = [(doc, score_map.get(doc.page_content, 0.0)) for doc in docs]
95
+ elif _RETRIEVAL_METHOD == "euclidean":
96
+ docs_with_scores = await _euclidean_store.asimilarity_search_with_score(
97
+ query=query, k=fetch_k, filter=filter_,
98
+ )
99
+ elif _RETRIEVAL_METHOD == "inner_product":
100
+ docs_with_scores = await _ip_store.asimilarity_search_with_score(
101
+ query=query, k=fetch_k, filter=filter_,
102
+ )
103
+ else: # cosine
104
+ docs_with_scores = await self.vector_store.asimilarity_search_with_score(
105
+ query=query, k=fetch_k, filter=filter_,
106
+ )
107
+
108
+ results = []
109
+ for doc, score in docs_with_scores:
110
+ file_type = doc.metadata.get("data", {}).get("file_type", "")
111
+ if file_type not in _TABULAR_TYPES:
112
+ results.append(RetrievalResult(
113
+ content=doc.page_content,
114
+ metadata=doc.metadata,
115
+ score=score,
116
+ source_type="document",
117
+ ))
118
+ if len(results) == k:
119
+ break
120
+
121
+ logger.info("retrieved chunks", method=_RETRIEVAL_METHOD, count=len(results))
122
+ return results
123
+
124
+ async def _retrieve_manhattan(
125
+ self, query: str, user_id: str, k: int, fetch_k: int
126
+ ) -> list[RetrievalResult]:
127
+ query_vector = await _embeddings.aembed_query(query)
128
+ if not all(math.isfinite(v) for v in query_vector):
129
+ raise ValueError("Embedding vector contains NaN or Infinity values.")
130
+ vector_str = "[" + ",".join(str(v) for v in query_vector) + "]"
131
+
132
+ async with _pgvector_engine.connect() as conn:
133
+ result = await conn.execute(_MANHATTAN_SQL, {
134
+ "embedding": vector_str,
135
+ "collection": _COLLECTION_NAME,
136
+ "user_id": user_id,
137
+ "k": fetch_k,
138
+ })
139
+ rows = result.fetchall()
140
+
141
+ results = []
142
+ for row in rows:
143
+ file_type = row.cmetadata.get("data", {}).get("file_type", "")
144
+ if file_type not in _TABULAR_TYPES:
145
+ results.append(RetrievalResult(
146
+ content=row.document,
147
+ metadata=row.cmetadata,
148
+ score=float(row.distance),
149
+ source_type="document",
150
+ ))
151
+ if len(results) == k:
152
+ break
153
+
154
+ logger.info("retrieved chunks", method="manhattan", count=len(results))
155
+ return results
156
+
157
+
158
+ document_retriever = DocumentRetriever()
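Reviewer note: the method switch above only names the strategies, so here is a quick reference for how each maps onto a pgvector operator, plus the vector-literal formatting `_retrieve_manhattan` builds. The embedding values are invented.

```python
"""Sketch of the pgvector literal and operator mapping behind _RETRIEVAL_METHOD (values invented)."""
import math

query_vector = [0.12, -0.05, 0.33]  # stand-in for the real embedding output
assert all(math.isfinite(v) for v in query_vector)
vector_str = "[" + ",".join(str(v) for v in query_vector) + "]"

# pgvector operators behind each _RETRIEVAL_METHOD value:
#   cosine        -> <=>  (default PGVector store)
#   euclidean     -> <->  (DistanceStrategy.EUCLIDEAN)
#   inner_product -> <#>  (DistanceStrategy.MAX_INNER_PRODUCT)
#   manhattan     -> <+>  (raw SQL above; requires pgvector >= 0.7)
print(vector_str)  # "[0.12,-0.05,0.33]"
```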
src/rag/retrievers/schema.py ADDED
@@ -0,0 +1,411 @@
1
+ """Schema retriever — handles DB schemas (source_type="database") and tabular file
2
+ columns stored as source_type="document" with file_type in ("csv","xlsx").
3
+
4
+ Strategy: hybrid_bm25 — RRF merge of dense cosine search (DB columns + DB tables
5
+ + tabular columns + tabular sheets) and PostgreSQL full-text search (DB columns only).
6
+ Embeds the query once, fans out five legs in parallel.
7
+
8
+ The DB-tables leg surfaces table-level summary chunks (chunk_level='table') as
9
+ a recall signal for multi-table questions: when a relevant table's columns
10
+ don't individually win on similarity, the table chunk can still pull the table
11
+ into the hit set, where db_executor's downstream full-schema fetch picks up
12
+ the per-column detail.
13
+
14
+ FTS requires a GIN index on langchain_pg_embedding.document (created by init_db.py).
15
+ """
16
+
17
+ import asyncio
18
+
19
+ from sqlalchemy import text
20
+
21
+ from src.db.postgres.connection import _pgvector_engine
22
+ from src.db.postgres.vector_store import get_vector_store
23
+ from src.middlewares.logging import get_logger
24
+ from src.rag.base import BaseRetriever, RetrievalResult
25
+
26
+ logger = get_logger("schema_retriever")
27
+
28
+ _TABULAR_FILE_TYPES = ("csv", "xlsx")
29
+ _TABLE_CHUNK_K_MULTIPLIER = 2 # how many table chunks to pull before RRF
30
+
31
+
32
+ class SchemaRetriever(BaseRetriever):
33
+ def __init__(self):
34
+ self.vector_store = get_vector_store()
35
+
36
+ # ------------------------------------------------------------------
37
+ # Internal helpers
38
+ # ------------------------------------------------------------------
39
+
40
+ async def _embed_query(self, query: str) -> list[float]:
41
+ return await asyncio.to_thread(self.vector_store.embeddings.embed_query, query)
42
+
43
+ async def _search_db(
44
+ self, embedding: list[float], user_id: str, k: int
45
+ ) -> list[RetrievalResult]:
46
+ """Cosine vector search over database chunks."""
47
+ emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
48
+
49
+ sql = text(f"""
50
+ SELECT lpe.document, lpe.cmetadata,
51
+ 1.0 - (lpe.embedding <=> '{emb_str}'::vector) AS score
52
+ FROM langchain_pg_embedding lpe
53
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
54
+ WHERE lpc.name = 'document_embeddings'
55
+ AND lpe.cmetadata->>'user_id' = :user_id
56
+ AND lpe.cmetadata->>'source_type' = 'database'
57
+ AND lpe.cmetadata->>'chunk_level' = 'column'
58
+ ORDER BY lpe.embedding <=> '{emb_str}'::vector ASC
59
+ LIMIT :k
60
+ """)
61
+
62
+ async with _pgvector_engine.connect() as conn:
63
+ result = await conn.execute(sql, {"user_id": user_id, "k": k * 4})
64
+ rows = result.fetchall()
65
+
66
+ return [
67
+ RetrievalResult(
68
+ content=row.document,
69
+ metadata=row.cmetadata,
70
+ score=float(row.score),
71
+ source_type="database",
72
+ )
73
+ for row in rows
74
+ ]
75
+
76
+ async def _search_db_tables(
77
+ self, embedding: list[float], user_id: str, k: int
78
+ ) -> list[RetrievalResult]:
79
+ """Cosine vector search over database TABLE-level chunks.
80
+
81
+ Recall channel for multi-table questions. The chunk's content is
82
+ discarded downstream — db_executor only consumes its `data.table_name`
83
+ to seed full-schema fetch.
84
+ """
85
+ emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
86
+
87
+ sql = text(f"""
88
+ SELECT lpe.document, lpe.cmetadata,
89
+ 1.0 - (lpe.embedding <=> '{emb_str}'::vector) AS score
90
+ FROM langchain_pg_embedding lpe
91
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
92
+ WHERE lpc.name = 'document_embeddings'
93
+ AND lpe.cmetadata->>'user_id' = :user_id
94
+ AND lpe.cmetadata->>'source_type' = 'database'
95
+ AND lpe.cmetadata->>'chunk_level' = 'table'
96
+ ORDER BY lpe.embedding <=> '{emb_str}'::vector ASC
97
+ LIMIT :k
98
+ """)
99
+
100
+ async with _pgvector_engine.connect() as conn:
101
+ result = await conn.execute(
102
+ sql, {"user_id": user_id, "k": k * _TABLE_CHUNK_K_MULTIPLIER}
103
+ )
104
+ rows = result.fetchall()
105
+
106
+ return [
107
+ RetrievalResult(
108
+ content=row.document,
109
+ metadata=row.cmetadata,
110
+ score=float(row.score),
111
+ source_type="database",
112
+ )
113
+ for row in rows
114
+ ]
115
+
116
+ async def _search_tabular(
117
+ self, embedding: list[float], user_id: str, k: int
118
+ ) -> list[RetrievalResult]:
119
+ """Cosine vector search over tabular document chunks (csv/xlsx)."""
120
+ emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
121
+
122
+ sql = text(f"""
123
+ SELECT lpe.document, lpe.cmetadata,
124
+ 1.0 - (lpe.embedding <=> '{emb_str}'::vector) AS score
125
+ FROM langchain_pg_embedding lpe
126
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
127
+ WHERE lpc.name = 'document_embeddings'
128
+ AND lpe.cmetadata->>'user_id' = :user_id
129
+ AND lpe.cmetadata->>'source_type' = 'document'
130
+ AND lpe.cmetadata->>'chunk_level' = 'column'
131
+ AND (lpe.cmetadata->'data'->>'file_type' = 'csv'
132
+ OR lpe.cmetadata->'data'->>'file_type' = 'xlsx')
133
+ ORDER BY lpe.embedding <=> '{emb_str}'::vector ASC
134
+ LIMIT :k
135
+ """)
136
+
137
+ async with _pgvector_engine.connect() as conn:
138
+ result = await conn.execute(sql, {"user_id": user_id, "k": k * 4})
139
+ rows = result.fetchall()
140
+
141
+ return [
142
+ RetrievalResult(
143
+ content=row.document,
144
+ metadata=row.cmetadata,
145
+ score=float(row.score),
146
+ source_type="document",
147
+ )
148
+ for row in rows
149
+ ]
150
+
151
+ async def _search_tabular_sheets(
152
+ self, embedding: list[float], user_id: str, k: int
153
+ ) -> list[RetrievalResult]:
154
+ """Leg 5: sheet-level summary chunks from CSV/XLSX files."""
155
+ emb_str = "[" + ",".join(str(x) for x in embedding) + "]"
156
+
157
+ sql = text(f"""
158
+ SELECT lpe.document, lpe.cmetadata,
159
+ 1.0 - (lpe.embedding <=> '{emb_str}'::vector) AS score
160
+ FROM langchain_pg_embedding lpe
161
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
162
+ WHERE lpc.name = 'document_embeddings'
163
+ AND lpe.cmetadata->>'user_id' = :user_id
164
+ AND lpe.cmetadata->>'source_type' = 'document'
165
+ AND lpe.cmetadata->>'chunk_level' = 'sheet'
166
+ AND (lpe.cmetadata->'data'->>'file_type' = 'csv'
167
+ OR lpe.cmetadata->'data'->>'file_type' = 'xlsx')
168
+ ORDER BY lpe.embedding <=> '{emb_str}'::vector ASC
169
+ LIMIT :k
170
+ """)
171
+
172
+ async with _pgvector_engine.connect() as conn:
173
+ result = await conn.execute(sql, {"user_id": user_id, "k": k})
174
+ rows = result.fetchall()
175
+
176
+ return [
177
+ RetrievalResult(
178
+ content=row.document,
179
+ metadata=row.cmetadata,
180
+ score=float(row.score),
181
+ source_type="document",
182
+ )
183
+ for row in rows
184
+ ]
185
+
186
+ async def _search_fts_db(self, query: str, user_id: str, k: int) -> list[RetrievalResult]:
187
+ """Full-text search over DB schema chunks using PostgreSQL tsvector."""
188
+ sql = text("""
189
+ SELECT lpe.document, lpe.cmetadata,
190
+ ts_rank(to_tsvector('english', lpe.document),
191
+ plainto_tsquery('english', :query)) AS rank
192
+ FROM langchain_pg_embedding lpe
193
+ JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
194
+ WHERE lpc.name = 'document_embeddings'
195
+ AND lpe.cmetadata->>'user_id' = :user_id
196
+ AND lpe.cmetadata->>'source_type' = 'database'
197
+ AND lpe.cmetadata->>'chunk_level' = 'column'
198
+ AND to_tsvector('english', lpe.document) @@ plainto_tsquery('english', :query)
199
+ ORDER BY rank DESC
200
+ LIMIT :k
201
+ """)
202
+
203
+ async with _pgvector_engine.connect() as conn:
204
+ result = await conn.execute(sql, {"query": query, "user_id": user_id, "k": k})
205
+ rows = result.fetchall()
206
+
207
+ return [
208
+ RetrievalResult(
209
+ content=row.document,
210
+ metadata=row.cmetadata,
211
+ score=float(row.rank),
212
+ source_type="database",
213
+ )
214
+ for row in rows
215
+ ]
216
+
217
+ def _rank_tabular_sheets(
218
+ self,
219
+ sheet_results: list[RetrievalResult],
220
+ column_results: list[RetrievalResult],
221
+ top_k: int,
222
+ k_rrf: int = 60,
223
+ ) -> list[RetrievalResult]:
224
+ """Rank tabular sheets by RRF across two voting legs:
225
+ L1 (primary): sheet-chunk cosine score
226
+ L2 (vote): best column-chunk position per (doc_id, sheet_name)
227
+
228
+ Returns top-k sheet-level RetrievalResults. The full column list of
229
+ each sheet is already in the sheet chunk's data.column_names from
230
+ ingestion, so downstream tabular_executor can read full sheet context.
231
+
232
+ For sheets surfaced by column votes but missing a sheet chunk (rare —
233
+ ingestion always creates one), a minimal stub is returned and
234
+ tabular_executor falls back to reading columns from the parquet.
235
+ """
236
+ # L1: sheets indexed by (doc_id, sheet_name) from sheet chunks
237
+ sheet_index: dict[tuple, RetrievalResult] = {}
238
+ sheet_ranked: list[tuple] = []
239
+ for r in sheet_results:
240
+ d = r.metadata.get("data", {})
241
+ key = (d.get("document_id"), d.get("sheet_name"))
242
+ if key[0] and key not in sheet_index:
243
+ sheet_index[key] = r
244
+ sheet_ranked.append(key)
245
+
246
+ # L2: sheets ranked by first-appearance in column-chunk results
247
+ col_sheet_ranked: list[tuple] = []
248
+ seen: set[tuple] = set()
249
+ for r in column_results:
250
+ d = r.metadata.get("data", {})
251
+ key = (d.get("document_id"), d.get("sheet_name"))
252
+ if key[0] and key not in seen:
253
+ col_sheet_ranked.append(key)
254
+ seen.add(key)
255
+
256
+ # RRF over (doc_id, sheet_name) across the two legs
257
+ rrf_scores: dict[tuple, float] = {}
258
+ for ranked_list in [sheet_ranked, col_sheet_ranked]:
259
+ for rank, key in enumerate(ranked_list):
260
+ rrf_scores[key] = rrf_scores.get(key, 0.0) + 1.0 / (k_rrf + rank + 1)
261
+
262
+ top_sheets = sorted(rrf_scores, key=lambda k: rrf_scores[k], reverse=True)[:top_k]
263
+
264
+ results: list[RetrievalResult] = []
265
+ for key in top_sheets:
266
+ if key in sheet_index:
267
+ r = sheet_index[key]
268
+ r.score = rrf_scores[key]
269
+ results.append(r)
270
+ else:
271
+ # Surfaced by column votes only — build stub from a representative
272
+ # column result so tabular_executor can group correctly.
273
+ doc_id, sheet_name = key
274
+ rep = next(
275
+ (r for r in column_results
276
+ if r.metadata.get("data", {}).get("document_id") == doc_id
277
+ and r.metadata.get("data", {}).get("sheet_name") == sheet_name),
278
+ None,
279
+ )
280
+ if rep is None:
281
+ continue
282
+ stub_data = dict(rep.metadata.get("data", {}))
283
+ stub_data.pop("column_name", None)
284
+ stub_data.pop("column_type", None)
285
+ results.append(RetrievalResult(
286
+ content=f"Sheet: {stub_data.get('filename', '')}"
287
+ + (f" / sheet: {sheet_name}" if sheet_name else ""),
288
+ metadata={**rep.metadata, "data": stub_data, "chunk_level": "sheet"},
289
+ score=rrf_scores[key],
290
+ source_type="document",
291
+ ))
292
+ return results
293
+
294
+ def _rank_db_tables(
295
+ self,
296
+ tbl_results: list[RetrievalResult],
297
+ col_results: list[RetrievalResult],
298
+ fts_results: list[RetrievalResult],
299
+ top_k: int,
300
+ k_rrf: int = 60,
301
+ ) -> list[RetrievalResult]:
302
+ """Rank DB tables by RRF across three legs:
303
+ L1 (primary): table-summary chunk similarity
304
+ L2 (vote): best column-chunk position per table
305
+ L3 (vote): best FTS position per table
306
+
307
+ Returns top-k table-chunk RetrievalResults. For tables surfaced by
308
+ L2/L3 but missing a table chunk, a minimal stub is returned so that
309
+ db_executor._fetch_full_schema can seed off data.table_name.
310
+ """
311
+ # L1: tables ranked by table-chunk cosine score
312
+ tbl_index: dict[str, RetrievalResult] = {}
313
+ tbl_ranked: list[str] = []
314
+ for r in tbl_results:
315
+ tname = r.metadata.get("data", {}).get("table_name")
316
+ if tname and tname not in tbl_index:
317
+ tbl_index[tname] = r
318
+ tbl_ranked.append(tname)
319
+
320
+ # L2: tables ranked by first-appearance in column-chunk list (best col score)
321
+ col_table_ranked: list[str] = []
322
+ seen: set[str] = set()
323
+ for r in col_results:
324
+ tname = r.metadata.get("data", {}).get("table_name")
325
+ if tname and tname not in seen:
326
+ col_table_ranked.append(tname)
327
+ seen.add(tname)
328
+
329
+ # L3: tables ranked by first-appearance in FTS list
330
+ fts_table_ranked: list[str] = []
331
+ seen = set()
332
+ for r in fts_results:
333
+ tname = r.metadata.get("data", {}).get("table_name")
334
+ if tname and tname not in seen:
335
+ fts_table_ranked.append(tname)
336
+ seen.add(tname)
337
+
338
+ # RRF over table names across the three legs
339
+ rrf_scores: dict[str, float] = {}
340
+ for ranked_list in [tbl_ranked, col_table_ranked, fts_table_ranked]:
341
+ for rank, tname in enumerate(ranked_list):
342
+ rrf_scores[tname] = rrf_scores.get(tname, 0.0) + 1.0 / (k_rrf + rank + 1)
343
+
344
+ top_tables = sorted(rrf_scores, key=lambda t: rrf_scores[t], reverse=True)[:top_k]
345
+
346
+ results: list[RetrievalResult] = []
347
+ for tname in top_tables:
348
+ if tname in tbl_index:
349
+ r = tbl_index[tname]
350
+ r.score = rrf_scores[tname]
351
+ results.append(r)
352
+ else:
353
+ # Surfaced by column/FTS votes with no table chunk — minimal stub
354
+ results.append(RetrievalResult(
355
+ content=f"Table: {tname}",
356
+ metadata={"data": {"table_name": tname}, "source_type": "database"},
357
+ score=rrf_scores[tname],
358
+ source_type="database",
359
+ ))
360
+ return results
361
+
362
+ # ------------------------------------------------------------------
363
+ # Public interface — called by the router
364
+ # ------------------------------------------------------------------
365
+
366
+ async def retrieve(self, query: str, user_id: str, k: int = 5) -> list[RetrievalResult]:
367
+ """Table-first retrieval for DB sources; chunk-level for tabular.
368
+
369
+ DB tables are ranked via RRF across three legs:
370
+ L1 (primary): table-summary chunk similarity
371
+ L2 (vote): top-K column-chunk cosine, grouped by table
372
+ L3 (vote): top-K FTS column hits, grouped by table
373
+
374
+ db_executor downstream fetches the full per-column schema for the
375
+ ranked table set via _fetch_full_schema — the column chunks returned
376
+ here are intentionally NOT used as the schema source, only for voting.
377
+
378
+ Tabular (CSV/XLSX) sheets are ranked via RRF across two legs:
379
+ L1: sheet-chunk cosine
380
+ L2: column-chunk votes (best position per sheet)
381
+ Returns sheet-level RetrievalResults so tabular_executor receives
382
+ full sheet context (all columns) rather than fragmented column hits.
383
+ """
384
+ embedding = await self._embed_query(query)
385
+ db_col_results, db_tbl_results, tabular_results, fts_results, sheet_results = await asyncio.gather(
386
+ self._search_db(embedding, user_id, k),
387
+ self._search_db_tables(embedding, user_id, k),
388
+ self._search_tabular(embedding, user_id, k),
389
+ self._search_fts_db(query, user_id, k * 4),
390
+ self._search_tabular_sheets(embedding, user_id, k),
391
+ )
392
+
393
+ db_ranked = self._rank_db_tables(db_tbl_results, db_col_results, fts_results, top_k=k)
394
+ tabular_ranked = self._rank_tabular_sheets(sheet_results, tabular_results, top_k=k)
395
+
396
+ results = sorted(db_ranked + tabular_ranked, key=lambda r: r.score, reverse=True)
397
+ logger.info(
398
+ "schema retrieval",
399
+ count=len(results),
400
+ db_tables_ranked=len(db_ranked),
401
+ db_cols=len(db_col_results),
402
+ db_tables=len(db_tbl_results),
403
+ tabular_cols=len(tabular_results),
404
+ tabular_sheets=len(sheet_results),
405
+ tabular_ranked=len(tabular_ranked),
406
+ fts=len(fts_results),
407
+ )
408
+ return results
409
+
410
+
411
+ schema_retriever = SchemaRetriever()
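For readers following the three-leg ranking in `_rank_db_tables` and `retrieve` above, here is a minimal standalone sketch of the Reciprocal Rank Fusion arithmetic; the table names and leg orderings are illustrative only, not taken from this PR.

    # Minimal RRF sketch; mirrors rrf_scores[...] += 1.0 / (k_rrf + rank + 1)
    def rrf(ranked_lists: list[list[str]], k_rrf: int = 60) -> dict[str, float]:
        scores: dict[str, float] = {}
        for ranked in ranked_lists:
            for rank, name in enumerate(ranked):
                scores[name] = scores.get(name, 0.0) + 1.0 / (k_rrf + rank + 1)
        return scores

    table_leg  = ["orders", "customers"]   # L1: table-summary cosine order
    column_leg = ["orders", "payments"]    # L2: best column-chunk hit per table
    fts_leg    = ["payments", "orders"]    # L3: best FTS hit per table

    scores = rrf([table_leg, column_leg, fts_leg])
    # "orders" sits near the top of every leg, so it wins:
    # orders ~ 0.0489, payments ~ 0.0325, customers ~ 0.0161
    top_tables = sorted(scores, key=scores.get, reverse=True)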
src/rag/router.py ADDED
@@ -0,0 +1,179 @@
1
+ """Routes retrieval requests to the appropriate retriever based on source_hint.
2
+
3
+ Cross-retriever merging uses Reciprocal Rank Fusion (RRF) on per-retriever
4
+ ranked lists — score scales differ across retrievers (RRF, cosine, distance)
5
+ and aren't directly comparable, so we rank-merge instead of score-merge.
6
+ """
7
+
8
+ import asyncio
9
+ import hashlib
10
+ import json
11
+ from dataclasses import asdict
12
+ from typing import Literal
13
+
14
+ from src.db.redis.connection import get_redis
15
+ from src.middlewares.logging import get_logger
16
+ from src.rag.base import BaseRetriever, RetrievalResult
17
+
18
+ logger = get_logger("retrieval_router")
19
+
20
+ _CACHE_TTL = 3600 # 1 hour
21
+ _CACHE_KEY_PREFIX = "retrieval"
22
+ _RRF_K = 60 # standard RRF constant
23
+ SourceHint = Literal["document", "schema", "both"]
24
+
25
+
26
+ def _result_dedup_key(r: RetrievalResult) -> tuple:
27
+ """Cross-retriever dedup key — distinguishes DB columns vs DB tables vs
28
+ tabular columns vs prose chunks vs sheet-level chunks."""
29
+ data = r.metadata.get("data", {})
30
+ return (
31
+ r.source_type,
32
+ data.get("table_name"),
33
+ data.get("column_name"),
34
+ data.get("filename"),
35
+ data.get("sheet_name"),
36
+ data.get("chunk_index"), # disambiguates multiple prose chunks per doc
37
+ r.metadata.get("chunk_level"), # distinguishes sheet vs column chunks
38
+ )
39
+
40
+
41
+ def _rrf_merge(
42
+ ranked_lists: list[list[RetrievalResult]],
43
+ top_k: int,
44
+ k_rrf: int = _RRF_K,
45
+ ) -> list[RetrievalResult]:
46
+ """Reciprocal Rank Fusion across retriever batches.
47
+
48
+ Each input list is treated as already best-first ordered. Items are
49
+ deduped via _result_dedup_key and re-ranked by aggregated reciprocal
50
+ rank across all lists. Score on the returned RetrievalResult is the
51
+ aggregated RRF score (uniform scale across legs).
52
+ """
53
+ scores: dict[tuple, float] = {}
54
+ index: dict[tuple, RetrievalResult] = {}
55
+
56
+ for ranked in ranked_lists:
57
+ for rank, result in enumerate(ranked):
58
+ key = _result_dedup_key(result)
59
+ scores[key] = scores.get(key, 0.0) + 1.0 / (k_rrf + rank + 1)
60
+ # Keep the first occurrence; metadata is identical for the same
61
+ # key across lists, so any copy is fine.
62
+ if key not in index:
63
+ index[key] = result
64
+
65
+ merged = sorted(index.values(), key=lambda r: scores[_result_dedup_key(r)], reverse=True)
66
+ # Overwrite score with RRF score so downstream consumers see a uniform scale.
67
+ for r in merged:
68
+ r.score = scores[_result_dedup_key(r)]
69
+ return merged[:top_k]
70
+
71
+
72
+ async def invalidate_retrieval_cache(user_id: str) -> int:
73
+ """Delete every cached retrieval entry for `user_id`.
74
+
75
+ Called by ingest/upload/delete API handlers after a successful write so
76
+ the next retrieval picks up the new data instead of stale cached top-k.
77
+ Returns the number of keys removed.
78
+ """
79
+ redis = await get_redis()
80
+ pattern = f"{_CACHE_KEY_PREFIX}:{user_id}:*"
81
+ keys = [key async for key in redis.scan_iter(match=pattern)]
82
+ if not keys:
83
+ return 0
84
+ deleted = await redis.delete(*keys)
85
+ logger.info("retrieval cache invalidated", user_id=user_id, deleted=deleted)
86
+ return int(deleted)
87
+
88
+
89
+ class RetrievalRouter:
90
+ def __init__(
91
+ self,
92
+ schema_retriever: BaseRetriever,
93
+ document_retriever: BaseRetriever,
94
+ ):
95
+ self._retrievers: dict[str, BaseRetriever] = {
96
+ "schema": schema_retriever,
97
+ "document": document_retriever,
98
+ }
99
+
100
+ def _route(self, source_hint: SourceHint) -> list[tuple[str, BaseRetriever]]:
101
+ if source_hint == "schema":
102
+ return [("schema", self._retrievers["schema"])]
103
+ if source_hint == "document":
104
+ return [("document", self._retrievers["document"])]
105
+ return list(self._retrievers.items())
106
+
107
+ async def retrieve(
108
+ self,
109
+ query: str,
110
+ user_id: str,
111
+ source_hint: SourceHint = "both",
112
+ k: int = 10,
113
+ ) -> list[RetrievalResult]:
114
+ redis = await get_redis()
115
+ query_hash = hashlib.md5(query.encode()).hexdigest()
116
+ cache_key = f"{_CACHE_KEY_PREFIX}:{user_id}:{source_hint}:{query_hash}:{k}"
117
+
118
+ cached = await redis.get(cache_key)
119
+ if cached:
120
+ try:
121
+ raw = json.loads(cached)
122
+ logger.info("returning cached retrieval results", source_hint=source_hint)
123
+ return [RetrievalResult(**r) for r in raw]
124
+ except Exception:
125
+ logger.warning("corrupted retrieval cache, fetching fresh", cache_key=cache_key)
126
+
127
+ results = await self._retrieve_uncached(query, user_id, source_hint, k)
128
+
129
+ # Empty-result fallback: orchestrator may have misclassified intent.
130
+ # Retry once with "both" before giving up. No-op when source_hint is
131
+ # already "both".
132
+ if not results and source_hint != "both":
133
+ logger.warning(
134
+ "empty retrieval, falling back to source_hint='both'",
135
+ original_source_hint=source_hint,
136
+ )
137
+ results = await self._retrieve_uncached(query, user_id, "both", k)
138
+
139
+ await redis.setex(
140
+ cache_key,
141
+ _CACHE_TTL,
142
+ json.dumps([asdict(r) for r in results]),
143
+ )
144
+ return results
145
+
146
+ async def _retrieve_uncached(
147
+ self,
148
+ query: str,
149
+ user_id: str,
150
+ source_hint: SourceHint,
151
+ k: int,
152
+ ) -> list[RetrievalResult]:
153
+ routed = self._route(source_hint)
154
+ batches = await asyncio.gather(
155
+ *[r.retrieve(query, user_id, k) for _, r in routed],
156
+ return_exceptions=True,
157
+ )
158
+
159
+ valid_lists: list[list[RetrievalResult]] = []
160
+ per_retriever: dict[str, int | str] = {}
161
+ for (name, _), batch in zip(routed, batches):
162
+ if isinstance(batch, Exception):
163
+ logger.error("retriever failed", retriever=name, error=str(batch))
164
+ per_retriever[name] = "error"
165
+ continue
166
+ valid_lists.append(batch)
167
+ per_retriever[name] = len(batch)
168
+
169
+ results = _rrf_merge(valid_lists, top_k=k)
170
+
171
+ logger.info(
172
+ "router result",
173
+ source_hint=source_hint,
174
+ per_retriever=per_retriever,
175
+ final_count=len(results),
176
+ top_score=results[0].score if results else None,
177
+ bottom_score=results[-1].score if results else None,
178
+ )
179
+ return results
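To see how the Redis cache key built in `RetrievalRouter.retrieve` lines up with the pattern scanned by `invalidate_retrieval_cache`, here is a small sketch; the user id, query, and k value are made up for illustration.

    import hashlib

    _CACHE_KEY_PREFIX = "retrieval"

    def cache_key(user_id: str, source_hint: str, query: str, k: int) -> str:
        # Same layout as the router: prefix:user:hint:md5(query):k
        query_hash = hashlib.md5(query.encode()).hexdigest()
        return f"{_CACHE_KEY_PREFIX}:{user_id}:{source_hint}:{query_hash}:{k}"

    key = cache_key("user-42", "both", "total revenue by region", 10)
    # -> "retrieval:user-42:both:<md5 hex>:10"
    # invalidate_retrieval_cache("user-42") scans "retrieval:user-42:*",
    # so one upload or delete wipes every hint/query/k combination for that user.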
src/storage/az_blob/az_blob.py CHANGED
@@ -57,6 +57,22 @@ class AzureBlobStorage:
57
  logger.error(f"Failed to download blob {blob_name}", error=str(e))
58
  raise
59

60
  async def delete_file(self, blob_name: str) -> bool:
61
  """Delete file from Azure Blob Storage."""
62
  try:
@@ -71,6 +87,24 @@ class AzureBlobStorage:
71
  logger.error(f"Failed to delete blob {blob_name}", error=str(e))
72
  return False
73

74
 
75
  # Singleton instance
76
  blob_storage = AzureBlobStorage()
 
57
  logger.error(f"Failed to download blob {blob_name}", error=str(e))
58
  raise
59
 
60
+ async def upload_bytes(self, content: bytes, blob_name: str) -> str:
61
+ """Upload bytes to Azure Blob Storage using a specific blob name.
62
+
63
+ Unlike upload_file(), this does not generate a UUID name — caller controls the blob_name.
64
+ Used for Parquet files where the name must be deterministic (derived from document_id).
65
+ """
66
+ try:
67
+ async with self._get_blob_client(blob_name) as blob_client:
68
+ logger.info(f"Uploading bytes to blob {blob_name}")
69
+ await blob_client.upload_blob(content, overwrite=True)
70
+ logger.info(f"Successfully uploaded {blob_name}")
71
+ return blob_name
72
+ except Exception as e:
73
+ logger.error(f"Failed to upload bytes to {blob_name}", error=str(e))
74
+ raise
75
+
76
  async def delete_file(self, blob_name: str) -> bool:
77
  """Delete file from Azure Blob Storage."""
78
  try:
 
87
  logger.error(f"Failed to delete blob {blob_name}", error=str(e))
88
  return False
89
 
90
+ async def delete_blobs_with_prefix(self, prefix: str) -> int:
91
+ """Delete all blobs whose name starts with prefix. Returns count deleted.
92
+
93
+ Used to delete all Parquet files for a document in one call.
94
+ """
95
+ from azure.storage.blob.aio import ContainerClient
96
+ container_url = f"{self.account_url}/{self.container_name}?{self.sas_token}"
97
+ deleted = 0
98
+ try:
99
+ async with ContainerClient.from_container_url(container_url) as container:
100
+ async for blob in container.list_blobs(name_starts_with=prefix):
101
+ await container.delete_blob(blob.name)
102
+ deleted += 1
103
+ logger.info(f"Deleted {deleted} blobs with prefix {prefix}")
104
+ except Exception as e:
105
+ logger.error(f"Failed to delete blobs with prefix {prefix}", error=str(e))
106
+ return deleted
107
+
108
 
109
  # Singleton instance
110
  blob_storage = AzureBlobStorage()
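A short sketch of how the two new blob helpers are meant to pair up over a document's Parquet files; the `parquet/<document_id>/...` naming is an assumption for illustration, the real names come from the parquet service.

    async def replace_parquet(blob_storage, document_id: str, sheets: dict[str, bytes]) -> None:
        prefix = f"parquet/{document_id}/"  # hypothetical deterministic prefix
        for sheet_name, parquet_bytes in sheets.items():
            # upload_bytes keeps the caller-chosen name instead of a random UUID
            await blob_storage.upload_bytes(parquet_bytes, f"{prefix}{sheet_name}.parquet")

    async def drop_parquet(blob_storage, document_id: str) -> int:
        # one call removes every sheet's Parquet file for the document
        return await blob_storage.delete_blobs_with_prefix(f"parquet/{document_id}/")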
src/tools/search.py CHANGED
@@ -34,10 +34,10 @@ async def search_documents(
34
 
35
  formatted_results = []
36
  for result in results:
37
- filename = result["metadata"].get("filename", "Unknown")
38
- page = result["metadata"].get("page_label")
39
  source_label = f"{filename}, p.{page}" if page else filename
40
- formatted_results.append(f"[Source: {source_label}]\n{result['content']}\n")
41
 
42
  return "\n".join(formatted_results)
43
 
 
34
 
35
  formatted_results = []
36
  for result in results:
37
+ filename = result.metadata.get("filename", "Unknown")
38
+ page = result.metadata.get("page_label")
39
  source_label = f"{filename}, p.{page}" if page else filename
40
+ formatted_results.append(f"[Source: {source_label}]\n{result.content}\n")
41
 
42
  return "\n".join(formatted_results)
43
 
src/utils/db_credential_encryption.py ADDED
@@ -0,0 +1,70 @@
1
+ """Fernet encryption utilities for user-registered database credentials.
2
+
3
+ The encryption key is sourced from the `dataeyond__db__credential__key` env variable,
4
+ intentionally separate from the user-auth bcrypt salt (`emarcal__bcrypt__salt`).
5
+
6
+ Usage:
7
+ from src.utils.db_credential_encryption import encrypt_credentials_dict, decrypt_credentials_dict
8
+
9
+ # Before INSERT:
10
+ safe_creds = encrypt_credentials_dict(raw_credentials)
11
+
12
+ # After SELECT:
13
+ plain_creds = decrypt_credentials_dict(row.credentials)
14
+ """
15
+
16
+ from cryptography.fernet import Fernet
17
+ from src.config.settings import settings
18
+
19
+ # Sensitive credential field names that must be encrypted at rest.
20
+ # Covers all supported DB types:
21
+ # - password : postgres, mysql, sqlserver, supabase, snowflake
22
+ # - service_account_json : bigquery
23
+ SENSITIVE_FIELDS: frozenset[str] = frozenset({"password", "service_account_json"})
24
+
25
+
26
+ def _get_cipher() -> Fernet:
27
+ key = settings.dataeyond_db_credential_key
28
+ if not key:
29
+ raise ValueError(
30
+ "dataeyond__db__credential__key is not set. "
31
+ "Generate one with: Fernet.generate_key().decode()"
32
+ )
33
+ return Fernet(key.encode())
34
+
35
+
36
+ def encrypt_credential(value: str) -> str:
37
+ """Encrypt a single credential string value."""
38
+ return _get_cipher().encrypt(value.encode()).decode()
39
+
40
+
41
+ def decrypt_credential(value: str) -> str:
42
+ """Decrypt a single Fernet-encrypted credential string."""
43
+ return _get_cipher().decrypt(value.encode()).decode()
44
+
45
+
46
+ def encrypt_credentials_dict(creds: dict) -> dict:
47
+ """Return a copy of the credentials dict with sensitive fields encrypted.
48
+
49
+ Call this before inserting a new DatabaseClient record.
50
+ """
51
+ cipher = _get_cipher()
52
+ result = dict(creds)
53
+ for field in SENSITIVE_FIELDS:
54
+ if result.get(field):
55
+ result[field] = cipher.encrypt(result[field].encode()).decode()
56
+ return result
57
+
58
+
59
+ def decrypt_credentials_dict(creds: dict) -> dict:
60
+ """Return a copy of the credentials dict with sensitive fields decrypted.
61
+
62
+ Call this after fetching a DatabaseClient record from DB.
63
+ """
64
+ cipher = _get_cipher()
65
+ result = dict(creds)
66
+ for field in SENSITIVE_FIELDS:
67
+ if result.get(field):
68
+ result[field] = cipher.decrypt(result[field].encode()).decode()
69
+ return result
70
+
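A round-trip sketch of the helpers above; the key is generated inline here for illustration, whereas the service reads it from settings.dataeyond_db_credential_key.

    from cryptography.fernet import Fernet

    key = Fernet.generate_key().decode()      # what the error message asks you to generate
    cipher = Fernet(key.encode())

    creds = {"host": "db.internal", "username": "readonly", "password": "s3cret"}

    stored = dict(creds)                       # mirrors encrypt_credentials_dict
    stored["password"] = cipher.encrypt(stored["password"].encode()).decode()
    # the row now carries an opaque Fernet token instead of the plain password

    restored = dict(stored)                    # mirrors decrypt_credentials_dict
    restored["password"] = cipher.decrypt(restored["password"].encode()).decode()
    assert restored == creds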
uv.lock CHANGED
@@ -1,5 +1,5 @@
1
  version = 1
2
- revision = 2
3
  requires-python = "==3.12.*"
4
  resolution-markers = [
5
  "python_full_version >= '3.12.4'",
@@ -39,6 +39,7 @@ dependencies = [
39
  { name = "orjson" },
40
  { name = "pandas" },
41
  { name = "passlib", extra = ["bcrypt"] },
 
42
  { name = "pgvector" },
43
  { name = "plotly" },
44
  { name = "presidio-analyzer" },
@@ -46,10 +47,15 @@ dependencies = [
46
  { name = "prometheus-client" },
47
  { name = "psycopg", extra = ["binary", "pool"] },
48
  { name = "psycopg2" },
 
49
  { name = "pydantic" },
50
  { name = "pydantic-settings" },
51
  { name = "pymongo" },
 
 
52
  { name = "pypdf" },
 
 
53
  { name = "python-docx" },
54
  { name = "python-dotenv" },
55
  { name = "python-multipart" },
@@ -57,8 +63,11 @@ dependencies = [
57
  { name = "redis" },
58
  { name = "sentence-transformers" },
59
  { name = "slowapi" },
 
60
  { name = "spacy" },
61
  { name = "sqlalchemy", extra = ["asyncio"] },
 
 
62
  { name = "sse-starlette" },
63
  { name = "starlette" },
64
  { name = "structlog" },
@@ -80,11 +89,8 @@ dev = [
80
 
81
  [package.dev-dependencies]
82
  dev = [
83
- { name = "mypy" },
84
- { name = "pre-commit" },
85
  { name = "pytest" },
86
  { name = "pytest-asyncio" },
87
- { name = "pytest-cov" },
88
  { name = "ruff" },
89
  ]
90
 
@@ -120,6 +126,7 @@ requires-dist = [
120
  { name = "orjson", specifier = "==3.10.12" },
121
  { name = "pandas", specifier = "==2.2.3" },
122
  { name = "passlib", extras = ["bcrypt"], specifier = "==1.7.4" },
 
123
  { name = "pgvector", specifier = "==0.3.6" },
124
  { name = "plotly", specifier = "==5.24.1" },
125
  { name = "pre-commit", marker = "extra == 'dev'", specifier = "==4.0.1" },
@@ -128,10 +135,15 @@ requires-dist = [
128
  { name = "prometheus-client", specifier = "==0.21.1" },
129
  { name = "psycopg", extras = ["binary", "pool"], specifier = "==3.2.3" },
130
  { name = "psycopg2", specifier = ">=2.9.11" },
 
131
  { name = "pydantic", specifier = "==2.10.3" },
132
  { name = "pydantic-settings", specifier = "==2.7.0" },
133
  { name = "pymongo", specifier = ">=4.14.0" },
 
 
134
  { name = "pypdf", specifier = "==5.1.0" },
 
 
135
  { name = "pytest", marker = "extra == 'dev'", specifier = "==8.3.4" },
136
  { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = "==0.24.0" },
137
  { name = "pytest-cov", marker = "extra == 'dev'", specifier = "==6.0.0" },
@@ -143,8 +155,11 @@ requires-dist = [
143
  { name = "ruff", marker = "extra == 'dev'", specifier = "==0.8.4" },
144
  { name = "sentence-transformers", specifier = "==3.3.1" },
145
  { name = "slowapi", specifier = "==0.1.9" },
 
146
  { name = "spacy", specifier = "==3.8.3" },
147
  { name = "sqlalchemy", extras = ["asyncio"], specifier = "==2.0.36" },
 
 
148
  { name = "sse-starlette", specifier = "==2.1.3" },
149
  { name = "starlette", specifier = "==0.41.3" },
150
  { name = "structlog", specifier = "==24.4.0" },
@@ -156,12 +171,9 @@ provides-extras = ["dev"]
156
 
157
  [package.metadata.requires-dev]
158
  dev = [
159
- { name = "mypy", specifier = "==1.13.0" },
160
- { name = "pre-commit", specifier = "==4.0.1" },
161
- { name = "pytest", specifier = "==8.3.4" },
162
- { name = "pytest-asyncio", specifier = "==0.24.0" },
163
- { name = "pytest-cov", specifier = "==6.0.0" },
164
- { name = "ruff", specifier = "==0.8.4" },
165
  ]
166
 
167
  [[package]]
@@ -280,6 +292,15 @@ wheels = [
280
  { url = "https://files.pythonhosted.org/packages/13/b5/7af0cb920a476dccd612fbc9a21a3745fb29b1fcd74636078db8f7ba294c/APScheduler-3.10.4-py3-none-any.whl", hash = "sha256:fb91e8a768632a4756a585f79ec834e0e27aad5860bac7eaa523d9ccefd87661", size = 59303, upload-time = "2023-08-19T16:44:56.814Z" },
281
  ]
282

283
  [[package]]
284
  name = "asyncpg"
285
  version = "0.30.0"
@@ -428,6 +449,34 @@ wheels = [
428
  { url = "https://files.pythonhosted.org/packages/20/07/fb43edc2ff0a6a367e4a94fc39eb3b85aa1e55e24cc857af2db145ce9f0d/blis-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:f20f7ad69aaffd1ce14fe77de557b6df9b61e0c9e582f75a843715d836b5c8af", size = 6192759, upload-time = "2025-11-17T12:27:56.176Z" },
429
  ]
430

431
  [[package]]
432
  name = "cachetools"
433
  version = "5.5.0"
@@ -941,6 +990,109 @@ wheels = [
941
  { url = "https://files.pythonhosted.org/packages/d5/1f/5f4a3cd9e4440e9d9bc78ad0a91a1c8d46b4d429d5239ebe6793c9fe5c41/fsspec-2026.3.0-py3-none-any.whl", hash = "sha256:d2ceafaad1b3457968ed14efa28798162f1638dbb5d2a6868a2db002a5ee39a4", size = 202595, upload-time = "2026-03-27T19:11:13.595Z" },
942
  ]
943

944
  [[package]]
945
  name = "greenlet"
946
  version = "3.3.2"
@@ -958,6 +1110,41 @@ wheels = [
958
  { url = "https://files.pythonhosted.org/packages/58/2e/fe7f36ff1982d6b10a60d5e0740c759259a7d6d2e1dc41da6d96de32fff6/greenlet-3.3.2-cp312-cp312-win_arm64.whl", hash = "sha256:d3a62fa76a32b462a97198e4c9e99afb9ab375115e74e9a83ce180e7a496f643", size = 230331, upload-time = "2026-02-20T20:17:23.34Z" },
959
  ]
960

961
  [[package]]
962
  name = "h11"
963
  version = "0.16.0"
@@ -1127,6 +1314,15 @@ wheels = [
1127
  { url = "https://files.pythonhosted.org/packages/67/8a/a342b2f0251f3dac4ca17618265d93bf244a2a4d089126e81e4c1056ac50/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bb00b6d26db67a05fe3e12c76edc75f32077fb51deed13822dc648fa373bc19", size = 343768, upload-time = "2026-02-02T12:37:55.055Z" },
1128
  ]
1129

1130
  [[package]]
1131
  name = "joblib"
1132
  version = "1.5.3"
@@ -1954,6 +2150,18 @@ bcrypt = [
1954
  { name = "bcrypt" },
1955
  ]
1956

1957
  [[package]]
1958
  name = "pgvector"
1959
  version = "0.3.6"
@@ -2121,6 +2329,33 @@ wheels = [
2121
  { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" },
2122
  ]
2123

2124
  [[package]]
2125
  name = "psycopg"
2126
  version = "3.2.3"
@@ -2181,6 +2416,42 @@ wheels = [
2181
  { url = "https://files.pythonhosted.org/packages/b5/bf/635fbe5dd10ed200afbbfbe98f8602829252ca1cce81cc48fb25ed8dadc0/psycopg2-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:e03e4a6dbe87ff81540b434f2e5dc2bddad10296db5eea7bdc995bf5f4162938", size = 2713969, upload-time = "2025-10-10T11:10:15.946Z" },
2182
  ]
2183

2184
  [[package]]
2185
  name = "pycparser"
2186
  version = "3.0"
@@ -2310,6 +2581,43 @@ wheels = [
2310
  { url = "https://files.pythonhosted.org/packages/60/4c/33f75713d50d5247f2258405142c0318ff32c6f8976171c4fcae87a9dbdf/pymongo-4.16.0-cp312-cp312-win_arm64.whl", hash = "sha256:dfc320f08ea9a7ec5b2403dc4e8150636f0d6150f4b9792faaae539c88e7db3b", size = 892971, upload-time = "2026-01-07T18:04:35.594Z" },
2311
  ]
2312

2313
  [[package]]
2314
  name = "pyparsing"
2315
  version = "3.3.2"
@@ -2328,6 +2636,28 @@ wheels = [
2328
  { url = "https://files.pythonhosted.org/packages/04/fc/6f52588ac1cb4400a7804ef88d0d4e00cfe57a7ac6793ec3b00de5a8758b/pypdf-5.1.0-py3-none-any.whl", hash = "sha256:3bd4f503f4ebc58bae40d81e81a9176c400cbbac2ba2d877367595fb524dfdfc", size = 297976, upload-time = "2024-10-27T19:46:44.439Z" },
2329
  ]
2330

2331
  [[package]]
2332
  name = "pytest"
2333
  version = "8.3.4"
@@ -2610,6 +2940,18 @@ wheels = [
2610
  { url = "https://files.pythonhosted.org/packages/13/9f/026e18ca7d7766783d779dae5e9c656746c6ede36ef73c6d934aaf4a6dec/ruff-0.8.4-py3-none-win_arm64.whl", hash = "sha256:9183dd615d8df50defa8b1d9a074053891ba39025cf5ae88e8bcb52edcc4bf08", size = 9074500, upload-time = "2024-12-19T13:36:23.92Z" },
2611
  ]
2612

2613
  [[package]]
2614
  name = "safetensors"
2615
  version = "0.7.0"
@@ -2764,6 +3106,60 @@ wheels = [
2764
  { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
2765
  ]
2766

2767
  [[package]]
2768
  name = "spacy"
2769
  version = "3.8.3"
@@ -2842,6 +3238,31 @@ asyncio = [
2842
  { name = "greenlet" },
2843
  ]
2844

2845
  [[package]]
2846
  name = "srsly"
2847
  version = "2.5.3"
@@ -3015,6 +3436,15 @@ wheels = [
3015
  { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
3016
  ]
3017

3018
  [[package]]
3019
  name = "torch"
3020
  version = "2.11.0"
 
1
  version = 1
2
+ revision = 3
3
  requires-python = "==3.12.*"
4
  resolution-markers = [
5
  "python_full_version >= '3.12.4'",
 
39
  { name = "orjson" },
40
  { name = "pandas" },
41
  { name = "passlib", extra = ["bcrypt"] },
42
+ { name = "pdf2image" },
43
  { name = "pgvector" },
44
  { name = "plotly" },
45
  { name = "presidio-analyzer" },
 
47
  { name = "prometheus-client" },
48
  { name = "psycopg", extra = ["binary", "pool"] },
49
  { name = "psycopg2" },
50
+ { name = "pyarrow" },
51
  { name = "pydantic" },
52
  { name = "pydantic-settings" },
53
  { name = "pymongo" },
54
+ { name = "pymssql" },
55
+ { name = "pymysql" },
56
  { name = "pypdf" },
57
+ { name = "pypdf2" },
58
+ { name = "pytesseract" },
59
  { name = "python-docx" },
60
  { name = "python-dotenv" },
61
  { name = "python-multipart" },
 
63
  { name = "redis" },
64
  { name = "sentence-transformers" },
65
  { name = "slowapi" },
66
+ { name = "snowflake-sqlalchemy" },
67
  { name = "spacy" },
68
  { name = "sqlalchemy", extra = ["asyncio"] },
69
+ { name = "sqlalchemy-bigquery" },
70
+ { name = "sqlglot" },
71
  { name = "sse-starlette" },
72
  { name = "starlette" },
73
  { name = "structlog" },
 
89
 
90
  [package.dev-dependencies]
91
  dev = [
 
 
92
  { name = "pytest" },
93
  { name = "pytest-asyncio" },
 
94
  { name = "ruff" },
95
  ]
96
 
 
126
  { name = "orjson", specifier = "==3.10.12" },
127
  { name = "pandas", specifier = "==2.2.3" },
128
  { name = "passlib", extras = ["bcrypt"], specifier = "==1.7.4" },
129
+ { name = "pdf2image", specifier = ">=1.17.0" },
130
  { name = "pgvector", specifier = "==0.3.6" },
131
  { name = "plotly", specifier = "==5.24.1" },
132
  { name = "pre-commit", marker = "extra == 'dev'", specifier = "==4.0.1" },
 
135
  { name = "prometheus-client", specifier = "==0.21.1" },
136
  { name = "psycopg", extras = ["binary", "pool"], specifier = "==3.2.3" },
137
  { name = "psycopg2", specifier = ">=2.9.11" },
138
+ { name = "pyarrow", specifier = ">=24.0.0" },
139
  { name = "pydantic", specifier = "==2.10.3" },
140
  { name = "pydantic-settings", specifier = "==2.7.0" },
141
  { name = "pymongo", specifier = ">=4.14.0" },
142
+ { name = "pymssql", specifier = ">=2.3.0" },
143
+ { name = "pymysql", specifier = ">=1.1.1" },
144
  { name = "pypdf", specifier = "==5.1.0" },
145
+ { name = "pypdf2", specifier = ">=3.0.1" },
146
+ { name = "pytesseract", specifier = ">=0.3.13" },
147
  { name = "pytest", marker = "extra == 'dev'", specifier = "==8.3.4" },
148
  { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = "==0.24.0" },
149
  { name = "pytest-cov", marker = "extra == 'dev'", specifier = "==6.0.0" },
 
155
  { name = "ruff", marker = "extra == 'dev'", specifier = "==0.8.4" },
156
  { name = "sentence-transformers", specifier = "==3.3.1" },
157
  { name = "slowapi", specifier = "==0.1.9" },
158
+ { name = "snowflake-sqlalchemy", specifier = ">=1.7.0" },
159
  { name = "spacy", specifier = "==3.8.3" },
160
  { name = "sqlalchemy", extras = ["asyncio"], specifier = "==2.0.36" },
161
+ { name = "sqlalchemy-bigquery", specifier = ">=1.11.0" },
162
+ { name = "sqlglot", specifier = ">=25.0.0" },
163
  { name = "sse-starlette", specifier = "==2.1.3" },
164
  { name = "starlette", specifier = "==0.41.3" },
165
  { name = "structlog", specifier = "==24.4.0" },
 
171
 
172
  [package.metadata.requires-dev]
173
  dev = [
174
+ { name = "pytest", specifier = ">=8.3.4" },
175
+ { name = "pytest-asyncio", specifier = ">=0.24.0" },
176
+ { name = "ruff", specifier = ">=0.8.4" },
 
 
 
177
  ]
178
 
179
  [[package]]
 
292
  { url = "https://files.pythonhosted.org/packages/13/b5/7af0cb920a476dccd612fbc9a21a3745fb29b1fcd74636078db8f7ba294c/APScheduler-3.10.4-py3-none-any.whl", hash = "sha256:fb91e8a768632a4756a585f79ec834e0e27aad5860bac7eaa523d9ccefd87661", size = 59303, upload-time = "2023-08-19T16:44:56.814Z" },
293
  ]
294
 
295
+ [[package]]
296
+ name = "asn1crypto"
297
+ version = "1.5.1"
298
+ source = { registry = "https://pypi.org/simple" }
299
+ sdist = { url = "https://files.pythonhosted.org/packages/de/cf/d547feed25b5244fcb9392e288ff9fdc3280b10260362fc45d37a798a6ee/asn1crypto-1.5.1.tar.gz", hash = "sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c", size = 121080, upload-time = "2022-03-15T14:46:52.889Z" }
300
+ wheels = [
301
+ { url = "https://files.pythonhosted.org/packages/c9/7f/09065fd9e27da0eda08b4d6897f1c13535066174cc023af248fc2a8d5e5a/asn1crypto-1.5.1-py2.py3-none-any.whl", hash = "sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67", size = 105045, upload-time = "2022-03-15T14:46:51.055Z" },
302
+ ]
303
+
304
  [[package]]
305
  name = "asyncpg"
306
  version = "0.30.0"
 
449
  { url = "https://files.pythonhosted.org/packages/20/07/fb43edc2ff0a6a367e4a94fc39eb3b85aa1e55e24cc857af2db145ce9f0d/blis-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:f20f7ad69aaffd1ce14fe77de557b6df9b61e0c9e582f75a843715d836b5c8af", size = 6192759, upload-time = "2025-11-17T12:27:56.176Z" },
450
  ]
451
 
452
+ [[package]]
453
+ name = "boto3"
454
+ version = "1.42.89"
455
+ source = { registry = "https://pypi.org/simple" }
456
+ dependencies = [
457
+ { name = "botocore" },
458
+ { name = "jmespath" },
459
+ { name = "s3transfer" },
460
+ ]
461
+ sdist = { url = "https://files.pythonhosted.org/packages/bb/0c/f7bccb22b245cabf392816baba20f9e95f78ace7dbc580fd40136e80e732/boto3-1.42.89.tar.gz", hash = "sha256:3e43aacc0801bba9bcd23a8c271c089af297a69565f783fcdd357ae0e330bf1e", size = 113165, upload-time = "2026-04-13T19:36:17.516Z" }
462
+ wheels = [
463
+ { url = "https://files.pythonhosted.org/packages/b9/33/55103ba5ef9975ea54b8d39e69b76eb6e9fded3beae5f01065e26951a3a1/boto3-1.42.89-py3-none-any.whl", hash = "sha256:6204b189f4d0c655535f43d7eaa57ff4e8d965b8463c97e45952291211162932", size = 140556, upload-time = "2026-04-13T19:36:13.894Z" },
464
+ ]
465
+
466
+ [[package]]
467
+ name = "botocore"
468
+ version = "1.42.89"
469
+ source = { registry = "https://pypi.org/simple" }
470
+ dependencies = [
471
+ { name = "jmespath" },
472
+ { name = "python-dateutil" },
473
+ { name = "urllib3" },
474
+ ]
475
+ sdist = { url = "https://files.pythonhosted.org/packages/0f/cc/e6be943efa9051bd15c2ee14077c2b10d6e27c9e9385fc43a03a5c4ed8b5/botocore-1.42.89.tar.gz", hash = "sha256:95ac52f472dad29942f3088b278ab493044516c16dbf9133c975af16527baa99", size = 15206290, upload-time = "2026-04-13T19:36:02.321Z" }
476
+ wheels = [
477
+ { url = "https://files.pythonhosted.org/packages/91/f1/90a7b8eda38b7c3a65ca7ee0075bdf310b6b471cb1b95fab6e8994323a50/botocore-1.42.89-py3-none-any.whl", hash = "sha256:d9b786c8d9db6473063b4cc5be0ba7e6a381082307bd6afb69d4216f9fa95f35", size = 14887287, upload-time = "2026-04-13T19:35:56.677Z" },
478
+ ]
479
+
480
  [[package]]
481
  name = "cachetools"
482
  version = "5.5.0"
 
990
  { url = "https://files.pythonhosted.org/packages/d5/1f/5f4a3cd9e4440e9d9bc78ad0a91a1c8d46b4d429d5239ebe6793c9fe5c41/fsspec-2026.3.0-py3-none-any.whl", hash = "sha256:d2ceafaad1b3457968ed14efa28798162f1638dbb5d2a6868a2db002a5ee39a4", size = 202595, upload-time = "2026-03-27T19:11:13.595Z" },
991
  ]
992
 
993
+ [[package]]
994
+ name = "google-api-core"
995
+ version = "2.30.3"
996
+ source = { registry = "https://pypi.org/simple" }
997
+ dependencies = [
998
+ { name = "google-auth" },
999
+ { name = "googleapis-common-protos" },
1000
+ { name = "proto-plus" },
1001
+ { name = "protobuf" },
1002
+ { name = "requests" },
1003
+ ]
1004
+ sdist = { url = "https://files.pythonhosted.org/packages/16/ce/502a57fb0ec752026d24df1280b162294b22a0afb98a326084f9a979138b/google_api_core-2.30.3.tar.gz", hash = "sha256:e601a37f148585319b26db36e219df68c5d07b6382cff2d580e83404e44d641b", size = 177001, upload-time = "2026-04-10T00:41:28.035Z" }
1005
+ wheels = [
1006
+ { url = "https://files.pythonhosted.org/packages/03/15/e56f351cf6ef1cfea58e6ac226a7318ed1deb2218c4b3cc9bd9e4b786c5a/google_api_core-2.30.3-py3-none-any.whl", hash = "sha256:a85761ba72c444dad5d611c2220633480b2b6be2521eca69cca2dbb3ffd6bfe8", size = 173274, upload-time = "2026-04-09T22:57:16.198Z" },
1007
+ ]
1008
+
1009
+ [package.optional-dependencies]
1010
+ grpc = [
1011
+ { name = "grpcio" },
1012
+ { name = "grpcio-status" },
1013
+ ]
1014
+
1015
+ [[package]]
1016
+ name = "google-auth"
1017
+ version = "2.49.2"
1018
+ source = { registry = "https://pypi.org/simple" }
1019
+ dependencies = [
1020
+ { name = "cryptography" },
1021
+ { name = "pyasn1-modules" },
1022
+ ]
1023
+ sdist = { url = "https://files.pythonhosted.org/packages/c6/fc/e925290a1ad95c975c459e2df070fac2b90954e13a0370ac505dff78cb99/google_auth-2.49.2.tar.gz", hash = "sha256:c1ae38500e73065dcae57355adb6278cf8b5c8e391994ae9cbadbcb9631ab409", size = 333958, upload-time = "2026-04-10T00:41:21.888Z" }
1024
+ wheels = [
1025
+ { url = "https://files.pythonhosted.org/packages/73/76/d241a5c927433420507215df6cac1b1fa4ac0ba7a794df42a84326c68da8/google_auth-2.49.2-py3-none-any.whl", hash = "sha256:c2720924dfc82dedb962c9f52cabb2ab16714fd0a6a707e40561d217574ed6d5", size = 240638, upload-time = "2026-04-10T00:41:14.501Z" },
1026
+ ]
1027
+
1028
+ [[package]]
1029
+ name = "google-cloud-bigquery"
1030
+ version = "3.41.0"
1031
+ source = { registry = "https://pypi.org/simple" }
1032
+ dependencies = [
1033
+ { name = "google-api-core", extra = ["grpc"] },
1034
+ { name = "google-auth" },
1035
+ { name = "google-cloud-core" },
1036
+ { name = "google-resumable-media" },
1037
+ { name = "packaging" },
1038
+ { name = "python-dateutil" },
1039
+ { name = "requests" },
1040
+ ]
1041
+ sdist = { url = "https://files.pythonhosted.org/packages/ce/13/6515c7aab55a4a0cf708ffd309fb9af5bab54c13e32dc22c5acd6497193c/google_cloud_bigquery-3.41.0.tar.gz", hash = "sha256:2217e488b47ed576360c9b2cc07d59d883a54b83167c0ef37f915c26b01a06fe", size = 513434, upload-time = "2026-03-30T22:50:55.347Z" }
1042
+ wheels = [
1043
+ { url = "https://files.pythonhosted.org/packages/40/33/1d3902efadef9194566d499d61507e1f038454e0b55499d2d7f8ab2a4fee/google_cloud_bigquery-3.41.0-py3-none-any.whl", hash = "sha256:2a5b5a737b401cbd824a6e5eac7554100b878668d908e6548836b5d8aaa4dcaa", size = 262343, upload-time = "2026-03-30T22:48:45.444Z" },
1044
+ ]
1045
+
1046
+ [[package]]
1047
+ name = "google-cloud-core"
1048
+ version = "2.5.1"
1049
+ source = { registry = "https://pypi.org/simple" }
1050
+ dependencies = [
1051
+ { name = "google-api-core" },
1052
+ { name = "google-auth" },
1053
+ ]
1054
+ sdist = { url = "https://files.pythonhosted.org/packages/dc/24/6ca08b0a03c7b0c620427503ab00353a4ae806b848b93bcea18b6b76fde6/google_cloud_core-2.5.1.tar.gz", hash = "sha256:3dc94bdec9d05a31d9f355045ed0f369fbc0d8c665076c734f065d729800f811", size = 36078, upload-time = "2026-03-30T22:50:08.057Z" }
1055
+ wheels = [
1056
+ { url = "https://files.pythonhosted.org/packages/73/d9/5bb050cb32826466aa9b25f79e2ca2879fe66cb76782d4ed798dd7506151/google_cloud_core-2.5.1-py3-none-any.whl", hash = "sha256:ea62cdf502c20e3e14be8a32c05ed02113d7bef454e40ff3fab6fe1ec9f1f4e7", size = 29452, upload-time = "2026-03-30T22:48:31.567Z" },
1057
+ ]
1058
+
1059
+ [[package]]
1060
+ name = "google-crc32c"
1061
+ version = "1.8.0"
1062
+ source = { registry = "https://pypi.org/simple" }
1063
+ sdist = { url = "https://files.pythonhosted.org/packages/03/41/4b9c02f99e4c5fb477122cd5437403b552873f014616ac1d19ac8221a58d/google_crc32c-1.8.0.tar.gz", hash = "sha256:a428e25fb7691024de47fecfbff7ff957214da51eddded0da0ae0e0f03a2cf79", size = 14192, upload-time = "2025-12-16T00:35:25.142Z" }
1064
+ wheels = [
1065
+ { url = "https://files.pythonhosted.org/packages/e9/5f/7307325b1198b59324c0fa9807cafb551afb65e831699f2ce211ad5c8240/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:4b8286b659c1335172e39563ab0a768b8015e88e08329fa5321f774275fc3113", size = 31300, upload-time = "2025-12-16T00:21:56.723Z" },
1066
+ { url = "https://files.pythonhosted.org/packages/21/8e/58c0d5d86e2220e6a37befe7e6a94dd2f6006044b1a33edf1ff6d9f7e319/google_crc32c-1.8.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:2a3dc3318507de089c5384cc74d54318401410f82aa65b2d9cdde9d297aca7cb", size = 30867, upload-time = "2025-12-16T00:38:31.302Z" },
1067
+ { url = "https://files.pythonhosted.org/packages/ce/a9/a780cc66f86335a6019f557a8aaca8fbb970728f0efd2430d15ff1beae0e/google_crc32c-1.8.0-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:14f87e04d613dfa218d6135e81b78272c3b904e2a7053b841481b38a7d901411", size = 33364, upload-time = "2025-12-16T00:40:22.96Z" },
1068
+ { url = "https://files.pythonhosted.org/packages/21/3f/3457ea803db0198c9aaca2dd373750972ce28a26f00544b6b85088811939/google_crc32c-1.8.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:cb5c869c2923d56cb0c8e6bcdd73c009c36ae39b652dbe46a05eb4ef0ad01454", size = 33740, upload-time = "2025-12-16T00:40:23.96Z" },
1069
+ { url = "https://files.pythonhosted.org/packages/df/c0/87c2073e0c72515bb8733d4eef7b21548e8d189f094b5dad20b0ecaf64f6/google_crc32c-1.8.0-cp312-cp312-win_amd64.whl", hash = "sha256:3cc0c8912038065eafa603b238abf252e204accab2a704c63b9e14837a854962", size = 34437, upload-time = "2025-12-16T00:35:21.395Z" },
1070
+ ]
1071
+
1072
+ [[package]]
1073
+ name = "google-resumable-media"
1074
+ version = "2.8.2"
1075
+ source = { registry = "https://pypi.org/simple" }
1076
+ dependencies = [
1077
+ { name = "google-crc32c" },
1078
+ ]
1079
+ sdist = { url = "https://files.pythonhosted.org/packages/3f/d1/b1ea14b93b6b78f57fc580125de44e9f593ab88dd2460f1a8a8d18f74754/google_resumable_media-2.8.2.tar.gz", hash = "sha256:f3354a182ebd193ae3f42e3ef95e6c9b10f128320de23ac7637236713b1acd70", size = 2164510, upload-time = "2026-03-30T23:34:25.369Z" }
1080
+ wheels = [
1081
+ { url = "https://files.pythonhosted.org/packages/5e/f8/50bfaf4658431ff9de45c5c3935af7ab01157a4903c603cd0eee6e78e087/google_resumable_media-2.8.2-py3-none-any.whl", hash = "sha256:82b6d8ccd11765268cdd2a2123f417ec806b8eef3000a9a38dfe3033da5fb220", size = 81511, upload-time = "2026-03-30T23:34:09.671Z" },
1082
+ ]
1083
+
1084
+ [[package]]
1085
+ name = "googleapis-common-protos"
1086
+ version = "1.74.0"
1087
+ source = { registry = "https://pypi.org/simple" }
1088
+ dependencies = [
1089
+ { name = "protobuf" },
1090
+ ]
1091
+ sdist = { url = "https://files.pythonhosted.org/packages/20/18/a746c8344152d368a5aac738d4c857012f2c5d1fd2eac7e17b647a7861bd/googleapis_common_protos-1.74.0.tar.gz", hash = "sha256:57971e4eeeba6aad1163c1f0fc88543f965bb49129b8bb55b2b7b26ecab084f1", size = 151254, upload-time = "2026-04-02T21:23:26.679Z" }
1092
+ wheels = [
1093
+ { url = "https://files.pythonhosted.org/packages/b6/b0/be5d3329badb9230b765de6eea66b73abd5944bdeb5afb3562ddcd80ae84/googleapis_common_protos-1.74.0-py3-none-any.whl", hash = "sha256:702216f78610bb510e3f12ac3cafd281b7ac45cc5d86e90ad87e4d301a3426b5", size = 300743, upload-time = "2026-04-02T21:22:49.108Z" },
1094
+ ]
1095
+
1096
  [[package]]
1097
  name = "greenlet"
1098
  version = "3.3.2"
 
1110
  { url = "https://files.pythonhosted.org/packages/58/2e/fe7f36ff1982d6b10a60d5e0740c759259a7d6d2e1dc41da6d96de32fff6/greenlet-3.3.2-cp312-cp312-win_arm64.whl", hash = "sha256:d3a62fa76a32b462a97198e4c9e99afb9ab375115e74e9a83ce180e7a496f643", size = 230331, upload-time = "2026-02-20T20:17:23.34Z" },
1111
  ]
1112
 
1113
+ [[package]]
1114
+ name = "grpcio"
1115
+ version = "1.80.0"
1116
+ source = { registry = "https://pypi.org/simple" }
1117
+ dependencies = [
1118
+ { name = "typing-extensions" },
1119
+ ]
1120
+ sdist = { url = "https://files.pythonhosted.org/packages/b7/48/af6173dbca4454f4637a4678b67f52ca7e0c1ed7d5894d89d434fecede05/grpcio-1.80.0.tar.gz", hash = "sha256:29aca15edd0688c22ba01d7cc01cb000d72b2033f4a3c72a81a19b56fd143257", size = 12978905, upload-time = "2026-03-30T08:49:10.502Z" }
1121
+ wheels = [
1122
+ { url = "https://files.pythonhosted.org/packages/5c/e8/a2b749265eb3415abc94f2e619bbd9e9707bebdda787e61c593004ec927a/grpcio-1.80.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:c624cc9f1008361014378c9d776de7182b11fe8b2e5a81bc69f23a295f2a1ad0", size = 6015616, upload-time = "2026-03-30T08:47:13.428Z" },
1123
+ { url = "https://files.pythonhosted.org/packages/3e/97/b1282161a15d699d1e90c360df18d19165a045ce1c343c7f313f5e8a0b77/grpcio-1.80.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:f49eddcac43c3bf350c0385366a58f36bed8cc2c0ec35ef7b74b49e56552c0c2", size = 12014204, upload-time = "2026-03-30T08:47:15.873Z" },
1124
+ { url = "https://files.pythonhosted.org/packages/6e/5e/d319c6e997b50c155ac5a8cb12f5173d5b42677510e886d250d50264949d/grpcio-1.80.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d334591df610ab94714048e0d5b4f3dd5ad1bee74dfec11eee344220077a79de", size = 6563866, upload-time = "2026-03-30T08:47:18.588Z" },
1125
+ { url = "https://files.pythonhosted.org/packages/ae/f6/fdd975a2cb4d78eb67769a7b3b3830970bfa2e919f1decf724ae4445f42c/grpcio-1.80.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:0cb517eb1d0d0aaf1d87af7cc5b801d686557c1d88b2619f5e31fab3c2315921", size = 7273060, upload-time = "2026-03-30T08:47:21.113Z" },
1126
+ { url = "https://files.pythonhosted.org/packages/db/f0/a3deb5feba60d9538a962913e37bd2e69a195f1c3376a3dd44fe0427e996/grpcio-1.80.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4e78c4ac0d97dc2e569b2f4bcbbb447491167cb358d1a389fc4af71ab6f70411", size = 6782121, upload-time = "2026-03-30T08:47:23.827Z" },
1127
+ { url = "https://files.pythonhosted.org/packages/ca/84/36c6dcfddc093e108141f757c407902a05085e0c328007cb090d56646cdf/grpcio-1.80.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2ed770b4c06984f3b47eb0517b1c69ad0b84ef3f40128f51448433be904634cd", size = 7383811, upload-time = "2026-03-30T08:47:26.517Z" },
1128
+ { url = "https://files.pythonhosted.org/packages/7c/ef/f3a77e3dc5b471a0ec86c564c98d6adfa3510d38f8ee99010410858d591e/grpcio-1.80.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:256507e2f524092f1473071a05e65a5b10d84b82e3ff24c5b571513cfaa61e2f", size = 8393860, upload-time = "2026-03-30T08:47:29.439Z" },
1129
+ { url = "https://files.pythonhosted.org/packages/9b/8d/9d4d27ed7f33d109c50d6b5ce578a9914aa68edab75d65869a17e630a8d1/grpcio-1.80.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9a6284a5d907c37db53350645567c522be314bac859a64a7a5ca63b77bb7958f", size = 7830132, upload-time = "2026-03-30T08:47:33.254Z" },
1130
+ { url = "https://files.pythonhosted.org/packages/14/e4/9990b41c6d7a44e1e9dee8ac11d7a9802ba1378b40d77468a7761d1ad288/grpcio-1.80.0-cp312-cp312-win32.whl", hash = "sha256:c71309cfce2f22be26aa4a847357c502db6c621f1a49825ae98aa0907595b193", size = 4140904, upload-time = "2026-03-30T08:47:35.319Z" },
1131
+ { url = "https://files.pythonhosted.org/packages/2f/2c/296f6138caca1f4b92a31ace4ae1b87dab692fc16a7a3417af3bb3c805bf/grpcio-1.80.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe648599c0e37594c4809d81a9e77bd138cc82eb8baa71b6a86af65426723ff", size = 4880944, upload-time = "2026-03-30T08:47:37.831Z" },
1132
+ ]
1133
+
1134
+ [[package]]
1135
+ name = "grpcio-status"
1136
+ version = "1.80.0"
1137
+ source = { registry = "https://pypi.org/simple" }
1138
+ dependencies = [
1139
+ { name = "googleapis-common-protos" },
1140
+ { name = "grpcio" },
1141
+ { name = "protobuf" },
1142
+ ]
1143
+ sdist = { url = "https://files.pythonhosted.org/packages/b1/ed/105f619bdd00cb47a49aa2feea6232ea2bbb04199d52a22cc6a7d603b5cb/grpcio_status-1.80.0.tar.gz", hash = "sha256:df73802a4c89a3ea88aa2aff971e886fccce162bc2e6511408b3d67a144381cd", size = 13901, upload-time = "2026-03-30T08:54:34.784Z" }
1144
+ wheels = [
1145
+ { url = "https://files.pythonhosted.org/packages/76/80/58cd2dfc19a07d022abe44bde7c365627f6c7cb6f692ada6c65ca437d09a/grpcio_status-1.80.0-py3-none-any.whl", hash = "sha256:4b56990363af50dbf2c2ebb80f1967185c07d87aa25aa2bea45ddb75fc181dbe", size = 14638, upload-time = "2026-03-30T08:54:01.569Z" },
1146
+ ]
1147
+
1148
  [[package]]
1149
  name = "h11"
1150
  version = "0.16.0"
 
1314
  { url = "https://files.pythonhosted.org/packages/67/8a/a342b2f0251f3dac4ca17618265d93bf244a2a4d089126e81e4c1056ac50/jiter-0.13.0-graalpy312-graalpy250_312_native-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7bb00b6d26db67a05fe3e12c76edc75f32077fb51deed13822dc648fa373bc19", size = 343768, upload-time = "2026-02-02T12:37:55.055Z" },
1315
  ]
1316
 
1317
+ [[package]]
1318
+ name = "jmespath"
1319
+ version = "1.1.0"
1320
+ source = { registry = "https://pypi.org/simple" }
1321
+ sdist = { url = "https://files.pythonhosted.org/packages/d3/59/322338183ecda247fb5d1763a6cbe46eff7222eaeebafd9fa65d4bf5cb11/jmespath-1.1.0.tar.gz", hash = "sha256:472c87d80f36026ae83c6ddd0f1d05d4e510134ed462851fd5f754c8c3cbb88d", size = 27377, upload-time = "2026-01-22T16:35:26.279Z" }
1322
+ wheels = [
1323
+ { url = "https://files.pythonhosted.org/packages/14/2f/967ba146e6d58cf6a652da73885f52fc68001525b4197effc174321d70b4/jmespath-1.1.0-py3-none-any.whl", hash = "sha256:a5663118de4908c91729bea0acadca56526eb2698e83de10cd116ae0f4e97c64", size = 20419, upload-time = "2026-01-22T16:35:24.919Z" },
1324
+ ]
1325
+
1326
  [[package]]
1327
  name = "joblib"
1328
  version = "1.5.3"
 
2150
  { name = "bcrypt" },
2151
  ]
2152
 
2153
+ [[package]]
2154
+ name = "pdf2image"
2155
+ version = "1.17.0"
2156
+ source = { registry = "https://pypi.org/simple" }
2157
+ dependencies = [
2158
+ { name = "pillow" },
2159
+ ]
2160
+ sdist = { url = "https://files.pythonhosted.org/packages/00/d8/b280f01045555dc257b8153c00dee3bc75830f91a744cd5f84ef3a0a64b1/pdf2image-1.17.0.tar.gz", hash = "sha256:eaa959bc116b420dd7ec415fcae49b98100dda3dd18cd2fdfa86d09f112f6d57", size = 12811, upload-time = "2024-01-07T20:33:01.965Z" }
2161
+ wheels = [
2162
+ { url = "https://files.pythonhosted.org/packages/62/33/61766ae033518957f877ab246f87ca30a85b778ebaad65b7f74fa7e52988/pdf2image-1.17.0-py3-none-any.whl", hash = "sha256:ecdd58d7afb810dffe21ef2b1bbc057ef434dabbac6c33778a38a3f7744a27e2", size = 11618, upload-time = "2024-01-07T20:32:59.957Z" },
2163
+ ]
2164
+
2165
  [[package]]
2166
  name = "pgvector"
2167
  version = "0.3.6"
 
2329
  { url = "https://files.pythonhosted.org/packages/5b/5a/bc7b4a4ef808fa59a816c17b20c4bef6884daebbdf627ff2a161da67da19/propcache-0.4.1-py3-none-any.whl", hash = "sha256:af2a6052aeb6cf17d3e46ee169099044fd8224cbaf75c76a2ef596e8163e2237", size = 13305, upload-time = "2025-10-08T19:49:00.792Z" },
2330
  ]
2331
 
2332
+ [[package]]
2333
+ name = "proto-plus"
2334
+ version = "1.27.2"
2335
+ source = { registry = "https://pypi.org/simple" }
2336
+ dependencies = [
2337
+ { name = "protobuf" },
2338
+ ]
2339
+ sdist = { url = "https://files.pythonhosted.org/packages/81/0d/94dfe80193e79d55258345901acd2917523d56e8381bc4dee7fd38e3868a/proto_plus-1.27.2.tar.gz", hash = "sha256:b2adde53adadf75737c44d3dcb0104fde65250dfc83ad59168b4aa3e574b6a24", size = 57204, upload-time = "2026-03-26T22:18:57.174Z" }
2340
+ wheels = [
2341
+ { url = "https://files.pythonhosted.org/packages/84/f3/1fba73eeffafc998a25d59703b63f8be4fe8a5cb12eaff7386a0ba0f7125/proto_plus-1.27.2-py3-none-any.whl", hash = "sha256:6432f75893d3b9e70b9c412f1d2f03f65b11fb164b793d14ae2ca01821d22718", size = 50450, upload-time = "2026-03-26T22:13:42.927Z" },
2342
+ ]
2343
+
2344
+ [[package]]
2345
+ name = "protobuf"
2346
+ version = "6.33.6"
2347
+ source = { registry = "https://pypi.org/simple" }
2348
+ sdist = { url = "https://files.pythonhosted.org/packages/66/70/e908e9c5e52ef7c3a6c7902c9dfbb34c7e29c25d2f81ade3856445fd5c94/protobuf-6.33.6.tar.gz", hash = "sha256:a6768d25248312c297558af96a9f9c929e8c4cee0659cb07e780731095f38135", size = 444531, upload-time = "2026-03-18T19:05:00.988Z" }
2349
+ wheels = [
2350
+ { url = "https://files.pythonhosted.org/packages/fc/9f/2f509339e89cfa6f6a4c4ff50438db9ca488dec341f7e454adad60150b00/protobuf-6.33.6-cp310-abi3-win32.whl", hash = "sha256:7d29d9b65f8afef196f8334e80d6bc1d5d4adedb449971fefd3723824e6e77d3", size = 425739, upload-time = "2026-03-18T19:04:48.373Z" },
2351
+ { url = "https://files.pythonhosted.org/packages/76/5d/683efcd4798e0030c1bab27374fd13a89f7c2515fb1f3123efdfaa5eab57/protobuf-6.33.6-cp310-abi3-win_amd64.whl", hash = "sha256:0cd27b587afca21b7cfa59a74dcbd48a50f0a6400cfb59391340ad729d91d326", size = 437089, upload-time = "2026-03-18T19:04:50.381Z" },
2352
+ { url = "https://files.pythonhosted.org/packages/5c/01/a3c3ed5cd186f39e7880f8303cc51385a198a81469d53d0fdecf1f64d929/protobuf-6.33.6-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:9720e6961b251bde64edfdab7d500725a2af5280f3f4c87e57c0208376aa8c3a", size = 427737, upload-time = "2026-03-18T19:04:51.866Z" },
+ { url = "https://files.pythonhosted.org/packages/ee/90/b3c01fdec7d2f627b3a6884243ba328c1217ed2d978def5c12dc50d328a3/protobuf-6.33.6-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:e2afbae9b8e1825e3529f88d514754e094278bb95eadc0e199751cdd9a2e82a2", size = 324610, upload-time = "2026-03-18T19:04:53.096Z" },
+ { url = "https://files.pythonhosted.org/packages/9b/ca/25afc144934014700c52e05103c2421997482d561f3101ff352e1292fb81/protobuf-6.33.6-cp39-abi3-manylinux2014_s390x.whl", hash = "sha256:c96c37eec15086b79762ed265d59ab204dabc53056e3443e702d2681f4b39ce3", size = 339381, upload-time = "2026-03-18T19:04:54.616Z" },
+ { url = "https://files.pythonhosted.org/packages/16/92/d1e32e3e0d894fe00b15ce28ad4944ab692713f2e7f0a99787405e43533a/protobuf-6.33.6-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:e9db7e292e0ab79dd108d7f1a94fe31601ce1ee3f7b79e0692043423020b0593", size = 323436, upload-time = "2026-03-18T19:04:55.768Z" },
+ { url = "https://files.pythonhosted.org/packages/c4/72/02445137af02769918a93807b2b7890047c32bfb9f90371cbc12688819eb/protobuf-6.33.6-py3-none-any.whl", hash = "sha256:77179e006c476e69bf8e8ce866640091ec42e1beb80b213c3900006ecfba6901", size = 170656, upload-time = "2026-03-18T19:04:59.826Z" },
+ ]
+
  [[package]]
  name = "psycopg"
  version = "3.2.3"
 
  { url = "https://files.pythonhosted.org/packages/b5/bf/635fbe5dd10ed200afbbfbe98f8602829252ca1cce81cc48fb25ed8dadc0/psycopg2-2.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:e03e4a6dbe87ff81540b434f2e5dc2bddad10296db5eea7bdc995bf5f4162938", size = 2713969, upload-time = "2025-10-10T11:10:15.946Z" },
  ]
 
+ [[package]]
+ name = "pyarrow"
+ version = "24.0.0"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/91/13/13e1069b351bdc3881266e11147ffccf687505dbb0ea74036237f5d454a5/pyarrow-24.0.0.tar.gz", hash = "sha256:85fe721a14dd823aca09127acbb06c3ca723efbd436c004f16bca601b04dcc83", size = 1180261, upload-time = "2026-04-21T10:51:25.837Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/b4/a9/9686d9f07837f91f775e8932659192e02c74f9d8920524b480b85212cc68/pyarrow-24.0.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:6233c9ed9ab9d1db47de57d9753256d9dcffbf42db341576099f0fd9f6bf4810", size = 34981559, upload-time = "2026-04-21T10:47:22.17Z" },
+ { url = "https://files.pythonhosted.org/packages/80/b6/0ddf0e9b6ead3474ab087ae598c76b031fc45532bf6a63f3a553440fb258/pyarrow-24.0.0-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:f7616236ec1bc2b15bfdec22a71ab38851c86f8f05ff64f379e1278cf20c634a", size = 36663654, upload-time = "2026-04-21T10:47:28.315Z" },
+ { url = "https://files.pythonhosted.org/packages/7c/3b/926382efe8ce27ba729071d3566ade6dfb86bdf112f366000196b2f5780a/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1617043b99bd33e5318ae18eb2919af09c71322ef1ca46566cdafc6e6712fb66", size = 45679394, upload-time = "2026-04-21T10:47:34.821Z" },
+ { url = "https://files.pythonhosted.org/packages/b3/7a/829f7d9dfd37c207206081d6dad474d81dde29952401f07f2ba507814818/pyarrow-24.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6165461f55ef6314f026de6638d661188e3455d3ec49834556a0ebbdbace18bb", size = 48863122, upload-time = "2026-04-21T10:47:42.056Z" },
+ { url = "https://files.pythonhosted.org/packages/5f/e8/f88ce625fe8babaae64e8db2d417c7653adb3019b08aae85c5ed787dc816/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:3b13dedfe76a0ad2d1d859b0811b53827a4e9d93a0bcb05cf59333ab4980cc7e", size = 49376032, upload-time = "2026-04-21T10:47:48.967Z" },
+ { url = "https://files.pythonhosted.org/packages/36/7a/82c363caa145fff88fb475da50d3bf52bb024f61917be5424c3392eaf878/pyarrow-24.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:25ea65d868eb04015cd18e6df2fbe98f07e5bda2abefabcb88fce39a947716f6", size = 51929490, upload-time = "2026-04-21T10:47:55.981Z" },
+ { url = "https://files.pythonhosted.org/packages/66/1c/e3e72c8014ad2743ca64a701652c733cc5cbcee15c0463a32a8c55518d9e/pyarrow-24.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:295f0a7f2e242dabd513737cf076007dc5b2d59237e3eca37b05c0c6446f3826", size = 27355660, upload-time = "2026-04-21T10:48:01.718Z" },
+ ]
+
+ [[package]]
+ name = "pyasn1"
+ version = "0.6.3"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/5c/5f/6583902b6f79b399c9c40674ac384fd9cd77805f9e6205075f828ef11fb2/pyasn1-0.6.3.tar.gz", hash = "sha256:697a8ecd6d98891189184ca1fa05d1bb00e2f84b5977c481452050549c8a72cf", size = 148685, upload-time = "2026-03-17T01:06:53.382Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/5d/a0/7d793dce3fa811fe047d6ae2431c672364b462850c6235ae306c0efd025f/pyasn1-0.6.3-py3-none-any.whl", hash = "sha256:a80184d120f0864a52a073acc6fc642847d0be408e7c7252f31390c0f4eadcde", size = 83997, upload-time = "2026-03-17T01:06:52.036Z" },
+ ]
+
+ [[package]]
+ name = "pyasn1-modules"
+ version = "0.4.2"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "pyasn1" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = "sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" },
+ ]
+
  [[package]]
  name = "pycparser"
  version = "3.0"
 
  { url = "https://files.pythonhosted.org/packages/60/4c/33f75713d50d5247f2258405142c0318ff32c6f8976171c4fcae87a9dbdf/pymongo-4.16.0-cp312-cp312-win_arm64.whl", hash = "sha256:dfc320f08ea9a7ec5b2403dc4e8150636f0d6150f4b9792faaae539c88e7db3b", size = 892971, upload-time = "2026-01-07T18:04:35.594Z" },
  ]
 
+ [[package]]
+ name = "pymssql"
+ version = "2.3.13"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/7a/cc/843c044b7f71ee329436b7327c578383e2f2499313899f88ad267cdf1f33/pymssql-2.3.13.tar.gz", hash = "sha256:2137e904b1a65546be4ccb96730a391fcd5a85aab8a0632721feb5d7e39cfbce", size = 203153, upload-time = "2026-02-14T05:00:36.865Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/ba/60/a2e8a8a38f7be21d54402e2b3365cd56f1761ce9f2706c97f864e8aa8300/pymssql-2.3.13-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:cf4f32b4a05b66f02cb7d55a0f3bcb0574a6f8cf0bee4bea6f7b104038364733", size = 3158689, upload-time = "2026-02-14T04:59:46.982Z" },
+ { url = "https://files.pythonhosted.org/packages/43/9e/0cf0ffb9e2f73238baf766d8e31d7237b5bee3cc1bb29a376b404610994a/pymssql-2.3.13-cp312-cp312-macosx_15_0_x86_64.whl", hash = "sha256:2b056eb175955f7fb715b60dc1c0c624969f4d24dbdcf804b41ab1e640a2b131", size = 2960018, upload-time = "2026-02-14T04:59:48.668Z" },
+ { url = "https://files.pythonhosted.org/packages/93/ea/bc27354feaca717faa4626911f6b19bb62985c87dda28957c63de4de5895/pymssql-2.3.13-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:319810b89aa64b99d9c5c01518752c813938df230496fa2c4c6dda0603f04c4c", size = 3065719, upload-time = "2026-02-14T04:59:50.369Z" },
+ { url = "https://files.pythonhosted.org/packages/1e/7a/8028681c96241fb5fc850b87c8959402c353e4b83c6e049a99ffa67ded54/pymssql-2.3.13-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c0ea72641cb0f8bce7ad8565dbdbda4a7437aa58bce045f2a3a788d71af2e4be", size = 3190567, upload-time = "2026-02-14T04:59:52.202Z" },
+ { url = "https://files.pythonhosted.org/packages/aa/f1/ab5b76adbbd6db9ce746d448db34b044683522e7e7b95053f9dd0165297b/pymssql-2.3.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1493f63d213607f708a5722aa230776ada726ccdb94097fab090a1717a2534e0", size = 3710481, upload-time = "2026-02-14T04:59:54.01Z" },
+ { url = "https://files.pythonhosted.org/packages/59/aa/2fa0951475cd0a1829e0b8bfbe334d04ece4bce11546a556b005c4100689/pymssql-2.3.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:eb3275985c23479e952d6462ae6c8b2b6993ab6b99a92805a9c17942cf3d5b3d", size = 3453789, upload-time = "2026-02-14T04:59:56.841Z" },
+ { url = "https://files.pythonhosted.org/packages/78/08/8cd2af9003f9fc03912b658a64f5a4919dcd68f0dd3bbc822b49a3d14fd9/pymssql-2.3.13-cp312-cp312-win_amd64.whl", hash = "sha256:a930adda87bdd8351a5637cf73d6491936f34e525a5e513068a6eac742f69cdb", size = 1994709, upload-time = "2026-02-14T04:59:58.972Z" },
+ ]
+
+ [[package]]
+ name = "pymysql"
+ version = "1.1.2"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/f5/ae/1fe3fcd9f959efa0ebe200b8de88b5a5ce3e767e38c7ac32fb179f16a388/pymysql-1.1.2.tar.gz", hash = "sha256:4961d3e165614ae65014e361811a724e2044ad3ea3739de9903ae7c21f539f03", size = 48258, upload-time = "2025-08-24T12:55:55.146Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/7c/4c/ad33b92b9864cbde84f259d5df035a6447f91891f5be77788e2a3892bce3/pymysql-1.1.2-py3-none-any.whl", hash = "sha256:e6b1d89711dd51f8f74b1631fe08f039e7d76cf67a42a323d3178f0f25762ed9", size = 45300, upload-time = "2025-08-24T12:55:53.394Z" },
+ ]
+
+ [[package]]
+ name = "pyopenssl"
+ version = "25.1.0"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "cryptography" },
+ { name = "typing-extensions" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/04/8c/cd89ad05804f8e3c17dea8f178c3f40eeab5694c30e0c9f5bcd49f576fc3/pyopenssl-25.1.0.tar.gz", hash = "sha256:8d031884482e0c67ee92bf9a4d8cceb08d92aba7136432ffb0703c5280fc205b", size = 179937, upload-time = "2025-05-17T16:28:31.31Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/80/28/2659c02301b9500751f8d42f9a6632e1508aa5120de5e43042b8b30f8d5d/pyopenssl-25.1.0-py3-none-any.whl", hash = "sha256:2b11f239acc47ac2e5aca04fd7fa829800aeee22a2eb30d744572a157bd8a1ab", size = 56771, upload-time = "2025-05-17T16:28:29.197Z" },
+ ]
+
  [[package]]
  name = "pyparsing"
  version = "3.3.2"
 
  { url = "https://files.pythonhosted.org/packages/04/fc/6f52588ac1cb4400a7804ef88d0d4e00cfe57a7ac6793ec3b00de5a8758b/pypdf-5.1.0-py3-none-any.whl", hash = "sha256:3bd4f503f4ebc58bae40d81e81a9176c400cbbac2ba2d877367595fb524dfdfc", size = 297976, upload-time = "2024-10-27T19:46:44.439Z" },
  ]
 
+ [[package]]
+ name = "pypdf2"
+ version = "3.0.1"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/9f/bb/18dc3062d37db6c491392007dfd1a7f524bb95886eb956569ac38a23a784/PyPDF2-3.0.1.tar.gz", hash = "sha256:a74408f69ba6271f71b9352ef4ed03dc53a31aa404d29b5d31f53bfecfee1440", size = 227419, upload-time = "2022-12-31T10:36:13.13Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/8e/5e/c86a5643653825d3c913719e788e41386bee415c2b87b4f955432f2de6b2/pypdf2-3.0.1-py3-none-any.whl", hash = "sha256:d16e4205cfee272fbdc0568b68d82be796540b1537508cef59388f839c191928", size = 232572, upload-time = "2022-12-31T10:36:10.327Z" },
+ ]
+
+ [[package]]
+ name = "pytesseract"
+ version = "0.3.13"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "packaging" },
+ { name = "pillow" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/9f/a6/7d679b83c285974a7cb94d739b461fa7e7a9b17a3abfd7bf6cbc5c2394b0/pytesseract-0.3.13.tar.gz", hash = "sha256:4bf5f880c99406f52a3cfc2633e42d9dc67615e69d8a509d74867d3baddb5db9", size = 17689, upload-time = "2024-08-16T02:33:56.762Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/7a/33/8312d7ce74670c9d39a532b2c246a853861120486be9443eebf048043637/pytesseract-0.3.13-py3-none-any.whl", hash = "sha256:7a99c6c2ac598360693d83a416e36e0b33a67638bb9d77fdcac094a3589d4b34", size = 14705, upload-time = "2024-08-16T02:36:10.09Z" },
+ ]
+
  [[package]]
  name = "pytest"
  version = "8.3.4"
 
  { url = "https://files.pythonhosted.org/packages/13/9f/026e18ca7d7766783d779dae5e9c656746c6ede36ef73c6d934aaf4a6dec/ruff-0.8.4-py3-none-win_arm64.whl", hash = "sha256:9183dd615d8df50defa8b1d9a074053891ba39025cf5ae88e8bcb52edcc4bf08", size = 9074500, upload-time = "2024-12-19T13:36:23.92Z" },
  ]
 
+ [[package]]
+ name = "s3transfer"
+ version = "0.16.0"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "botocore" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/05/04/74127fc843314818edfa81b5540e26dd537353b123a4edc563109d8f17dd/s3transfer-0.16.0.tar.gz", hash = "sha256:8e990f13268025792229cd52fa10cb7163744bf56e719e0b9cb925ab79abf920", size = 153827, upload-time = "2025-12-01T02:30:59.114Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/fc/51/727abb13f44c1fcf6d145979e1535a35794db0f6e450a0cb46aa24732fe2/s3transfer-0.16.0-py3-none-any.whl", hash = "sha256:18e25d66fed509e3868dc1572b3f427ff947dd2c56f844a5bf09481ad3f3b2fe", size = 86830, upload-time = "2025-12-01T02:30:57.729Z" },
+ ]
+
  [[package]]
  name = "safetensors"
  version = "0.7.0"
 
  { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" },
  ]
 
+ [[package]]
+ name = "snowflake-connector-python"
+ version = "4.0.0"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "asn1crypto" },
+ { name = "boto3" },
+ { name = "botocore" },
+ { name = "certifi" },
+ { name = "charset-normalizer" },
+ { name = "cryptography" },
+ { name = "filelock" },
+ { name = "idna" },
+ { name = "packaging" },
+ { name = "platformdirs" },
+ { name = "pyjwt" },
+ { name = "pyopenssl" },
+ { name = "pytz" },
+ { name = "requests" },
+ { name = "sortedcontainers" },
+ { name = "tomlkit" },
+ { name = "typing-extensions" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/1d/f1/4aff125021a9c5e0183f2f55dd7d04b7256a0e1e10db50d537a7415d9c55/snowflake_connector_python-4.0.0.tar.gz", hash = "sha256:4b10a865c4a5e1fa60c365c7fe41e0433605e6e5edc824e8730a9038f330b3a6", size = 813937, upload-time = "2025-10-09T10:11:34.631Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/ea/b0/462c0deee35d6d03d3d729b3f923615bae665beb7f9a94673a23a52080fe/snowflake_connector_python-4.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:bfd3b8523d7adc830f99c5c4c635689ceca61700a05368d5bbb34c6811f2ec54", size = 1029568, upload-time = "2025-10-09T10:11:42.125Z" },
+ { url = "https://files.pythonhosted.org/packages/ff/4b/bb3ae3f07e7927c8f16c4c0f1283d3c721978d16e8bf4193fc8e41025c1e/snowflake_connector_python-4.0.0-cp312-cp312-macosx_11_0_x86_64.whl", hash = "sha256:835161dd46ef8f5fc9d2f135ca654c2f3fbdf57b035d3e1980506aa8eac671dc", size = 1041337, upload-time = "2025-10-09T10:11:43.692Z" },
+ { url = "https://files.pythonhosted.org/packages/9c/75/4bfac89f10c6dbb75e97adf1e217737fc599ebf964031c9298b6cbd807d0/snowflake_connector_python-4.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:65e4e36dd1b0c7235d84cddef8a3c97c5ea0dc8fea85e31e45fc485000b77a83", size = 2699730, upload-time = "2025-10-09T10:11:25.295Z" },
+ { url = "https://files.pythonhosted.org/packages/cd/78/0e916416c50909dbae511fe38b1e671a9efa62decdce51b174a0396804e4/snowflake_connector_python-4.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6132986d6965e4005b0167270612fbc7fa4bc4ef42726a40b85a8f57475a78d", size = 2731336, upload-time = "2025-10-09T10:11:27.028Z" },
+ { url = "https://files.pythonhosted.org/packages/83/f0/3db8a2f3f5ee724d309c661af739a70d0643070b9b4597728151ef900f9b/snowflake_connector_python-4.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:a790f06808e4481c23cfed1396d2c9a786060ddd62408b1fda1a63e1e6bc4b07", size = 1176292, upload-time = "2025-10-09T10:11:54.956Z" },
+ ]
+
+ [[package]]
+ name = "snowflake-sqlalchemy"
+ version = "1.9.0"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "snowflake-connector-python" },
+ { name = "sqlalchemy" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/ff/6a/fcc5c00c3a253029a7b7b293a3958ba07d5e97623b643de47be0cc9e5530/snowflake_sqlalchemy-1.9.0.tar.gz", hash = "sha256:fb32baf559f7f933ae8fde2ec535bcea5381bb15188777cd8c006b3226efa3b1", size = 141707, upload-time = "2026-03-04T13:48:17.905Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/88/28/b7ae8df80847e8157b74669ad7e1b0180e82ac0e3daf950612effd232fea/snowflake_sqlalchemy-1.9.0-py3-none-any.whl", hash = "sha256:f0b1528173e93c8c80bd9ca510985054667e0e514dd90b890271ac1cfae261c1", size = 78953, upload-time = "2026-03-04T13:48:16.393Z" },
+ ]
+
+ [[package]]
+ name = "sortedcontainers"
+ version = "2.4.0"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/e8/c4/ba2f8066cceb6f23394729afe52f3bf7adec04bf9ed2c820b39e19299111/sortedcontainers-2.4.0.tar.gz", hash = "sha256:25caa5a06cc30b6b83d11423433f65d1f9d76c4c6a0c90e3379eaa43b9bfdb88", size = 30594, upload-time = "2021-05-16T22:03:42.897Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575, upload-time = "2021-05-16T22:03:41.177Z" },
+ ]
+
  [[package]]
  name = "spacy"
  version = "3.8.3"
 
  { name = "greenlet" },
  ]
 
+ [[package]]
+ name = "sqlalchemy-bigquery"
+ version = "1.16.0"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+ { name = "google-api-core" },
+ { name = "google-auth" },
+ { name = "google-cloud-bigquery" },
+ { name = "packaging" },
+ { name = "sqlalchemy" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/7e/6a/c49932b3d9c44cab9202b1866c5b36b7f0d0455d4653fbc0af4466aeaa76/sqlalchemy_bigquery-1.16.0.tar.gz", hash = "sha256:fe937a0d1f4cf7219fcf5d4995c6718805b38d4df43e29398dec5dc7b6d1987e", size = 119632, upload-time = "2025-11-06T01:35:40.373Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/c0/87/11e6de00ef7949bb8ea06b55304a1a4911c329fdf0d9882b464db240c2c5/sqlalchemy_bigquery-1.16.0-py3-none-any.whl", hash = "sha256:0fe7634cd954f3e74f5e2db6d159f9e5ee87a47fbe8d52eac3cd3bb3dadb3a77", size = 40615, upload-time = "2025-11-06T01:35:39.358Z" },
+ ]
+
+ [[package]]
+ name = "sqlglot"
+ version = "30.6.0"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/3c/66/6ece15f197874e56c76e1d0269cebf284ba992a80dfadca9d1972fdf7edf/sqlglot-30.6.0.tar.gz", hash = "sha256:246d34d39927422a50a3fa155f37b2f6346fba85f1a755b13c941eb32ef93361", size = 5835307, upload-time = "2026-04-20T20:11:08.164Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/dc/e7/64fe971cbca33a0446b06f4a5ff8e3fa4a1dbd0a039ceabcc3e6cf4087a9/sqlglot-30.6.0-py3-none-any.whl", hash = "sha256:e005fc2f47994f90d7d8df341f1cbe937518497b0b7b1507d4c03c4c9dfd2778", size = 673920, upload-time = "2026-04-20T20:11:05.758Z" },
+ ]
+
  [[package]]
  name = "srsly"
  version = "2.5.3"
 
  { url = "https://files.pythonhosted.org/packages/72/f4/0de46cfa12cdcbcd464cc59fde36912af405696f687e53a091fb432f694c/tokenizers-0.22.2-cp39-abi3-win_arm64.whl", hash = "sha256:9ce725d22864a1e965217204946f830c37876eee3b2ba6fc6255e8e903d5fcbc", size = 2612133, upload-time = "2026-01-05T10:45:17.232Z" },
  ]
 
+ [[package]]
+ name = "tomlkit"
+ version = "0.14.0"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/c3/af/14b24e41977adb296d6bd1fb59402cf7d60ce364f90c890bd2ec65c43b5a/tomlkit-0.14.0.tar.gz", hash = "sha256:cf00efca415dbd57575befb1f6634c4f42d2d87dbba376128adb42c121b87064", size = 187167, upload-time = "2026-01-13T01:14:53.304Z" }
+ wheels = [
+ { url = "https://files.pythonhosted.org/packages/b5/11/87d6d29fb5d237229d67973a6c9e06e048f01cf4994dee194ab0ea841814/tomlkit-0.14.0-py3-none-any.whl", hash = "sha256:592064ed85b40fa213469f81ac584f67a4f2992509a7c3ea2d632208623a3680", size = 39310, upload-time = "2026-01-13T01:14:51.965Z" },
+ ]
+
  [[package]]
  name = "torch"
  version = "2.11.0"