Rifqi Hafizuddin committed on
Commit ·
2c8a3e8
1
Parent(s): 145bca3
[KM-512] create folder for querying from db/tabular docs
Browse files- src/query/__init__.py +0 -0
- src/query/base.py +27 -0
- src/query/executor.py +48 -0
- src/query/executors/__init__.py +0 -0
- src/query/executors/db.py +32 -0
- src/query/executors/tabular.py +36 -0
src/query/__init__.py
ADDED
|
File without changes
|
src/query/base.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Shared contract for query executors."""
|
| 2 |
+
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
from dataclasses import dataclass, field
|
| 5 |
+
|
| 6 |
+
from src.rag.base import RetrievalResult
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
@dataclass
class QueryResult:
    """Structured rows produced by one executor for a single table or file.

    This is the common return type shared by all executors (see BaseExecutor),
    so the dispatching QueryExecutor can merge results from heterogeneous
    sources into one flat list.
    """

    source_type: str  # "database" or "document"
    source_id: str  # database_client_id or document_id
    table_or_file: str  # table name (database) or file name (document) — TODO confirm exact semantics
    columns: list[str]  # column names present in each row of `rows`
    rows: list[dict]  # one dict per row — presumably keyed by column name; verify against executors
    row_count: int  # number of rows returned (capped by the executor's `limit` — TODO confirm)
    metadata: dict = field(default_factory=dict)  # executor-specific extras; factory avoids shared mutable default
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class BaseExecutor(ABC):
    """Abstract contract implemented by every query executor (db, tabular, ...)."""

    @abstractmethod
    async def execute(
        self,
        results: list[RetrievalResult],
        user_id: str,
        limit: int = 100,
    ) -> list[QueryResult]:
        """Turn retrieval hits into concrete rows.

        Args:
            results: Retrieval hits this executor knows how to handle; callers
                are expected to pre-filter by source type.
            user_id: Id of the requesting user — presumably used for access
                checks/credential lookup; confirm in concrete executors.
            limit: Maximum number of rows to fetch per source.

        Returns:
            One QueryResult per queried table/file.
        """
        ...
|
src/query/executor.py
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""QueryExecutor — dispatches retrieval results to the appropriate executor by source_type."""
|
| 2 |
+
|
| 3 |
+
import asyncio
|
| 4 |
+
|
| 5 |
+
from src.middlewares.logging import get_logger
|
| 6 |
+
from src.query.base import QueryResult
|
| 7 |
+
from src.query.executors.db import db_executor
|
| 8 |
+
from src.query.executors.tabular import tabular_executor
|
| 9 |
+
from src.rag.base import RetrievalResult
|
| 10 |
+
|
| 11 |
+
logger = get_logger("query_executor")
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class QueryExecutor:
    """Fans retrieval results out to source-specific executors and merges the output."""

    async def execute(
        self,
        results: list[RetrievalResult],
        user_id: str,
        limit: int = 100,
    ) -> list[QueryResult]:
        """Run the db and tabular executors concurrently over their matching hits.

        A failed executor is logged and skipped rather than aborting the whole
        request; the surviving batches are concatenated and returned.

        Args:
            results: Mixed retrieval hits; routed by source_type/file_type.
            user_id: Id of the requesting user, forwarded to each executor.
            limit: Maximum rows per source, forwarded to each executor.

        Returns:
            Flat list of QueryResult objects from every executor that succeeded.
        """
        # Route each hit to at most one executor bucket.
        db_hits: list[RetrievalResult] = []
        tabular_hits: list[RetrievalResult] = []
        for item in results:
            if item.source_type == "database":
                db_hits.append(item)
            elif (
                item.source_type == "document"
                and item.metadata.get("data", {}).get("file_type") in ("csv", "xlsx")
            ):
                tabular_hits.append(item)

        async def _no_op() -> list[QueryResult]:
            # Placeholder awaitable so gather() always receives two coroutines.
            return []

        batches = await asyncio.gather(
            db_executor.execute(db_hits, user_id, limit) if db_hits else _no_op(),
            tabular_executor.execute(tabular_hits, user_id, limit) if tabular_hits else _no_op(),
            return_exceptions=True,  # keep one executor's failure from cancelling the other
        )

        merged: list[QueryResult] = []
        for outcome in batches:
            if isinstance(outcome, Exception):
                # Best-effort semantics: log the failed executor and keep going.
                logger.error("executor failed", error=str(outcome))
            else:
                merged.extend(outcome)

        logger.info("query execution complete", total=len(merged))
        return merged


query_executor = QueryExecutor()
|
src/query/executors/__init__.py
ADDED
|
File without changes
|
src/query/executors/db.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Executor for registered database sources (source_type="database").
|
| 2 |
+
|
| 3 |
+
Flow:
|
| 4 |
+
1. Group RetrievalResult chunks by database_client_id.
|
| 5 |
+
2. For each client: decrypt creds -> connect -> SELECT relevant columns FROM table LIMIT n.
|
| 6 |
+
3. Return QueryResult per (client_id, table_name).
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from src.middlewares.logging import get_logger
|
| 10 |
+
from src.query.base import BaseExecutor, QueryResult
|
| 11 |
+
from src.rag.base import RetrievalResult
|
| 12 |
+
|
| 13 |
+
logger = get_logger("db_executor")
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class DbExecutor(BaseExecutor):
    """Stub executor for registered database sources (source_type="database")."""

    async def execute(
        self,
        results: list[RetrievalResult],
        user_id: str,
        limit: int = 100,
    ) -> list[QueryResult]:
        """Not implemented yet — always raises NotImplementedError.

        Planned flow (TODO implement):
        1. filter results where source_type == "database"
        2. group by (database_client_id, table_name) -> list of column_names
        3. per group: look up DatabaseClient, decrypt creds, connect via db_pipeline_service
        4. SELECT <columns> FROM <table> LIMIT limit
        5. return QueryResult per group
        """
        raise NotImplementedError


# Module-level singleton consumed by QueryExecutor.
db_executor = DbExecutor()
|
src/query/executors/tabular.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Executor for tabular document sources (source_type="document", file_type csv/xlsx).
|
| 2 |
+
|
| 3 |
+
Flow:
|
| 4 |
+
1. Group RetrievalResult chunks by document_id.
|
| 5 |
+
2. For each document: download bytes from Azure Blob -> read with pandas.
|
| 6 |
+
3. Filter DataFrame to relevant columns identified by retrieval.
|
| 7 |
+
4. Return QueryResult per document.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from src.middlewares.logging import get_logger
|
| 11 |
+
from src.query.base import BaseExecutor, QueryResult
|
| 12 |
+
from src.rag.base import RetrievalResult
|
| 13 |
+
|
| 14 |
+
logger = get_logger("tabular_executor")
|
| 15 |
+
|
| 16 |
+
# File types this executor can load into a pandas DataFrame.
_TABULAR_FILE_TYPES = ("csv", "xlsx")


class TabularExecutor(BaseExecutor):
    """Stub executor for tabular document sources (source_type="document", csv/xlsx)."""

    async def execute(
        self,
        results: list[RetrievalResult],
        user_id: str,
        limit: int = 100,
    ) -> list[QueryResult]:
        """Not implemented yet — always raises NotImplementedError.

        Planned flow (TODO implement):
        1. filter results where source_type == "document" and file_type in _TABULAR_FILE_TYPES
        2. group by document_id -> list of column_names
        3. per group: look up Document by document_id -> get blob_name
        4. blob_storage.download_file(blob_name) -> pd.read_csv / pd.read_excel
        5. df[relevant_columns].head(limit) -> rows as list[dict]
        6. return QueryResult per document
        """
        raise NotImplementedError


# Module-level singleton consumed by QueryExecutor.
tabular_executor = TabularExecutor()
|