sofhiaazzhr committed · Commit 29efec6 · 2 parent(s): 240251c948d6dd

Merge branch 'dev_new' of https://huggingface.co/spaces/DataEyond/Agentic-Service-Data-Eyond into dev_new

src/query/executors/db.py DELETED
@@ -1,32 +0,0 @@
- """Executor for registered database sources (source_type="database").
-
- Flow:
- 1. Group RetrievalResult chunks by database_client_id.
- 2. For each client: decrypt creds -> connect -> SELECT relevant columns FROM table LIMIT n.
- 3. Return QueryResult per (client_id, table_name).
- """
-
- from src.middlewares.logging import get_logger
- from src.query.base import BaseExecutor, QueryResult
- from src.rag.base import RetrievalResult
-
- logger = get_logger("db_executor")
-
-
- class DbExecutor(BaseExecutor):
-     async def execute(
-         self,
-         results: list[RetrievalResult],
-         user_id: str,
-         limit: int = 100,
-     ) -> list[QueryResult]:
-         # TODO: implement
-         # 1. filter results where source_type == "database"
-         # 2. group by (database_client_id, table_name) -> list of column_names
-         # 3. per group: look up DatabaseClient, decrypt creds, connect via db_pipeline_service
-         # 4. SELECT <columns> FROM <table> LIMIT limit
-         # 5. return QueryResult per group
-         raise NotImplementedError
-
-
- db_executor = DbExecutor()
src/query/executors/db_executor.py ADDED
@@ -0,0 +1,334 @@
+ """Executor for registered database sources (source_type="database").
+
+ Flow per (client_id, question):
+ 1. Collect all relevant (table_name, column_name) pairs from retrieval results.
+ 2. Fetch the FULL schema for those tables from PGVector (not just top-k columns).
+ 3. Build a schema context string and send to LLM → structured SQLQuery output.
+ 4. Validate via sqlglot: SELECT-only, schema-grounded, LIMIT enforced.
+ 5. Execute on the user's DB via engine_scope + asyncio.to_thread.
+ 6. Return QueryResult per client_id (may span multiple tables via JOINs).
+
+ Supported db_types: postgres, supabase, mysql.
+ Other types are skipped with a warning — they do not raise.
+ """
+
+ import asyncio
+ from collections import defaultdict
+ from typing import Any
+
+ import sqlglot
+ import sqlglot.expressions as exp
+ from langchain_core.prompts import ChatPromptTemplate
+ from langchain_openai import AzureChatOpenAI
+ from sqlalchemy import text
+ from sqlalchemy.ext.asyncio import AsyncSession
+
+ from src.config.settings import settings
+ from src.database_client.database_client_service import database_client_service
+ from src.db.postgres.connection import _pgvector_engine
+ from src.middlewares.logging import get_logger
+ from src.models.sql_query import SQLQuery
+ from src.pipeline.db_pipeline import db_pipeline_service
+ from src.query.base import BaseExecutor, QueryResult
+ from src.rag.base import RetrievalResult
+ from src.utils.db_credential_encryption import decrypt_credentials_dict
+
+ logger = get_logger("db_executor")
+
+ _SUPPORTED_DB_TYPES = {"postgres", "supabase", "mysql"}
+ _MAX_RETRIES = 3
+ _MAX_LIMIT = 500
+
+ _SQL_SYSTEM_PROMPT = """\
+ You are a SQL data analyst working with a user's database.
+ Generate a single SQL SELECT statement that answers the user's question.
+
+ Rules:
+ - ONLY reference tables and columns listed in the schema below. Do not invent names.
+ - Always include a LIMIT clause (max {limit}).
+ - Do not use DELETE, UPDATE, INSERT, DROP, TRUNCATE, ALTER, CREATE, or any DDL.
+ - Prefer explicit JOINs over subqueries when combining tables.
+ - For aggregations, always alias the result column (e.g. COUNT(*) AS order_count).
+ - For date filtering, use standard SQL date functions appropriate for the dialect.
+
+ Schema:
+ {schema}
+
+ {error_section}"""
+
+
+ class DbExecutor(BaseExecutor):
+     def __init__(self) -> None:
+         self._llm = AzureChatOpenAI(
+             azure_deployment=settings.azureai_deployment_name_4o,
+             openai_api_version=settings.azureai_api_version_4o,
+             azure_endpoint=settings.azureai_endpoint_url_4o,
+             api_key=settings.azureai_api_key_4o,
+             temperature=0,
+         )
+         self._prompt = ChatPromptTemplate.from_messages([
+             ("system", _SQL_SYSTEM_PROMPT),
+             ("human", "{question}"),
+         ])
+         self._chain = self._prompt | self._llm.with_structured_output(SQLQuery)
+
+     # ------------------------------------------------------------------
+     # Public interface
+     # ------------------------------------------------------------------
+
+     async def execute(
+         self,
+         results: list[RetrievalResult],
+         user_id: str,
+         db: AsyncSession,
+         limit: int = 100,
+     ) -> list[QueryResult]:
+         db_results = [r for r in results if r.source_type == "database"]
+         if not db_results:
+             return []
+
+         # Group by client_id — one SQL generation + execution pass per client
+         by_client: dict[str, list[RetrievalResult]] = defaultdict(list)
+         for r in db_results:
+             client_id = r.metadata.get("database_client_id", "")
+             if client_id:
+                 by_client[client_id].append(r)
+             else:
+                 logger.warning("db result missing database_client_id, skipping")
+
+         query_results: list[QueryResult] = []
+         for client_id, client_results in by_client.items():
+             try:
+                 qr = await self._execute_for_client(client_id, client_results, user_id, db, limit)
+                 if qr:
+                     query_results.append(qr)
+             except Exception as e:
+                 logger.error("db executor failed for client", client_id=client_id, error=str(e))
+
+         return query_results
+
+     # ------------------------------------------------------------------
+     # Per-client execution
+     # ------------------------------------------------------------------
+
+     async def _execute_for_client(
+         self,
+         client_id: str,
+         results: list[RetrievalResult],
+         user_id: str,
+         db: AsyncSession,
+         limit: int,
+     ) -> QueryResult | None:
+         client = await database_client_service.get(db, client_id)
+         if not client:
+             logger.warning("database client not found", client_id=client_id)
+             return None
+         if client.user_id != user_id:
+             logger.warning("client ownership mismatch", client_id=client_id)
+             return None
+         if client.db_type not in _SUPPORTED_DB_TYPES:
+             logger.warning("unsupported db_type for query execution", db_type=client.db_type)
+             return None
+
+         # Distinct table names from retrieval results
+         table_names = list({
+             r.metadata.get("data", {}).get("table_name")
+             for r in results
+             if r.metadata.get("data", {}).get("table_name")
+         })
+
+         full_schema = await self._fetch_full_schema(client_id, table_names, user_id)
+         if not full_schema:
+             logger.warning("no schema found in vector store", client_id=client_id, tables=table_names)
+             return None
+
+         schema_ctx = self._build_schema_context(full_schema)
+         question = self._extract_question(results)
+         capped_limit = min(limit, _MAX_LIMIT)
+
+         # SQL generation with retry
+         validated_sql: str | None = None
+         prev_error: str = ""
+         for attempt in range(_MAX_RETRIES):
+             error_section = f"Previous attempt failed: {prev_error}\nFix the issue above." if prev_error else ""
+             try:
+                 result: SQLQuery = await self._chain.ainvoke({
+                     "schema": schema_ctx,
+                     "limit": capped_limit,
+                     "error_section": error_section,
+                     "question": question,
+                 })
+                 sql = result.sql.strip()
+                 validation_error = self._validate(sql, full_schema, capped_limit)
+                 if validation_error:
+                     prev_error = validation_error
+                     logger.warning("sql validation failed", attempt=attempt + 1, error=validation_error)
+                     continue
+                 validated_sql = sql
+                 logger.info("sql generated", attempt=attempt + 1, reasoning=result.reasoning)
+                 break
+             except Exception as e:
+                 prev_error = str(e)
+                 logger.warning("sql generation error", attempt=attempt + 1, error=prev_error)
+
+         if not validated_sql:
+             logger.error("sql generation failed after retries", client_id=client_id)
+             return None
+
+         # Execute on user's DB
+         creds = decrypt_credentials_dict(client.credentials)
+         with db_pipeline_service.engine_scope(client.db_type, creds) as engine:
+             rows = await asyncio.to_thread(self._run_sql, engine, validated_sql)
+
+         column_types = {
+             col["name"]: col["type"]
+             for cols in full_schema.values()
+             for col in cols
+         }
+         columns = list(rows[0].keys()) if rows else []
+
+         return QueryResult(
+             source_type="database",
+             source_id=client_id,
+             table_or_file=", ".join(table_names),
+             columns=columns,
+             rows=rows,
+             row_count=len(rows),
+             metadata={
+                 "db_type": client.db_type,
+                 "client_name": client.name,
+                 "sql": validated_sql,
+                 "column_types": {c: column_types.get(c, "unknown") for c in columns},
+             },
+         )
+
+     # ------------------------------------------------------------------
+     # Schema helpers
+     # ------------------------------------------------------------------
+
+     async def _fetch_full_schema(
+         self,
+         client_id: str,
+         table_names: list[str],
+         user_id: str,
+     ) -> dict[str, list[dict[str, Any]]]:
+         """Fetch ALL column chunks for the given tables from PGVector.
+
+         Returns {table_name: [{"name": ..., "type": ..., "is_primary_key": ...,
+                                "foreign_key": ..., "content": ...}]}
+         """
+         placeholders = ", ".join(f":t{i}" for i in range(len(table_names)))
+         sql = text(f"""
+             SELECT lpe.cmetadata, lpe.document
+             FROM langchain_pg_embedding lpe
+             JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
+             WHERE lpc.name = 'document_embeddings'
+               AND lpe.cmetadata->>'user_id' = :user_id
+               AND lpe.cmetadata->>'source_type' = 'database'
+               AND lpe.cmetadata->>'database_client_id' = :client_id
+               AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
+             ORDER BY lpe.cmetadata->'data'->>'table_name', lpe.cmetadata->'data'->>'column_name'
+         """)
+
+         params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
+         for i, name in enumerate(table_names):
+             params[f"t{i}"] = name
+
+         async with _pgvector_engine.connect() as conn:
+             result = await conn.execute(sql, params)
+             rows = result.fetchall()
+
+         schema: dict[str, list[dict[str, Any]]] = defaultdict(list)
+         for row in rows:
+             data = row.cmetadata.get("data", {})
+             table = data.get("table_name")
+             if table:
+                 schema[table].append({
+                     "name": data.get("column_name", ""),
+                     "type": data.get("column_type", ""),
+                     "is_primary_key": data.get("is_primary_key", False),
+                     "foreign_key": data.get("foreign_key"),
+                     "content": row.document,  # chunk text includes top values / samples
+                 })
+         return dict(schema)
+
+     def _build_schema_context(self, schema: dict[str, list[dict[str, Any]]]) -> str:
+         lines: list[str] = []
+         for table, columns in schema.items():
+             lines.append(f"Table: {table}")
+             for col in columns:
+                 flags = []
+                 if col["is_primary_key"]:
+                     flags.append("PRIMARY KEY")
+                 if col["foreign_key"]:
+                     flags.append(f"FK -> {col['foreign_key']}")
+                 flag_str = f" [{', '.join(flags)}]" if flags else ""
+                 lines.append(f" - {col['name']} {col['type']}{flag_str}")
+                 # Include sample/top-values line from chunk content if present
+                 for line in col["content"].splitlines():
+                     if line.startswith(("Top values:", "Sample values:")):
+                         lines.append(f" {line}")
+                         break
+             lines.append("")
+         return "\n".join(lines).strip()
+
+     def _extract_question(self, results: list[RetrievalResult]) -> str:
+         # The search_query rewritten by the orchestrator is not in RetrievalResult —
+         # the content field carries schema descriptions. Return a generic fallback;
+         # callers that have the original question should pass it explicitly.
+         # TODO: thread the original user question through to execute() when wiring into the agent.
+         return "Answer the user's data question using the schema provided."
+
+     # ------------------------------------------------------------------
+     # Guardrails
+     # ------------------------------------------------------------------
+
+     def _validate(self, sql: str, schema: dict[str, list[dict]], limit: int) -> str:
+         """Return an error string if validation fails, empty string if OK."""
+         # Layer 1: sqlglot parse + SELECT-only check
+         try:
+             parsed = sqlglot.parse_one(sql)
+         except sqlglot.errors.ParseError as e:
+             return f"SQL parse error: {e}"
+
+         if not isinstance(parsed, exp.Select):
+             return f"Only SELECT statements are allowed. Got: {type(parsed).__name__}"
+
+         # Check for DML inside CTEs
+         for cte in parsed.find_all(exp.With):
+             for node in cte.find_all((exp.Insert, exp.Update, exp.Delete)):
+                 return f"DML ({type(node).__name__}) inside CTE is not allowed."
+
+         # Layer 2: schema grounding — table names
+         known_tables = {t.lower() for t in schema}
+         for tbl in parsed.find_all(exp.Table):
+             name = tbl.name.lower()
+             if name and name not in known_tables:
+                 return f"Unknown table '{tbl.name}'. Only use tables from the schema."
+
+         # Layer 3: LIMIT enforcement (inject if missing — done before execution)
+         return ""
+
+     # ------------------------------------------------------------------
+     # SQL execution
+     # ------------------------------------------------------------------
+
+     def _enforce_limit(self, sql: str, limit: int) -> str:
+         """Inject or cap LIMIT using sqlglot AST manipulation."""
+         parsed = sqlglot.parse_one(sql)
+         existing = parsed.find(exp.Limit)
+         if existing:
+             current = int(existing.expression.this)
+             if current > limit:
+                 existing.expression.set("this", str(limit))
+         else:
+             parsed = parsed.limit(limit)
+         return parsed.sql()
+
+     def _run_sql(self, engine: Any, sql: str) -> list[dict]:
+         with engine.connect() as conn:
+             result = conn.execute(text(sql))
+             return [dict(row) for row in result.mappings()]
+
+
+ db_executor = DbExecutor()
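
For context, here is a minimal sketch (not part of this commit) of how the new executor might be called once retrieval results and an AsyncSession are available. Only db_executor.execute() and the QueryResult fields shown come from this change; the surrounding function and its call site are illustrative assumptions.

# Illustrative only — caller-side wiring assumed, not taken from this commit.
from sqlalchemy.ext.asyncio import AsyncSession

from src.query.base import QueryResult
from src.query.executors.db_executor import db_executor
from src.rag.base import RetrievalResult


async def run_db_queries(
    results: list[RetrievalResult],  # retrieval hits; database chunks carry database_client_id metadata
    user_id: str,
    db: AsyncSession,                # app session used to look up DatabaseClient rows
) -> list[QueryResult]:
    query_results = await db_executor.execute(results=results, user_id=user_id, db=db, limit=100)
    for qr in query_results:
        # metadata carries the validated SQL plus db_type / client_name / column_types
        print(qr.metadata["sql"], qr.row_count, qr.columns)
    return query_results
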
src/query/{executor.py → query_executor.py} RENAMED
File without changes
uv.lock CHANGED
@@ -66,6 +66,7 @@ dependencies = [
  { name = "spacy" },
  { name = "sqlalchemy", extra = ["asyncio"] },
  { name = "sqlalchemy-bigquery" },
+ { name = "sqlglot" },
  { name = "sse-starlette" },
  { name = "starlette" },
  { name = "structlog" },
@@ -149,6 +150,7 @@ requires-dist = [
  { name = "spacy", specifier = "==3.8.3" },
  { name = "sqlalchemy", extras = ["asyncio"], specifier = "==2.0.36" },
  { name = "sqlalchemy-bigquery", specifier = ">=1.11.0" },
+ { name = "sqlglot", specifier = ">=25.0.0" },
  { name = "sse-starlette", specifier = "==2.1.3" },
  { name = "starlette", specifier = "==0.41.3" },
  { name = "structlog", specifier = "==24.4.0" },
@@ -3221,6 +3223,15 @@ wheels = [
  { url = "https://files.pythonhosted.org/packages/c0/87/11e6de00ef7949bb8ea06b55304a1a4911c329fdf0d9882b464db240c2c5/sqlalchemy_bigquery-1.16.0-py3-none-any.whl", hash = "sha256:0fe7634cd954f3e74f5e2db6d159f9e5ee87a47fbe8d52eac3cd3bb3dadb3a77", size = 40615, upload-time = "2025-11-06T01:35:39.358Z" },
  ]

+ [[package]]
+ name = "sqlglot"
+ version = "30.6.0"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/3c/66/6ece15f197874e56c76e1d0269cebf284ba992a80dfadca9d1972fdf7edf/sqlglot-30.6.0.tar.gz", hash = "sha256:246d34d39927422a50a3fa155f37b2f6346fba85f1a755b13c941eb32ef93361", size = 5835307, upload-time = "2026-04-20T20:11:08.164Z" }
+ wheels = [
+     { url = "https://files.pythonhosted.org/packages/dc/e7/64fe971cbca33a0446b06f4a5ff8e3fa4a1dbd0a039ceabcc3e6cf4087a9/sqlglot-30.6.0-py3-none-any.whl", hash = "sha256:e005fc2f47994f90d7d8df341f1cbe937518497b0b7b1507d4c03c4c9dfd2778", size = 673920, upload-time = "2026-04-20T20:11:05.758Z" },
+ ]
+
  [[package]]
  name = "srsly"
  version = "2.5.3"
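
The sqlglot entry added to the lockfile backs the executor's guardrail layer. Below is a standalone sketch of that pattern (reject non-SELECT statements, inject or cap LIMIT); it mirrors the intent of DbExecutor._validate and _enforce_limit but is an illustrative snippet, not the project's code verbatim.

# Standalone illustration of the sqlglot guardrail pattern; only sqlglot is required.
import sqlglot
import sqlglot.expressions as exp

MAX_LIMIT = 500


def guard_select(sql: str, limit: int = MAX_LIMIT) -> str:
    """Return a LIMIT-capped SELECT, or raise ValueError for anything else."""
    parsed = sqlglot.parse_one(sql)
    if not isinstance(parsed, exp.Select):
        raise ValueError(f"Only SELECT statements are allowed, got {type(parsed).__name__}")
    existing = parsed.find(exp.Limit)
    if existing is None:
        parsed = parsed.limit(limit)                 # inject a LIMIT when the query has none
    elif int(existing.expression.this) > limit:
        existing.expression.set("this", str(limit))  # cap an oversized LIMIT
    return parsed.sql()


print(guard_select("SELECT id, total FROM orders WHERE status = 'paid'"))
# prints the same query with "LIMIT 500" appended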