Agentic-Service-Data-Eyond-Catalog

Sleeping

App Files Files Community

sofhiaazzhr committed 22 days ago

Commit

b272cc7

1 Parent(s): 5670888

[KM-556] delete Phase 1 remnants: query/executors/, query_executor.py, orchestration.py

Browse files

Files changed (6) hide show

PROGRESS.md +1 -1
src/agents/orchestration.py +0 -79
src/query/executors/__init__.py +0 -0
src/query/executors/db_executor.py +0 -648
src/query/executors/tabular.py +0 -287
src/query/query_executor.py +0 -42

PROGRESS.md CHANGED Viewed

@@ -2,7 +2,7 @@
 Persistent tracker mirroring the 42-item ownership table in `REPO_CONTEXT.md` "Team — division of work". Update as PRs land. Future Claude Code sessions read this to know what's already done.
-**Last updated**: 2026-05-08 (item 41 done; item 16 done; item 31 done; item 35 done; item 36 done — chat endpoint rewired to Phase 2 QueryService)
 **Current open PR**: none — all Phase 2 contracts shipped on `pr/1`. Cleanup PR pending (API rewiring + Phase 1 removal).
 ---

 Persistent tracker mirroring the 42-item ownership table in `REPO_CONTEXT.md` "Team — division of work". Update as PRs land. Future Claude Code sessions read this to know what's already done.
+**Last updated**: 2026-05-08 (items 16,31,35,36,41 done; Phase 1 remnants deleted: query/executors/, query_executor.py, agents/orchestration.py)
 **Current open PR**: none — all Phase 2 contracts shipped on `pr/1`. Cleanup PR pending (API rewiring + Phase 1 removal).
 ---

src/agents/orchestration.py DELETED Viewed

@@ -1,79 +0,0 @@
-"""Orchestrator agent for intent recognition and planning."""
-from langchain_openai import AzureChatOpenAI
-from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
-from src.config.settings import settings
-from src.middlewares.logging import get_logger
-from src.models.structured_output import IntentClassification
-logger = get_logger("orchestrator")
-class OrchestratorAgent:
-    """Orchestrator agent for intent recognition and planning."""
-    def __init__(self):
-        self.llm = AzureChatOpenAI(
-            azure_deployment=settings.azureai_deployment_name_4o,
-            openai_api_version=settings.azureai_api_version_4o,
-            azure_endpoint=settings.azureai_endpoint_url_4o,
-            api_key=settings.azureai_api_key_4o,
-            temperature=0
-        )
-        self.prompt = ChatPromptTemplate.from_messages([
-            ("system", """You are an orchestrator agent. You receive recent conversation history and the user's latest message.
-Your task:
-1. Determine intent: question, greeting, goodbye, or other
-2. Decide whether to search the user's documents (needs_search)
-3. If search is needed, rewrite the user's message into a STANDALONE search query that incorporates necessary context from conversation history. If the user says "tell me more" or "how many papers?", the search_query must spell out the full topic explicitly from history.
-4. If no search needed, provide a short direct_response (plain text only, no markdown formatting).
-Intent Routing:
-- question -> needs_search=True, search_query=<standalone rewritten query>
-- greeting -> needs_search=False, direct_response="Hello! How can I assist you today?"
-- goodbye -> needs_search=False, direct_response="Goodbye! Have a great day!"
-- other -> needs_search=True, search_query=<standalone rewritten query>
-Source Routing (set source_hint):
-- Columns, tables, sheets, data types, schema, row counts, statistics -> source_hint=schema
-- Document content, paragraphs, reports, articles, text -> source_hint=document
-- Unclear or spans both -> source_hint=both
-"""),
-            MessagesPlaceholder(variable_name="history"),
-            ("user", "{message}")
-        ])
-        # with_structured_output uses function calling — guarantees valid schema regardless of LLM response style
-        self.chain = self.prompt | self.llm.with_structured_output(IntentClassification)
-    async def analyze_message(self, message: str, history: list = None) -> dict:
-        """Analyze user message and determine next actions.
-        Args:
-            message: The current user message.
-            history: Recent conversation as LangChain BaseMessage objects (oldest-first).
-                     Used to rewrite ambiguous follow-ups into standalone search queries.
-        """
-        try:
-            logger.info(f"Analyzing message: {message[:50]}...")
-            history_messages = history or []
-            result: IntentClassification = await self.chain.ainvoke({"message": message, "history": history_messages})
-            logger.info(f"Intent: {result.intent}, Needs search: {result.needs_search}, Search query: {result.search_query[:50] if result.search_query else ''}")
-            return result.model_dump()
-        except Exception as e:
-            logger.error("Message analysis failed", error=str(e))
-            # Fallback to treating everything as a question
-            return {
-                "intent": "question",
-                "needs_search": True,
-                "search_query": message,
-                "direct_response": None
-            }
-orchestrator = OrchestratorAgent()

src/query/executors/__init__.py DELETED Viewed

File without changes

src/query/executors/db_executor.py DELETED Viewed

@@ -1,648 +0,0 @@
-"""Executor for registered database sources (source_type="database").
-Flow per (client_id, question):
-  1. Collect all relevant (table_name, column_name) pairs from retrieval results.
-  2. Fetch the FULL schema for those tables from PGVector (not just top-k columns).
-  3. Build a schema context string and send to LLM → structured SQLQuery output.
-  4. Validate via sqlglot: SELECT-only, schema-grounded, LIMIT enforced.
-  5. Execute on the user's DB via engine_scope + asyncio.to_thread.
-  6. Return QueryResult per client_id (may span multiple tables via JOINs).
-Supported db_types: postgres, supabase, mysql.
-Other types are skipped with a warning — they do not raise.
-"""
-import asyncio
-from collections import defaultdict
-from typing import Any
-import sqlglot
-import sqlglot.expressions as exp
-import tiktoken
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_openai import AzureChatOpenAI
-from sqlalchemy import text
-from sqlalchemy.ext.asyncio import AsyncSession
-from src.config.settings import settings
-from src.database_client.database_client_service import database_client_service
-from src.db.postgres.connection import _pgvector_engine
-from src.middlewares.logging import get_logger
-from src.models.sql_query import SQLQuery
-from src.pipeline.db_pipeline import db_pipeline_service
-from src.query.base import BaseExecutor, QueryResult
-from src.retrieval.base import RetrievalResult
-from src.utils.db_credential_encryption import decrypt_credentials_dict
-logger = get_logger("db_executor")
-_enc = tiktoken.get_encoding("cl100k_base")
-_SUPPORTED_DB_TYPES = {"postgres", "supabase", "mysql"}
-_MAX_RETRIES = 3
-_MAX_LIMIT = 500
-_FK_EXPANSION_MAX_TABLES = 5
-_SQL_SYSTEM_PROMPT = """\
-You are a SQL data analyst working with a user's database.
-Generate a single SQL SELECT statement that answers the user's question.
-Database dialect: {dialect}
-Rules:
-- ONLY reference tables and columns listed in the schema below. Do not invent names.
-- Always include a LIMIT clause (max {limit}).
-- Do not use DELETE, UPDATE, INSERT, DROP, TRUNCATE, ALTER, CREATE, or any DDL.
-- Prefer explicit JOINs over subqueries when combining tables.
-- For aggregations, always alias the result column (e.g. COUNT(*) AS order_count).
-- For date filtering, use dialect-appropriate functions ({dialect} syntax).
-Schema:
-{schema}
-{error_section}"""
-class DbExecutor(BaseExecutor):
-    def __init__(self) -> None:
-        self._llm = AzureChatOpenAI(
-            azure_deployment=settings.azureai_deployment_name_4o,
-            openai_api_version=settings.azureai_api_version_4o,
-            azure_endpoint=settings.azureai_endpoint_url_4o,
-            api_key=settings.azureai_api_key_4o,
-            temperature=0,
-        )
-        self._prompt = ChatPromptTemplate.from_messages([
-            ("system", _SQL_SYSTEM_PROMPT),
-            ("human", "{question}"),
-        ])
-        self._chain = self._prompt | self._llm.with_structured_output(SQLQuery)
-    # ------------------------------------------------------------------
-    # Public interface
-    # ------------------------------------------------------------------
-    async def execute(
-        self,
-        results: list[RetrievalResult],
-        user_id: str,
-        db: AsyncSession,
-        question: str,
-        limit: int = 100,
-    ) -> list[QueryResult]:
-        db_results = [r for r in results if r.source_type == "database"]
-        if not db_results:
-            return []
-        # Group by client_id — one SQL generation + execution pass per client
-        by_client: dict[str, list[RetrievalResult]] = defaultdict(list)
-        for r in db_results:
-            client_id = r.metadata.get("database_client_id", "")
-            if client_id:
-                by_client[client_id].append(r)
-            else:
-                logger.warning("db result missing database_client_id, skipping")
-        query_results: list[QueryResult] = []
-        for client_id, client_results in by_client.items():
-            try:
-                qr = await self._execute_for_client(client_id, client_results, user_id, db, question, limit)
-                if qr:
-                    query_results.append(qr)
-            except Exception as e:
-                logger.error("db executor failed for client", client_id=client_id, error=str(e))
-        return query_results
-    # ------------------------------------------------------------------
-    # Per-client execution
-    # ------------------------------------------------------------------
-    async def _execute_for_client(
-        self,
-        client_id: str,
-        results: list[RetrievalResult],
-        user_id: str,
-        db: AsyncSession,
-        question: str,
-        limit: int,
-    ) -> QueryResult | None:
-        client = await database_client_service.get(db, client_id)
-        if not client:
-            logger.warning("database client not found", client_id=client_id)
-            return None
-        if client.user_id != user_id:
-            logger.warning("client ownership mismatch", client_id=client_id)
-            return None
-        if client.db_type not in _SUPPORTED_DB_TYPES:
-            logger.warning("unsupported db_type for query execution", db_type=client.db_type)
-            return None
-        # Hit tables = tables retrieval pointed at directly. Get full per-column
-        # schema for these. Related tables (one FK hop away, both directions) are
-        # fetched separately in abbreviated form to give the LLM enough context
-        # to JOIN without paying the per-column profile token cost.
-        hit_tables = list({
-            r.metadata.get("data", {}).get("table_name")
-            for r in results
-            if r.metadata.get("data", {}).get("table_name")
-        })
-        if not hit_tables:
-            logger.warning("no table_name on any retrieval result", client_id=client_id)
-            return None
-        full_schema = await self._fetch_full_schema(client_id, hit_tables, user_id)
-        if not full_schema:
-            logger.warning("no schema found in vector store", client_id=client_id, tables=hit_tables)
-            return None
-        related_tables = await self._find_related_tables(client_id, user_id, hit_tables)
-        related_schema = (
-            await self._fetch_abbreviated_schema(client_id, user_id, related_tables)
-            if related_tables else {}
-        )
-        schema_ctx = self._build_schema_context(full_schema, related_schema)
-        capped_limit = min(limit, _MAX_LIMIT)
-        dialect = client.db_type
-        # SQL generation with retry
-        validated_sql: str | None = None
-        prev_error: str = ""
-        prev_reasoning: str = ""
-        for attempt in range(_MAX_RETRIES):
-            if prev_error:
-                error_section = (
-                    f"Previous attempt reasoning: {prev_reasoning}\n"
-                    f"Previous attempt failed: {prev_error}\n"
-                    "Fix the issue above."
-                )
-            else:
-                error_section = ""
-            try:
-                prompt_text = schema_ctx + error_section + question
-                input_tokens = len(_enc.encode(prompt_text))
-                logger.info("sql generation input tokens", attempt=attempt + 1, tokens=input_tokens)
-                result: SQLQuery = await self._chain.ainvoke({
-                    "schema": schema_ctx,
-                    "dialect": dialect,
-                    "limit": capped_limit,
-                    "error_section": error_section,
-                    "question": question,
-                })
-                sql = result.sql.strip()
-                allowed_tables = set(full_schema) | set(related_schema)
-                column_map: dict[str, set[str]] = {
-                    t: {c["name"] for c in cols} for t, cols in full_schema.items()
-                }
-                for t, info in related_schema.items():
-                    column_map[t] = set(info.get("column_names") or [])
-                validation_error = self._validate(sql, allowed_tables, capped_limit, column_map)
-                if validation_error:
-                    prev_error = validation_error
-                    prev_reasoning = result.reasoning
-                    logger.warning("sql validation failed", attempt=attempt + 1, error=validation_error)
-                    continue
-                validated_sql = self._enforce_limit(sql, capped_limit)
-                output_tokens = len(_enc.encode(result.sql)) + len(_enc.encode(result.reasoning))
-                logger.info(
-                    "sql generated",
-                    attempt=attempt + 1,
-                    input_tokens=input_tokens,
-                    output_tokens=output_tokens,
-                    total_tokens=input_tokens + output_tokens,
-                    reasoning=result.reasoning,
-                )
-                break
-            except Exception as e:
-                prev_error = str(e)
-                logger.warning("sql generation error", attempt=attempt + 1, error=prev_error)
-        if not validated_sql:
-            logger.error("sql generation failed after retries", client_id=client_id)
-            return None
-        # Execute on user's DB
-        creds = decrypt_credentials_dict(client.credentials)
-        with db_pipeline_service.engine_scope(client.db_type, creds) as engine:
-            rows = await asyncio.to_thread(self._run_sql, engine, validated_sql)
-        column_types = {
-            col["name"]: col["type"]
-            for cols in full_schema.values()
-            for col in cols
-        }
-        columns = list(rows[0].keys()) if rows else []
-        return QueryResult(
-            source_type="database",
-            source_id=client_id,
-            table_or_file=", ".join(hit_tables),
-            columns=columns,
-            rows=rows,
-            row_count=len(rows),
-            metadata={
-                "db_type": client.db_type,
-                "client_name": client.name,
-                "sql": validated_sql,
-                "column_types": {c: column_types.get(c, "unknown") for c in columns},
-            },
-        )
-    # ------------------------------------------------------------------
-    # Schema helpers
-    # ------------------------------------------------------------------
-    async def _find_related_tables(
-        self,
-        client_id: str,
-        user_id: str,
-        hit_tables: list[str],
-    ) -> list[str]:
-        """One-hop FK neighbours of `hit_tables`, both directions, excluding hits.
-        Prefers chunk_level='table' rows; if none exist for the client (legacy
-        ingest predating Phase 1), falls back to aggregating from column-chunk
-        metadata. Returns [] when no FK metadata is available.
-        Capped at _FK_EXPANSION_MAX_TABLES, ranked by edge count desc then
-        table name asc. A warning is logged when the cap kicks in.
-        """
-        if not hit_tables:
-            return []
-        hit_set = set(hit_tables)
-        # edge_counts[related_table] = number of FK edges connecting it to the hit set
-        edge_counts: dict[str, int] = defaultdict(int)
-        # ---- Primary path: table-level chunks ----
-        sql = text("""
-            SELECT lpe.cmetadata
-            FROM langchain_pg_embedding lpe
-            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
-            WHERE lpc.name = 'document_embeddings'
-              AND lpe.cmetadata->>'user_id' = :user_id
-              AND lpe.cmetadata->>'source_type' = 'database'
-              AND lpe.cmetadata->>'database_client_id' = :client_id
-              AND lpe.cmetadata->>'chunk_level' = 'table'
-        """)
-        async with _pgvector_engine.connect() as conn:
-            result = await conn.execute(sql, {"user_id": user_id, "client_id": client_id})
-            table_rows = result.fetchall()
-        if table_rows:
-            for row in table_rows:
-                data = row.cmetadata.get("data", {})
-                table = data.get("table_name")
-                fks = data.get("foreign_keys") or []
-                if not table:
-                    continue
-                if table in hit_set:
-                    # Outgoing: this hit's FKs point at related tables
-                    for fk in fks:
-                        target = fk.get("target_table")
-                        if target and target not in hit_set:
-                            edge_counts[target] += 1
-                else:
-                    # Incoming: this non-hit table's FKs point into the hit set
-                    for fk in fks:
-                        target = fk.get("target_table")
-                        if target in hit_set:
-                            edge_counts[table] += 1
-        else:
-            # ---- Fallback: aggregate from column chunks ----
-            sql = text("""
-                SELECT lpe.cmetadata->'data'->>'table_name' AS src_table,
-                       lpe.cmetadata->'data'->>'foreign_key' AS fk
-                FROM langchain_pg_embedding lpe
-                JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
-                WHERE lpc.name = 'document_embeddings'
-                  AND lpe.cmetadata->>'user_id' = :user_id
-                  AND lpe.cmetadata->>'source_type' = 'database'
-                  AND lpe.cmetadata->>'database_client_id' = :client_id
-                  AND lpe.cmetadata->>'chunk_level' = 'column'
-                  AND lpe.cmetadata->'data'->>'foreign_key' IS NOT NULL
-            """)
-            async with _pgvector_engine.connect() as conn:
-                result = await conn.execute(sql, {"user_id": user_id, "client_id": client_id})
-                col_rows = result.fetchall()
-            for row in col_rows:
-                src = row.src_table
-                fk = row.fk
-                if not src or not fk:
-                    continue
-                target = fk.split(".", 1)[0]
-                if src in hit_set and target and target not in hit_set:
-                    edge_counts[target] += 1
-                elif src not in hit_set and target in hit_set:
-                    edge_counts[src] += 1
-        if not edge_counts:
-            return []
-        ranked = sorted(edge_counts.items(), key=lambda kv: (-kv[1], kv[0]))
-        if len(ranked) > _FK_EXPANSION_MAX_TABLES:
-            logger.warning(
-                "fk expansion cap hit",
-                client_id=client_id,
-                total=len(ranked),
-                cap=_FK_EXPANSION_MAX_TABLES,
-                dropped=[t for t, _ in ranked[_FK_EXPANSION_MAX_TABLES:]],
-            )
-            ranked = ranked[:_FK_EXPANSION_MAX_TABLES]
-        related = [t for t, _ in ranked]
-        logger.info("fk-related tables", hit=sorted(hit_set), related=related)
-        return related
-    async def _fetch_abbreviated_schema(
-        self,
-        client_id: str,
-        user_id: str,
-        table_names: list[str],
-    ) -> dict[str, dict[str, Any]]:
-        """Abbreviated schema: name, row_count, PK, FKs, column names — no profiles.
-        Prefers chunk_level='table' rows. Falls back to aggregating column-chunk
-        metadata when table chunks are missing for a given table_name.
-        Returns {table_name: {"row_count": int|None, "primary_key": [str],
-        "foreign_keys": [{column, target_table, target_column}],
-        "column_names": [str]}}.
-        """
-        if not table_names:
-            return {}
-        placeholders = ", ".join(f":t{i}" for i in range(len(table_names)))
-        params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
-        for i, name in enumerate(table_names):
-            params[f"t{i}"] = name
-        # Primary path: one row per table from chunk_level='table'
-        sql_table = text(f"""
-            SELECT lpe.cmetadata
-            FROM langchain_pg_embedding lpe
-            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
-            WHERE lpc.name = 'document_embeddings'
-              AND lpe.cmetadata->>'user_id' = :user_id
-              AND lpe.cmetadata->>'source_type' = 'database'
-              AND lpe.cmetadata->>'database_client_id' = :client_id
-              AND lpe.cmetadata->>'chunk_level' = 'table'
-              AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
-        """)
-        async with _pgvector_engine.connect() as conn:
-            result = await conn.execute(sql_table, params)
-            t_rows = result.fetchall()
-        out: dict[str, dict[str, Any]] = {}
-        for row in t_rows:
-            data = row.cmetadata.get("data", {})
-            tname = data.get("table_name")
-            if not tname:
-                continue
-            out[tname] = {
-                "row_count": data.get("row_count"),
-                "primary_key": list(data.get("primary_key") or []),
-                "foreign_keys": list(data.get("foreign_keys") or []),
-                "column_names": list(data.get("column_names") or []),
-            }
-        # Fallback for tables with no table-chunk: aggregate column chunks
-        missing = [t for t in table_names if t not in out]
-        if missing:
-            placeholders_m = ", ".join(f":m{i}" for i in range(len(missing)))
-            params_m: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
-            for i, name in enumerate(missing):
-                params_m[f"m{i}"] = name
-            sql_col = text(f"""
-                SELECT lpe.cmetadata
-                FROM langchain_pg_embedding lpe
-                JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
-                WHERE lpc.name = 'document_embeddings'
-                  AND lpe.cmetadata->>'user_id' = :user_id
-                  AND lpe.cmetadata->>'source_type' = 'database'
-                  AND lpe.cmetadata->>'database_client_id' = :client_id
-                  AND lpe.cmetadata->>'chunk_level' = 'column'
-                  AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders_m})
-                ORDER BY lpe.cmetadata->'data'->>'table_name', lpe.cmetadata->'data'->>'column_name'
-            """)
-            async with _pgvector_engine.connect() as conn:
-                result = await conn.execute(sql_col, params_m)
-                c_rows = result.fetchall()
-            agg: dict[str, dict[str, Any]] = {
-                t: {"row_count": None, "primary_key": [], "foreign_keys": [], "column_names": []}
-                for t in missing
-            }
-            for row in c_rows:
-                data = row.cmetadata.get("data", {})
-                tname = data.get("table_name")
-                cname = data.get("column_name")
-                if not tname or tname not in agg or not cname:
-                    continue
-                bucket = agg[tname]
-                bucket["column_names"].append(cname)
-                if data.get("is_primary_key"):
-                    bucket["primary_key"].append(cname)
-                fk = data.get("foreign_key")
-                if fk:
-                    target_table, _, target_col = fk.partition(".")
-                    bucket["foreign_keys"].append({
-                        "column": cname,
-                        "target_table": target_table,
-                        "target_column": target_col,
-                    })
-            for t, v in agg.items():
-                if v["column_names"]:
-                    out[t] = v
-        return out
-    async def _fetch_full_schema(
-        self,
-        client_id: str,
-        table_names: list[str],
-        user_id: str,
-    ) -> dict[str, list[dict[str, Any]]]:
-        """Fetch ALL column chunks for the given tables from PGVector.
-        Returns {table_name: [{"name": ..., "type": ..., "is_primary_key": ...,
-                                "foreign_key": ..., "content": ...}]}
-        """
-        placeholders = ", ".join(f":t{i}" for i in range(len(table_names)))
-        sql = text(f"""
-            SELECT lpe.cmetadata, lpe.document
-            FROM langchain_pg_embedding lpe
-            JOIN langchain_pg_collection lpc ON lpe.collection_id = lpc.uuid
-            WHERE lpc.name = 'document_embeddings'
-              AND lpe.cmetadata->>'user_id' = :user_id
-              AND lpe.cmetadata->>'source_type' = 'database'
-              AND lpe.cmetadata->>'chunk_level' = 'column'
-              AND lpe.cmetadata->>'database_client_id' = :client_id
-              AND lpe.cmetadata->'data'->>'table_name' IN ({placeholders})
-            ORDER BY lpe.cmetadata->'data'->>'table_name', lpe.cmetadata->'data'->>'column_name'
-        """)
-        params: dict[str, Any] = {"user_id": user_id, "client_id": client_id}
-        for i, name in enumerate(table_names):
-            params[f"t{i}"] = name
-        async with _pgvector_engine.connect() as conn:
-            result = await conn.execute(sql, params)
-            rows = result.fetchall()
-        schema: dict[str, list[dict[str, Any]]] = defaultdict(list)
-        for row in rows:
-            data = row.cmetadata.get("data", {})
-            table = data.get("table_name")
-            if table:
-                schema[table].append({
-                    "name": data.get("column_name", ""),
-                    "type": data.get("column_type", ""),
-                    "is_primary_key": data.get("is_primary_key", False),
-                    "foreign_key": data.get("foreign_key"),
-                    "content": row.document,  # chunk text includes top values / samples
-                })
-        return dict(schema)
-    def _build_schema_context(
-        self,
-        schema: dict[str, list[dict[str, Any]]],
-        related_schema: dict[str, dict[str, Any]] | None = None,
-    ) -> str:
-        lines: list[str] = []
-        for table, columns in schema.items():
-            lines.append(f"Table: {table}")
-            for col in columns:
-                flags = []
-                if col["is_primary_key"]:
-                    flags.append("PRIMARY KEY")
-                if col["foreign_key"]:
-                    flags.append(f"FK -> {col['foreign_key']}")
-                flag_str = f"  [{', '.join(flags)}]" if flags else ""
-                lines.append(f"  - {col['name']}  {col['type']}{flag_str}")
-                # Include sample/top-values line from chunk content if present
-                for line in col["content"].splitlines():
-                    if line.startswith(("Top values:", "Sample values:")):
-                        lines.append(f"    {line}")
-                        break
-            lines.append("")
-        related_block = self._build_related_schema_block(related_schema or {})
-        if related_block:
-            lines.append(related_block)
-        return "\n".join(lines).strip()
-    def _build_related_schema_block(self, related_schema: dict[str, dict[str, Any]]) -> str:
-        """Format the abbreviated FK-related-tables section. Empty string when no related."""
-        if not related_schema:
-            return ""
-        lines: list[str] = ["Related tables (one hop via FK, abbreviated — use for JOINs only):"]
-        for table, info in related_schema.items():
-            row_count = info.get("row_count")
-            header = f"- {table} ({row_count} rows)" if row_count is not None else f"- {table}"
-            lines.append(header)
-            pk = info.get("primary_key") or []
-            lines.append(f"    Primary key: {', '.join(pk) if pk else '(none)'}")
-            fks = info.get("foreign_keys") or []
-            if fks:
-                fk_strs = [
-                    f"{fk.get('column')} -> {fk.get('target_table')}.{fk.get('target_column')}"
-                    for fk in fks
-                ]
-                lines.append(f"    Foreign keys: {', '.join(fk_strs)}")
-            else:
-                lines.append("    Foreign keys: (none)")
-            cols = info.get("column_names") or []
-            lines.append(f"    Columns: {', '.join(cols)}")
-        return "\n".join(lines)
-    # ------------------------------------------------------------------
-    # Guardrails
-    # ------------------------------------------------------------------
-    def _validate(
-        self,
-        sql: str,
-        allowed_tables: set[str],
-        limit: int,
-        column_map: dict[str, set[str]] | None = None,
-    ) -> str:
-        """Return an error string if validation fails, empty string if OK.
-        `allowed_tables` is the union of hit-table names and FK-related table
-        names — both are legal targets for SELECT/JOIN.
-        `column_map` maps table_name → set of valid column names. When provided,
-        any qualified table.column reference not found in the map triggers a retry
-        with an informative error so the LLM can self-correct without hallucinating.
-        """
-        # Layer 1: sqlglot parse + SELECT-only check
-        try:
-            parsed = sqlglot.parse_one(sql)
-        except sqlglot.errors.ParseError as e:
-            return f"SQL parse error: {e}"
-        if not isinstance(parsed, exp.Select):
-            return f"Only SELECT statements are allowed. Got: {type(parsed).__name__}"
-        # Check for DML anywhere in the AST (including writeable CTEs)
-        for node in parsed.find_all((exp.Insert, exp.Update, exp.Delete)):
-            return f"DML ({type(node).__name__}) is not allowed."
-        # Layer 2: schema grounding — table names
-        known_tables = {t.lower() for t in allowed_tables}
-        alias_to_table: dict[str, str] = {}
-        for tbl in parsed.find_all(exp.Table):
-            name = tbl.name.lower()
-            if name and name not in known_tables:
-                return f"Unknown table '{tbl.name}'. Only use tables from the schema."
-            alias = (tbl.alias or tbl.name).lower()
-            alias_to_table[alias] = name
-        # Layer 3: column grounding — qualified references only (table.column)
-        if column_map:
-            normalized_map = {t.lower(): {c.lower() for c in cols} for t, cols in column_map.items()}
-            for col_node in parsed.find_all(exp.Column):
-                tbl_ref = col_node.table
-                if not tbl_ref:
-                    continue  # unqualified — skip, can't resolve without full alias tracking
-                tbl_name = alias_to_table.get(tbl_ref.lower(), tbl_ref.lower())
-                col_name = col_node.name.lower()
-                if tbl_name in normalized_map and col_name not in normalized_map[tbl_name]:
-                    available = ", ".join(sorted(normalized_map[tbl_name]))
-                    return (
-                        f"Column '{col_node.name}' does not exist on table '{tbl_name}'. "
-                        f"Available columns: {available}."
-                    )
-        # Layer 4: LIMIT enforcement (inject if missing — done before execution)
-        return ""
-    # ------------------------------------------------------------------
-    # SQL execution
-    # ------------------------------------------------------------------
-    def _enforce_limit(self, sql: str, limit: int) -> str:
-        """Inject or cap LIMIT using sqlglot AST manipulation."""
-        parsed = sqlglot.parse_one(sql)
-        existing = parsed.find(exp.Limit)
-        if existing:
-            current = int(existing.expression.this)
-            if current > limit:
-                return parsed.limit(limit).sql()
-        else:
-            return parsed.limit(limit).sql()
-        return parsed.sql()
-    def _run_sql(self, engine: Any, sql: str) -> list[dict]:
-        # Ensure the user DB connection is a read-only credential — sqlglot validation alone is not sufficient.
-        with engine.connect() as conn:
-            result = conn.execute(text(sql))
-            return [dict(row) for row in result.mappings()]
-db_executor = DbExecutor()

src/query/executors/tabular.py DELETED Viewed

@@ -1,287 +0,0 @@
-"""Executor for tabular document sources (source_type="document", file_type csv/xlsx).
-Flow:
-  1. Group RetrievalResult chunks by (document_id, sheet_name).
-  2. Per group: download Parquet from Azure Blob → pandas DataFrame.
-  3. Build schema context from DataFrame columns + sample values.
-  4. LLM decides operation (groupby_sum, filter, top_n, etc.) via structured output.
-  5. Pandas runs the operation; retry up to 3x on error with feedback to LLM.
-  6. Fallback to raw rows if all retries fail.
-  7. Return QueryResult per group.
-"""
-import asyncio
-from typing import Literal, TypedDict
-import pandas as pd
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_openai import AzureChatOpenAI
-from pydantic import BaseModel
-from sqlalchemy.ext.asyncio import AsyncSession
-from src.config.settings import settings
-from src.storage.parquet import download_parquet
-from src.middlewares.logging import get_logger
-from src.query.base import BaseExecutor, QueryResult
-from src.retrieval.base import RetrievalResult
-logger = get_logger("tabular_executor")
class _GroupInfo(TypedDict):
    """Display metadata kept for each (document_id, sheet_name) group."""

    # Taken from the chunk metadata's "filename"; used to build the result label.
    filename: str
    # Taken from the chunk metadata's "file_type"; only used in log output.
    file_type: str
# Metadata "file_type" values that route a document chunk to this executor.
_TABULAR_FILE_TYPES = ("csv", "xlsx")
# Number of LLM attempts per DataFrame before falling back to raw rows.
_MAX_RETRIES = 3
# System message for the operation-selection chain. `{schema}` and
# `{error_section}` are template variables supplied on every attempt; the
# braces inside the JSON examples are doubled so the prompt template renders
# them literally instead of treating them as variables.
_SYSTEM_PROMPT = """\
You are a data analyst. Given a DataFrame schema and a user question, \
decide which pandas operation to perform.
IMPORTANT rules:
- Use ONLY the exact column names as written in the schema below. Never translate or rename them.
- For top_n: always set value_col to the column to sort by. Do NOT use sort_col for top_n.
- For sort: use sort_col for the column to sort by.
- For filter with comparison (>, <, >=, <=, !=): set filter_operator accordingly (gt, lt, gte, lte, ne). Default is eq (==).
- For multi-condition filters (AND logic), use the filters field as a list of {{"col", "value", "op"}} dicts instead of filter_col/filter_value.
  Example: status=SUCCESS AND amount_paid>200000 → filters=[{{"col":"status","value":"SUCCESS","op":"eq"}},{{"col":"amount_paid","value":"200000","op":"gt"}}]
- For OR conditions on a column (e.g. value is A or B), use or_filters. Combine with filters for mixed AND+OR logic.
  Example: (status=FAILED OR status=REVERSED) AND payment_channel=X → or_filters=[{{"col":"status","value":"FAILED","op":"eq"}},{{"col":"status","value":"REVERSED","op":"eq"}}], filters=[{{"col":"payment_channel","value":"X","op":"eq"}}]
- For groupby with a pre-filter (e.g. count SUCCESS per channel): use filters or or_filters to narrow rows first, then use groupby_count/groupby_sum/groupby_avg on the filtered data by setting both filters and group_col.
Schema:
{schema}
{error_section}"""
# Structured-output schema the LLM fills in to pick one pandas operation;
# `_apply_operation` reads these fields to run it.
# NOTE(review): documented with `#` comments on purpose — a class docstring
# would be copied into the JSON schema pydantic generates for the model, and
# so into what the LLM sees. Confirm before converting to a docstring.
class TabularOperation(BaseModel):
    # Which branch of `_apply_operation` runs; anything it does not match
    # explicitly is treated as "raw".
    operation: Literal[
        "filter", "groupby_sum", "groupby_avg", "groupby_count",
        "top_n", "sort", "aggregate", "raw"
    ]
    group_col: str | None = None       # for groupby_*
    value_col: str | None = None       # for groupby_*, top_n, aggregate
    filter_col: str | None = None      # for single filter
    filter_value: str | None = None    # for single filter
    filter_operator: Literal["eq", "ne", "gt", "gte", "lt", "lte"] = "eq"  # for single filter
    filters: list[dict] | None = None     # for multi-condition AND: [{"col": ..., "value": ..., "op": ...}]
    or_filters: list[dict] | None = None  # for OR conditions, applied before AND filters
    sort_col: str | None = None        # for sort
    ascending: bool = True             # for sort
    n: int | None = None               # for top_n
    agg_func: Literal["sum", "avg", "min", "max", "count"] | None = None  # for aggregate
    # Required free-text justification; only logged, never used to compute results.
    reasoning: str
-def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.Series:
-    numeric = pd.to_numeric(df[col], errors="coerce")
-    if operator == "eq":
-        return df[col].astype(str) == str(value)
-    elif operator == "ne":
-        return df[col].astype(str) != str(value)
-    elif operator == "gt":
-        return numeric > float(value)
-    elif operator == "gte":
-        return numeric >= float(value)
-    elif operator == "lt":
-        return numeric < float(value)
-    elif operator == "lte":
-        return numeric <= float(value)
-    raise ValueError(f"Unknown operator: {operator}")
def _apply_single_filter(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.DataFrame:
    """Return the rows of ``df`` for which ``df[col] <operator> value`` holds."""
    keep = _get_filter_mask(df, col, value, operator)
    return df[keep]
-def _build_schema_context(df: pd.DataFrame) -> str:
-    lines = []
-    for col in df.columns:
-        sample = df[col].dropna().head(3).tolist()
-        lines.append(f"- {col} ({df[col].dtype}): sample values: {sample}")
-    return "\n".join(lines)
-def _apply_operation(df: pd.DataFrame, op: TabularOperation, limit: int) -> pd.DataFrame:
-    if op.operation == "groupby_sum":
-        if not op.group_col or not op.value_col:
-            raise ValueError(f"groupby_sum requires group_col and value_col, got {op}")
-        return df.groupby(op.group_col)[op.value_col].sum().reset_index().nlargest(limit, op.value_col)
-    elif op.operation == "groupby_avg":
-        if not op.group_col or not op.value_col:
-            raise ValueError(f"groupby_avg requires group_col and value_col, got {op}")
-        return df.groupby(op.group_col)[op.value_col].mean().reset_index().nlargest(limit, op.value_col)
-    elif op.operation == "groupby_count":
-        if not op.group_col:
-            raise ValueError(f"groupby_count requires group_col, got {op}")
-        df_filtered = df.copy()
-        if op.or_filters:
-            or_mask = pd.Series([False] * len(df_filtered), index=df_filtered.index)
-            for f in op.or_filters:
-                or_mask = or_mask | _get_filter_mask(df_filtered, f["col"], f["value"], f.get("op", "eq"))
-            df_filtered = df_filtered[or_mask]
-        if op.filters:
-            for f in op.filters:
-                df_filtered = _apply_single_filter(df_filtered, f["col"], f["value"], f.get("op", "eq"))
-        elif op.filter_col and op.filter_value is not None:
-            df_filtered = _apply_single_filter(df_filtered, op.filter_col, op.filter_value, op.filter_operator)
-        return df_filtered.groupby(op.group_col).size().reset_index(name="count").nlargest(limit, "count")
-    elif op.operation == "filter":
-        result = df.copy()
-        if op.or_filters:
-            or_mask = pd.Series([False] * len(result), index=result.index)
-            for f in op.or_filters:
-                or_mask = or_mask | _get_filter_mask(result, f["col"], f["value"], f.get("op", "eq"))
-            result = result[or_mask]
-        if op.filters:
-            for f in op.filters:
-                result = _apply_single_filter(result, f["col"], f["value"], f.get("op", "eq"))
-        elif op.filter_col and op.filter_value is not None and not op.or_filters:
-            result = _apply_single_filter(result, op.filter_col, op.filter_value, op.filter_operator)
-        elif not op.or_filters and not op.filters and (not op.filter_col or op.filter_value is None):
-            raise ValueError(f"filter requires filter_col/filter_value or filters or or_filters, got {op}")
-        return result.head(limit)
-    elif op.operation == "top_n":
-        col = op.value_col
-        if not col:
-            raise ValueError(f"top_n requires value_col, got {op}")
-        n = op.n or limit
-        return df.nlargest(n, col)
-    elif op.operation == "sort":
-        if not op.sort_col:
-            raise ValueError(f"sort requires sort_col, got {op}")
-        return df.sort_values(op.sort_col, ascending=op.ascending).head(limit)
-    elif op.operation == "aggregate":
-        if not op.value_col or not op.agg_func:
-            raise ValueError(f"aggregate requires value_col and agg_func, got {op}")
-        funcs = {"sum": "sum", "avg": "mean", "min": "min", "max": "max", "count": "count"}
-        value = getattr(df[op.value_col], funcs[op.agg_func])()
-        return pd.DataFrame([{op.value_col: value, "operation": op.agg_func}])
-    else:  # "raw"
-        return df.head(limit)
class TabularExecutor(BaseExecutor):
    """Answer a question over tabular documents by letting an LLM choose a pandas operation.

    For every distinct (document_id, sheet_name) among the tabular retrieval
    results, the stored Parquet data is loaded, the LLM picks a
    ``TabularOperation`` and pandas executes it.
    """

    def __init__(self) -> None:
        self._llm = AzureChatOpenAI(
            azure_deployment=settings.azureai_deployment_name_4o,
            openai_api_version=settings.azureai_api_version_4o,
            azure_endpoint=settings.azureai_endpoint_url_4o,
            api_key=settings.azureai_api_key_4o,
            temperature=0,
        )
        self._prompt = ChatPromptTemplate.from_messages(
            [("system", _SYSTEM_PROMPT), ("human", "{question}")]
        )
        structured_llm = self._llm.with_structured_output(TabularOperation)
        self._chain = self._prompt | structured_llm

    async def execute(
        self,
        results: list[RetrievalResult],
        user_id: str,
        _db: AsyncSession,
        question: str,
        limit: int = 100,
    ) -> list[QueryResult]:
        """Return one QueryResult per tabular document/sheet that could be queried.

        Results that are not tabular documents are ignored; groups whose
        processing fails are logged and left out of the returned list.
        """
        # One entry per (document_id, sheet_name) so each parquet is loaded once;
        # the first chunk seen for a key supplies its display metadata.
        groups: dict[tuple[str, str | None], _GroupInfo] = {}
        for item in results:
            if item.source_type != "document":
                continue
            data = item.metadata.get("data", {})
            if data.get("file_type") not in _TABULAR_FILE_TYPES:
                continue
            doc_id = data.get("document_id")
            if not doc_id:
                continue
            # sheet_name is None for CSV
            groups.setdefault(
                (doc_id, data.get("sheet_name")),
                {
                    "filename": data.get("filename", ""),
                    "file_type": data.get("file_type", ""),
                },
            )
        if not groups:
            return []

        outcomes = await asyncio.gather(*(
            self._process_group(user_id, question, limit, doc_id, sheet_name, info)
            for (doc_id, sheet_name), info in groups.items()
        ))
        return [outcome for outcome in outcomes if outcome is not None]

    async def _process_group(
        self,
        user_id: str,
        question: str,
        limit: int,
        doc_id: str,
        sheet_name: str | None,
        info: _GroupInfo,
    ) -> QueryResult | None:
        """Query one document/sheet; log and return None if anything fails."""
        try:
            df = await download_parquet(user_id, doc_id, sheet_name)
            df_result = await self._query_with_agent(df, question, limit)
            suffix = f" / sheet: {sheet_name}" if sheet_name else ""
            table_label = info["filename"] + suffix
            logger.info(
                "tabular query complete",
                document_id=doc_id,
                sheet=sheet_name,
                file_type=info["file_type"],
                rows=len(df_result),
                columns=len(df_result.columns),
            )
            return QueryResult(
                source_type="document",
                source_id=doc_id,
                table_or_file=table_label,
                columns=list(df_result.columns),
                rows=df_result.to_dict(orient="records"),
                row_count=len(df_result),
            )
        except Exception as exc:
            # Best effort: one failing document must not sink the others.
            logger.error(
                "tabular query failed",
                document_id=doc_id,
                sheet=sheet_name,
                error=str(exc),
            )
            return None

    async def _query_with_agent(
        self, df: pd.DataFrame, question: str, limit: int
    ) -> pd.DataFrame:
        """Ask the LLM for an operation and run it, retrying with error feedback.

        After ``_MAX_RETRIES`` failed attempts the first ``limit`` raw rows
        are returned instead.
        """
        schema_ctx = _build_schema_context(df)
        last_error = ""
        for attempt_no in range(1, _MAX_RETRIES + 1):
            if last_error:
                error_section = f"Previous attempt failed: {last_error}\nFix the issue."
            else:
                error_section = ""
            try:
                op: TabularOperation = await self._chain.ainvoke({
                    "schema": schema_ctx,
                    "error_section": error_section,
                    "question": question,
                })
                logger.info(
                    "tabular operation decided",
                    operation=op.operation,
                    reasoning=op.reasoning,
                )
                return _apply_operation(df, op, limit)
            except Exception as exc:
                # Feed the failure back into the next prompt.
                last_error = str(exc)
                logger.warning("tabular agent error", attempt=attempt_no, error=last_error)
        # Fallback: return raw rows
        logger.warning("tabular agent failed after retries, returning raw rows")
        return df.head(limit)
# Module-level instance imported by src/query/query_executor.py. Creating it
# runs TabularExecutor.__init__, which builds the AzureChatOpenAI client from
# settings at import time.
tabular_executor = TabularExecutor()

src/query/query_executor.py DELETED Viewed

@@ -1,42 +0,0 @@
-"""QueryExecutor — dispatches retrieval results to the appropriate executor by source_type."""
-import asyncio
-from sqlalchemy.ext.asyncio import AsyncSession
-from src.middlewares.logging import get_logger
-from src.query.base import QueryResult
-from src.query.executors.db_executor import db_executor
-from src.query.executors.tabular import tabular_executor
-from src.retrieval.base import RetrievalResult
-logger = get_logger("query_executor")
class QueryExecutor:
    """Fan retrieval results out to every source-specific executor and merge what they return."""

    async def execute(
        self,
        results: list[RetrievalResult],
        user_id: str,
        db: AsyncSession,
        question: str,
        limit: int = 100,
    ) -> list[QueryResult]:
        """Run the DB and tabular executors concurrently over ``results``.

        Each executor receives the full ``results`` list and picks out the
        entries it handles. An executor that fails is logged and skipped, so
        the other executor's results are still returned.

        Returns:
            The concatenated results, DB executor first, then tabular.
        """
        executors = (("db", db_executor), ("tabular", tabular_executor))
        batches = await asyncio.gather(
            *(
                executor.execute(results, user_id, db, question, limit)
                for _, executor in executors
            ),
            return_exceptions=True,
        )
        query_results: list[QueryResult] = []
        for (name, _), batch in zip(executors, batches):
            # BaseException, not Exception: with return_exceptions=True gather
            # can hand back CancelledError, which is not an Exception subclass
            # and would otherwise reach extend() and raise TypeError.
            if isinstance(batch, BaseException):
                logger.error("executor failed", executor=name, error=str(batch))
                continue
            query_results.extend(batch)
        logger.info("query execution complete", total=len(query_results))
        return query_results
# Shared module-level QueryExecutor instance.
query_executor = QueryExecutor()