julkarnaeen commited on
Commit
27514ca
·
verified ·
1 Parent(s): 1a07f7f

Update databot.py

Browse files
Files changed (1) hide show
  1. databot.py +309 -417
databot.py CHANGED
@@ -1,418 +1,310 @@
1
- import os
2
- import re
3
- import time
4
- import sys
5
- import json
6
- from dotenv import load_dotenv # pyre-ignore[21]
7
- from sqlalchemy import create_engine, text# pyre-ignore[21]
8
- from openai import OpenAI as OpenAIClient
9
-
10
- # Load credentials from .env
11
- load_dotenv()
12
-
13
- # Path to security config
14
- CONFIG_PATH = os.path.join(os.path.dirname(__file__), "data_access_config.json")
15
-
16
-
17
- class DataBot:
18
- def __init__(self):
19
- # 1. Database Connection Details
20
- self.db_user = os.getenv("DB_USER")
21
- self.db_pass = os.getenv("DB_PASSWORD")
22
- self.db_host = os.getenv("DB_HOST", "51.89.104.26")
23
- self.db_name = os.getenv("DB_NAME", "dev_poly")
24
- self.port = "3306"
25
-
26
- # 2. Initialize OpenAI client directly
27
- self.client = OpenAIClient(api_key=os.getenv("OPENAI_API_KEY"))
28
- self.model = os.getenv("LLM_MODEL", "gpt-4o")
29
-
30
- # 3. Load data access security config
31
- self.access_config: dict = self._load_access_config()
32
-
33
- # 4. Establish MySQL Connection with timeouts
34
- self.engine = create_engine(
35
- f"mysql+pymysql://{self.db_user}:{self.db_pass}@{self.db_host}:{self.port}/{self.db_name}?charset=utf8",
36
- connect_args={
37
- "connect_timeout": 30,
38
- "read_timeout": 60,
39
- "write_timeout": 60,
40
- },
41
- pool_pre_ping=True,
42
- pool_recycle=300,
43
- )
44
-
45
- # 5. Cache the schema once at startup (filtered by access config)
46
- print("Loading database schema...")
47
- raw_schema = self._load_raw_schema_with_retry()
48
- self.schema_info = self._filter_schema(raw_schema)
49
- print(f"Loaded schema for {len(self.schema_info)} accessible tables "
50
- f"(filtered from {len(raw_schema)} total).")
51
-
52
- # ── Security Config ──────────────────────────────────────────────
53
-
54
- def _load_access_config(self) -> dict:
55
- """Load data access security configuration."""
56
- try:
57
- with open(CONFIG_PATH, "r", encoding="utf-8") as f:
58
- config = json.load(f)
59
- print("Loaded data access security config.")
60
- return config
61
- except FileNotFoundError:
62
- print("WARNING: data_access_config.json not found! All tables accessible.")
63
- return {}
64
- except json.JSONDecodeError as e:
65
- print(f"WARNING: Invalid config JSON: {e}. All tables accessible.")
66
- return {}
67
-
68
- def _is_table_allowed(self, table_name):
69
- """Check if a table is allowed based on the access config."""
70
- if not self.access_config:
71
- return True # No config = allow all (backward compatible)
72
-
73
- table_lower = table_name.lower()
74
-
75
- # First check restricted (takes priority)
76
- for prefix in self.access_config.get("restricted_table_prefixes", []):
77
- if table_lower.startswith(prefix.lower()):
78
- return False
79
-
80
- # Then check allowed
81
- for prefix in self.access_config.get("allowed_table_prefixes", []):
82
- if table_lower.startswith(prefix.lower()):
83
- return True
84
-
85
- # Default: deny (whitelist approach)
86
- return False
87
-
88
- def _is_column_allowed(self, column_name):
89
- """Check if a column is allowed based on restricted column patterns."""
90
- if not self.access_config:
91
- return True
92
-
93
- col_lower = column_name.lower()
94
- for restricted_col in self.access_config.get("restricted_columns", []):
95
- if restricted_col.lower() == col_lower:
96
- return False
97
-
98
- return True
99
-
100
- def _filter_schema(self, raw_schema: dict[str, list[str]]) -> dict[str, list[str]]:
101
- """Filter the raw schema to remove restricted tables and columns."""
102
- filtered: dict[str, list[str]] = {}
103
- blocked_tables: list[str] = []
104
-
105
- for table, columns in raw_schema.items():
106
- if not self._is_table_allowed(table):
107
- blocked_tables.append(table)
108
- continue
109
-
110
- # Filter out restricted columns
111
- safe_columns = []
112
- for col_entry in columns:
113
- # col_entry format: "column_name (type)"
114
- col_name = col_entry.split(" (")[0].strip()
115
- if self._is_column_allowed(col_name):
116
- safe_columns.append(col_entry)
117
-
118
- if safe_columns:
119
- filtered[table] = safe_columns
120
-
121
- if blocked_tables:
122
- print(f" → Blocked {len(blocked_tables)} restricted tables.")
123
-
124
- return filtered
125
-
126
- # ── SQL Security Validation ──────────────────────────────────────
127
-
128
- def _validate_sql_security(self, sql):
129
- """
130
- Validate that a generated SQL query doesn't reference restricted tables or columns.
131
- Returns (is_safe, reason) tuple.
132
- """
133
- if not self.access_config:
134
- return True, ""
135
-
136
- sql_upper = sql.upper()
137
-
138
- # Check for write operations (extra safety)
139
- write_ops = ["INSERT", "UPDATE", "DELETE", "DROP", "ALTER", "TRUNCATE", "CREATE"]
140
- for op in write_ops:
141
- if re.search(rf'\b{op}\b', sql_upper):
142
- return False, f"Write operation '{op}' is not allowed."
143
-
144
- sql_lower = sql.lower()
145
-
146
- # Check for restricted tables
147
- for prefix in self.access_config.get("restricted_table_prefixes", []):
148
- # Look for table references with word boundaries
149
- pattern = rf'\b{re.escape(prefix.lower())}\w*\b'
150
- if re.search(pattern, sql_lower):
151
- return False, f"Query references restricted data ('{prefix}*' tables). Access denied."
152
-
153
- # Check for restricted columns in SELECT
154
- for restricted_col in self.access_config.get("restricted_columns", []):
155
- pattern = rf'\b{re.escape(restricted_col.lower())}\b'
156
- if re.search(pattern, sql_lower):
157
- return False, f"Query references restricted column '{restricted_col}'. Access denied."
158
-
159
- return True, ""
160
-
161
- # ── Core Schema Loading ──────────────────────────────────────────
162
-
163
- def _load_raw_schema_with_retry(self, max_retries=3, delay=5) -> dict[str, list[str]]:
164
- """Try to load the schema with retry logic for connection failures."""
165
- for attempt in range(1, max_retries + 1):
166
- try:
167
- return self._load_raw_schema()
168
- except Exception as e:
169
- if attempt < max_retries:
170
- print(f" ✗ Connection attempt {attempt}/{max_retries} failed. "
171
- f"Retrying in {delay}s...")
172
- time.sleep(delay)
173
- else:
174
- print(f"\n{'='*60}")
175
- print(f"ERROR: Cannot connect to MySQL server")
176
- print(f" Host: {self.db_host}:{self.port}")
177
- print(f" Database: {self.db_name}")
178
- print(f" Tried {max_retries} times.")
179
- print(f"")
180
- print(f" Possible causes:")
181
- print(f" 1. MySQL service is down on the server")
182
- print(f" 2. Firewall is blocking port {self.port}")
183
- print(f" 3. Wrong host/credentials in .env")
184
- print(f"{'='*60}")
185
- raise SystemExit(1) from e
186
- return {} # unreachable, but satisfies type checker
187
-
188
- def _load_raw_schema(self) -> dict[str, list[str]]:
189
- """Load ALL table names and columns from INFORMATION_SCHEMA (one fast query)."""
190
- query = text("""
191
- SELECT TABLE_NAME, COLUMN_NAME, COLUMN_TYPE
192
- FROM INFORMATION_SCHEMA.COLUMNS
193
- WHERE TABLE_SCHEMA = :db_name
194
- ORDER BY TABLE_NAME, ORDINAL_POSITION
195
- """)
196
- schema: dict[str, list[str]] = {}
197
- with self.engine.connect() as conn:
198
- result = conn.execute(query, {"db_name": self.db_name})
199
- for row in result:
200
- table = row[0]
201
- col_name = row[1]
202
- col_type = row[2]
203
- if table not in schema:
204
- schema[table] = []
205
- schema[table].append(f"{col_name} ({col_type})")
206
- return schema
207
-
208
- # ── LLM Pipeline ─────────────────────────────────────────────────
209
-
210
- def _pick_relevant_tables(self, question, max_tables=5):
211
- """Use GPT to quickly pick relevant tables from ALLOWED table names only."""
212
- table_names: list[str] = list(self.schema_info.keys())
213
- table_list = ", ".join(table_names)
214
-
215
- response = self.client.chat.completions.create(
216
- model=self.model,
217
- temperature=0,
218
- max_tokens=200,
219
- messages=[{
220
- "role": "system",
221
- "content": (
222
- "You are a bilingual database assistant. You understand questions in English and French "
223
- "and pick the most relevant tables.\n"
224
- "You can ONLY pick tables from the provided list. "
225
- "Return ONLY comma-separated table names from the list, nothing else."
226
- )
227
- }, {
228
- "role": "user",
229
- "content": (
230
- f"Here are the ONLY accessible MySQL table names from the dev_poly ERP database:\n"
231
- f"{table_list}\n\n"
232
- f"Question: \"{question}\"\n\n"
233
- f"Pick the {max_tables} most relevant tables to answer this question. "
234
- f"Return ONLY comma-separated table names, nothing else. "
235
- f"You may ONLY use tables from the list above."
236
- )
237
- }]
238
- )
239
- content = response.choices[0].message.content or ""
240
- suggested = [t.strip().strip("'\"` ") for t in content.split(",")]
241
- valid = [t for t in suggested if t in self.schema_info]
242
- fallback: list[str] = table_names[:5]
243
- return valid if valid else fallback
244
-
245
- def _build_schema_context(self, tables: list[str]):
246
- """Build a compact schema string for the selected tables."""
247
- parts: list[str] = []
248
- for table in tables:
249
- if table in self.schema_info:
250
- cols = ", ".join(self.schema_info[table])
251
- parts.append(f"Table '{table}': {cols}")
252
- return "\n".join(parts)
253
-
254
- def _generate_sql(self, question, schema_context):
255
- """Ask GPT to generate a SELECT query with security constraints."""
256
- response = self.client.chat.completions.create(
257
- model=self.model,
258
- temperature=0,
259
- max_tokens=500,
260
- messages=[{
261
- "role": "system",
262
- "content": (
263
- "You are DataBot, a bilingual SQL expert for the dev_poly ERP MySQL database. "
264
- "You understand questions in English and French.\n\n"
265
- "TASK: Generate ONLY ONE single SELECT statement based on the user's question.\n"
266
- "- No INSERT/UPDATE/DELETE. Only SELECT queries.\n"
267
- "- Do NOT generate multiple queries. Combine data using JOINs or subqueries into ONE query.\n"
268
- "- Do NOT use semicolons.\n"
269
- "- Return ONLY the raw SQL query, no explanation, no markdown.\n"
270
- "- Only use the tables and columns provided in the schema context below.\n"
271
- "- NEVER reference tables or columns not in the provided schema.\n\n"
272
- "SPECIAL CASES:\n"
273
- "- If the user's message is NOT a database question (e.g. greetings, chitchat, "
274
- "general knowledge questions unrelated to the database), return ONLY the text: NOT_A_QUERY\n"
275
- "- If the user explicitly asks for passwords, bank account numbers, or identity document "
276
- "numbers, return ONLY the text: SECURITY_BLOCK\n\n"
277
- "IMPORTANT: If the user asks a legitimate business question (like counts, totals, lists) "
278
- "about ANY topic and the schema provides relevant tables, generate the SQL. "
279
- "Do NOT block queries just because they mention employees, staff, or people — "
280
- "the schema you receive has already been filtered for security."
281
- )
282
- }, {
283
- "role": "user",
284
- "content": f"Schema:\n{schema_context}\n\nQuestion: {question}"
285
- }]
286
- )
287
- raw_sql = response.choices[0].message.content
288
- if not raw_sql:
289
- return "NOT_A_QUERY"
290
- sql = raw_sql.strip()
291
-
292
- # Check for special LLM responses
293
- if "SECURITY_BLOCK" in sql.upper():
294
- return "SECURITY_BLOCK"
295
- if "NOT_A_QUERY" in sql.upper():
296
- return "NOT_A_QUERY"
297
-
298
- # Clean up any markdown formatting
299
- sql = sql.replace("```sql", "").replace("```", "").strip()
300
- # Safety: if GPT returned multiple statements, keep only the first one
301
- if ";" in sql:
302
- sql = sql.split(";")[0].strip()
303
- return sql
304
-
305
- def _execute_sql(self, sql):
306
- """Execute the SQL and return results."""
307
- with self.engine.connect() as conn:
308
- result = conn.execute(text(sql))
309
- columns = list(result.keys())
310
- rows = [dict(zip(columns, row)) for row in result.fetchall()]
311
- return columns, rows
312
-
313
- def _summarize_results(self, question, sql, columns, rows):
314
- """Ask GPT to summarize the results in a well-structured, insightful way."""
315
- # Limit rows to avoid token overflow
316
- display_rows = rows[:50]
317
- total_count = len(rows)
318
- truncated = total_count > 50
319
-
320
- result_text = f"Columns: {columns}\nRows ({total_count} total"
321
- if truncated:
322
- result_text += f", showing first 50"
323
- result_text += "):\n"
324
- for row in display_rows:
325
- result_text += str(row) + "\n"
326
-
327
- response = self.client.chat.completions.create(
328
- model=self.model,
329
- temperature=0.3,
330
- max_tokens=2000,
331
- messages=[{
332
- "role": "system",
333
- "content": (
334
- "You are DataBot, an intelligent ERP database assistant for the dev_poly system. "
335
- "Your job is to answer the user's question based on the SQL query results provided.\n\n"
336
- "RESPONSE GUIDELINES:\n"
337
- "1. **Answer the question directly first** — start with a clear, direct answer to what "
338
- "the user asked. Don't start with 'Based on the query results...' or similar filler.\n"
339
- "2. **Be specific with numbers** — always include exact counts, totals, amounts, "
340
- "and percentages where relevant. Round monetary values to 2 decimal places.\n"
341
- "3. **Use structured formatting** when presenting multiple items:\n"
342
- " - Use numbered lists or bullet points for lists of items\n"
343
- " - Use simple text tables for comparisons (align columns with spaces)\n"
344
- " - Bold important values or key findings using **bold**\n"
345
- "4. **Add brief insights** — after presenting the data, add 1-2 sentences of "
346
- "business-relevant observations if applicable (e.g., trends, outliers, notable patterns).\n"
347
- "5. **Handle empty results gracefully** — if there are 0 rows, say so clearly and "
348
- "suggest possible reasons (e.g., 'No matching records found. This could mean the "
349
- "date range has no activity, or the filter criteria may be too narrow.').\n"
350
- "6. **Keep it conversational but professional** — write as a knowledgeable business "
351
- "analyst would speak, not as a robotic data dump.\n"
352
- "7. **If results are truncated** (showing partial data), mention that more records exist "
353
- "and the summary covers the displayed portion.\n"
354
- "8. **Match the user's language** — always reply in the same language the user "
355
- "used in their question. If they asked in French, respond entirely in French. "
356
- "If in English, respond in English. Only these two languages are supported.\n\n"
357
- "SECURITY RULES:\n"
358
- "- NEVER include personal data: phone numbers, email addresses, passwords, salaries, "
359
- "bank account numbers, identity document numbers, or home addresses.\n"
360
- "- If the results contain such data, omit it and note that it is restricted.\n"
361
- "- Focus on business-relevant information: counts, totals, trends, entity names, and statuses."
362
- )
363
- }, {
364
- "role": "user",
365
- "content": (
366
- f"Question: {question}\n"
367
- f"SQL executed: {sql}\n"
368
- f"Results:\n{result_text}"
369
- )
370
- }]
371
- )
372
- return (response.choices[0].message.content or "").strip()
373
-
374
- # ── Main Entry Point ─────────────────────────────────────────────
375
-
376
- def ask(self, question):
377
- """Processes a natural language question and returns an answer."""
378
- try:
379
- # Step 1: Pick relevant tables (fast LLM call, only from allowed tables)
380
- relevant_tables = self._pick_relevant_tables(question)
381
- print(f" → Tables: {', '.join(relevant_tables)}")
382
-
383
- # Step 2: Build schema context from cache (instant, no DB query)
384
- schema_context = self._build_schema_context(relevant_tables)
385
-
386
- # Step 3: Generate SQL (fast LLM call with security prompt)
387
- sql = self._generate_sql(question, schema_context)
388
-
389
- # Step 3a: Check for non-database questions
390
- if sql == "NOT_A_QUERY":
391
- return ("Hello! I'm DataBot, your ERP database assistant. "
392
- "Ask me questions about your business data and I'll "
393
- "query the database to find the answer for you.")
394
-
395
- # Step 3b: Check if the LLM blocked the query
396
- if sql == "SECURITY_BLOCK":
397
- return ("I'm sorry, but I cannot provide that information. "
398
- "Your request involves sensitive or personal data "
399
- "(such as salaries, passwords, phone numbers, or identity details) "
400
- "which I am not authorized to access.")
401
-
402
- print(f" → SQL: {sql}")
403
-
404
- # Step 3c: Validate SQL doesn't reference restricted tables/columns
405
- is_safe, reason = self._validate_sql_security(sql)
406
- if not is_safe:
407
- print(f" → SECURITY BLOCK: {reason}")
408
- return ("I'm sorry, but I cannot execute that query. "
409
- f"Security check: {reason}")
410
-
411
- # Step 4: Execute SQL (one fast query)
412
- columns, rows = self._execute_sql(sql)
413
-
414
- # Step 5: Summarize results (fast LLM call)
415
- return self._summarize_results(question, sql, columns, rows)
416
-
417
- except Exception as e:
418
  return f"DataBot Error: {str(e)}"
 
1
+ import os
2
+ import re
3
+ import time
4
+ import json
5
+ from dotenv import load_dotenv # pyre-ignore[21]
6
+ from sqlalchemy import create_engine, text # pyre-ignore[21]
7
+ from openai import OpenAI as OpenAIClient
8
+
9
+ load_dotenv()
10
+
11
+ # Config file paths
12
+ BASE_DIR = os.path.dirname(__file__)
13
+
14
+
15
+ def _load_json(path, name):
16
+ try:
17
+ with open(path, "r", encoding="utf-8") as f:
18
+ return json.load(f)
19
+ except (FileNotFoundError, json.JSONDecodeError) as e:
20
+ print(f" ✗ {name}: {e}")
21
+ return {}
22
+
23
+
24
+ class DataBot:
25
+ def __init__(self):
26
+ print("Loading configurations...")
27
+ self.db_cfg = _load_json(os.path.join(BASE_DIR, "db_config.json"), "db_config")
28
+ self.ai_cfg = _load_json(os.path.join(BASE_DIR, "ai_config.json"), "ai_config")
29
+ self.prompts = _load_json(os.path.join(BASE_DIR, "prompts_config.json"), "prompts_config")
30
+ self.access_cfg = _load_json(os.path.join(BASE_DIR, "data_access_config.json"), "data_access_config")
31
+
32
+ # Query limits
33
+ ql = self.db_cfg.get("query_limits", {})
34
+ self.MAX_ROWS = ql.get("max_rows", 100)
35
+ self.MAX_QUERY_TIME = ql.get("max_query_time_seconds", 30)
36
+ self.MAX_JOIN_TABLES = ql.get("max_join_tables", 3)
37
+
38
+ # Pre-cache restricted columns as a lowercase set (used on every column check)
39
+ self._restricted_cols = {c.lower() for c in self.access_cfg.get("restricted_columns", [])}
40
+
41
+ # AI model
42
+ self.model = self.ai_cfg.get("model", os.getenv("LLM_MODEL", "gpt-4o"))
43
+ self.client = OpenAIClient(api_key=os.getenv("OPENAI_API_KEY"))
44
+
45
+ # Database engine
46
+ conn_cfg = self.db_cfg.get("connection", {})
47
+ timeouts = self.db_cfg.get("timeouts", {})
48
+ pool = self.db_cfg.get("pool", {})
49
+ self.db_user = os.getenv("DB_USER")
50
+ self.db_pass = os.getenv("DB_PASSWORD")
51
+ self.db_host = os.getenv("DB_HOST", conn_cfg.get("host", "51.89.104.26"))
52
+ self.db_name = os.getenv("DB_NAME", conn_cfg.get("database", "dev_poly"))
53
+ self.port = conn_cfg.get("port", "3306")
54
+
55
+ self.engine = create_engine(
56
+ f"mysql+pymysql://{self.db_user}:{self.db_pass}@{self.db_host}:{self.port}/{self.db_name}?charset={conn_cfg.get('charset', 'utf8')}",
57
+ connect_args={
58
+ "connect_timeout": timeouts.get("connect_timeout", 30),
59
+ "read_timeout": timeouts.get("read_timeout", 60),
60
+ "write_timeout": timeouts.get("write_timeout", 60),
61
+ },
62
+ pool_pre_ping=pool.get("pool_pre_ping", True),
63
+ pool_recycle=pool.get("pool_recycle", 300),
64
+ )
65
+
66
+ # Load and filter schema
67
+ print("Loading database schema...")
68
+ schema_cfg = self.db_cfg.get("schema_loading", {})
69
+ raw = self._load_schema(schema_cfg.get("max_retries", 3), schema_cfg.get("retry_delay_seconds", 5))
70
+ self.schema_info = self._filter_schema(raw)
71
+ print(f"Loaded {len(self.schema_info)} accessible tables (from {len(raw)} total).")
72
+
73
+ # ── Schema ────────────────────────────────────────────────────────
74
+
75
+ def _load_schema(self, retries=3, delay=5):
76
+ for attempt in range(1, retries + 1):
77
+ try:
78
+ schema = {}
79
+ with self.engine.connect() as conn:
80
+ rows = conn.execute(text(
81
+ "SELECT TABLE_NAME, COLUMN_NAME, COLUMN_TYPE "
82
+ "FROM INFORMATION_SCHEMA.COLUMNS "
83
+ "WHERE TABLE_SCHEMA = :db ORDER BY TABLE_NAME, ORDINAL_POSITION"
84
+ ), {"db": self.db_name})
85
+ for r in rows:
86
+ schema.setdefault(r[0], []).append(f"{r[1]} ({r[2]})")
87
+ return schema
88
+ except Exception as e:
89
+ if attempt < retries:
90
+ print(f" ✗ Attempt {attempt}/{retries} failed, retrying in {delay}s...")
91
+ time.sleep(delay)
92
+ else:
93
+ print(f"ERROR: Cannot connect to {self.db_host}:{self.port}/{self.db_name}")
94
+ raise SystemExit(1) from e
95
+ return {}
96
+
97
+ def _filter_schema(self, raw):
98
+ if not self.access_cfg:
99
+ return raw
100
+ filtered = {}
101
+ blocked = 0
102
+ for table, cols in raw.items():
103
+ if not self._table_allowed(table):
104
+ blocked += 1
105
+ continue
106
+ safe = [c for c in cols if self._column_allowed(c.split(" (")[0].strip())]
107
+ if safe:
108
+ filtered[table] = safe
109
+ if blocked:
110
+ print(f" → Blocked {blocked} restricted tables.")
111
+ return filtered
112
+
113
+ def _table_allowed(self, name):
114
+ if not self.access_cfg:
115
+ return True
116
+ t = name.lower()
117
+ for p in self.access_cfg.get("restricted_table_prefixes", []):
118
+ if t.startswith(p.lower()):
119
+ return False
120
+ for p in self.access_cfg.get("allowed_table_prefixes", []):
121
+ if t.startswith(p.lower()):
122
+ return True
123
+ return False
124
+
125
+ def _column_allowed(self, name):
126
+ if not self.access_cfg:
127
+ return True
128
+ return name.lower() not in self._restricted_cols
129
+
130
+ # ── Security & Limits ─────────────────────────────────────────────
131
+
132
+ def _validate_security(self, sql):
133
+ if not self.access_cfg:
134
+ return True, ""
135
+ sql_up = sql.upper()
136
+ for op in ("INSERT", "UPDATE", "DELETE", "DROP", "ALTER", "TRUNCATE", "CREATE"):
137
+ if re.search(rf'\b{op}\b', sql_up):
138
+ return False, f"Write operation '{op}' is not allowed."
139
+ sql_lo = sql.lower()
140
+ for prefix in self.access_cfg.get("restricted_table_prefixes", []):
141
+ if re.search(rf'\b{re.escape(prefix.lower())}\w*\b', sql_lo):
142
+ return False, f"Restricted data ('{prefix}*' tables). Access denied."
143
+ for col in self.access_cfg.get("restricted_columns", []):
144
+ if re.search(rf'\b{re.escape(col.lower())}\b', sql_lo):
145
+ return False, f"Restricted column '{col}'. Access denied."
146
+ return True, ""
147
+
148
+ def _validate_complexity(self, sql):
149
+ sql_up = sql.upper()
150
+ if "CROSS JOIN" in sql_up:
151
+ return False, "CROSS JOIN is not allowed."
152
+ if len(re.findall(r'\bJOIN\b', sql_up)) > self.MAX_JOIN_TABLES:
153
+ return False, f"Too many JOINs (max {self.MAX_JOIN_TABLES}). Simplify your question."
154
+ if re.search(r'SELECT\s+\*', sql_up) and not re.search(r'SELECT\s+COUNT\s*\(\s*\*\s*\)', sql_up):
155
+ return False, "SELECT * is not allowed. Specific columns must be selected."
156
+ has_where = bool(re.search(r'\bWHERE\b', sql_up))
157
+ has_agg = bool(re.search(r'SELECT\s+(COUNT|SUM|AVG|MIN|MAX)\s*\(', sql_up))
158
+ has_group = bool(re.search(r'\bGROUP\s+BY\b', sql_up))
159
+ if not has_where and not has_agg and not has_group:
160
+ return False, "No WHERE clause or aggregation. Add filters to your question."
161
+ return True, ""
162
+
163
+ def _enforce_limit(self, sql):
164
+ sql_up = sql.upper().strip()
165
+ # Skip pure aggregates without GROUP BY
166
+ if re.search(r'^SELECT\s+(COUNT|SUM|AVG|MIN|MAX)\s*\(', sql_up) and not re.search(r'\bGROUP\s+BY\b', sql_up):
167
+ return sql
168
+ m = re.search(r'\bLIMIT\s+(\d+)', sql_up)
169
+ if m:
170
+ if int(m.group(1)) > self.MAX_ROWS:
171
+ sql = re.sub(r'\bLIMIT\s+\d+', f'LIMIT {self.MAX_ROWS}', sql, flags=re.IGNORECASE)
172
+ return sql
173
+ return f"{sql.rstrip()} LIMIT {self.MAX_ROWS}"
174
+
175
+ # ── Prompt Helper ─────────────────────────────────────────────────
176
+
177
+ def _prompt(self, key, **kw):
178
+ t = self.prompts.get(key, "")
179
+ if not t:
180
+ print(f" ✗ WARNING: prompt '{key}' not found in prompts_config.json")
181
+ return ""
182
+ try:
183
+ return t.format(**kw)
184
+ except KeyError as e:
185
+ print(f" ✗ WARNING: missing placeholder {e} in prompt '{key}'")
186
+ return t
187
+
188
+ # ── LLM Pipeline ─────────────────────────────────────────────────
189
+
190
+ def _pick_tables(self, question):
191
+ cfg = self.ai_cfg.get("table_picker", {})
192
+ max_t = cfg.get("max_tables", 5)
193
+ names = list(self.schema_info.keys())
194
+ resp = self.client.chat.completions.create(
195
+ model=self.model,
196
+ temperature=cfg.get("temperature", 0),
197
+ max_tokens=cfg.get("max_tokens", 200),
198
+ messages=[
199
+ {"role": "system", "content": self._prompt("table_picker_system")},
200
+ {"role": "user", "content": self._prompt("table_picker_user",
201
+ db_name=self.db_name, table_list=", ".join(names),
202
+ question=question, max_tables=max_t)},
203
+ ]
204
+ )
205
+ picked = [t.strip().strip("'\"` ") for t in (resp.choices[0].message.content or "").split(",")]
206
+ valid = [t for t in picked if t in self.schema_info]
207
+ return valid or names[:max_t]
208
+
209
+ def _generate_sql(self, question, schema_ctx):
210
+ cfg = self.ai_cfg.get("sql_generator", {})
211
+ resp = self.client.chat.completions.create(
212
+ model=self.model,
213
+ temperature=cfg.get("temperature", 0),
214
+ max_tokens=cfg.get("max_tokens", 500),
215
+ messages=[
216
+ {"role": "system", "content": self._prompt("sql_generator_system",
217
+ db_name=self.db_name, max_rows=self.MAX_ROWS, max_join_tables=self.MAX_JOIN_TABLES)},
218
+ {"role": "user", "content": self._prompt("sql_generator_user",
219
+ schema_context=schema_ctx, question=question)},
220
+ ]
221
+ )
222
+ sql = (resp.choices[0].message.content or "").strip()
223
+ if "SECURITY_BLOCK" in sql.upper():
224
+ return "SECURITY_BLOCK"
225
+ if "NOT_A_QUERY" in sql.upper():
226
+ return "NOT_A_QUERY"
227
+ sql = sql.replace("```sql", "").replace("```", "").strip()
228
+ if ";" in sql:
229
+ sql = sql.split(";")[0].strip()
230
+ return sql
231
+
232
+ def _execute(self, sql):
233
+ with self.engine.connect() as conn:
234
+ # Try setting query timeout (MariaDB vs MySQL have different syntax)
235
+ try:
236
+ conn.execute(text(f"SET SESSION max_statement_time = {self.MAX_QUERY_TIME}"))
237
+ except Exception:
238
+ try:
239
+ conn.execute(text(f"SET SESSION MAX_EXECUTION_TIME = {self.MAX_QUERY_TIME * 1000}"))
240
+ except Exception:
241
+ pass # Neither supported — LIMIT and row cap still protect us
242
+ result = conn.execute(text(sql))
243
+ cols = list(result.keys())
244
+ batch = result.fetchmany(self.MAX_ROWS + 1)
245
+ rows = [dict(zip(cols, r)) for r in batch[:self.MAX_ROWS]]
246
+ if len(batch) > self.MAX_ROWS:
247
+ print(f" → Capped at {self.MAX_ROWS} rows")
248
+ return cols, rows
249
+
250
+ def _summarize(self, question, sql, cols, rows):
251
+ cfg = self.ai_cfg.get("summarizer", {})
252
+ max_disp = cfg.get("max_display_rows", 50)
253
+ shown = rows[:max_disp]
254
+ result_text = f"Columns: {cols}\nRows ({len(rows)} total"
255
+ if len(rows) > max_disp:
256
+ result_text += f", showing first {max_disp}"
257
+ result_text += "):\n" + "\n".join(str(r) for r in shown)
258
+
259
+ resp = self.client.chat.completions.create(
260
+ model=self.model,
261
+ temperature=cfg.get("temperature", 0.3),
262
+ max_tokens=cfg.get("max_tokens", 2000),
263
+ messages=[
264
+ {"role": "system", "content": self._prompt("summarizer_system", db_name=self.db_name)},
265
+ {"role": "user", "content": self._prompt("summarizer_user",
266
+ question=question, sql=sql, result_text=result_text)},
267
+ ]
268
+ )
269
+ return (resp.choices[0].message.content or "").strip()
270
+
271
+ # ── Main Entry ────────────────────────────────────────────────────
272
+
273
+ def ask(self, question):
274
+ try:
275
+ tables = self._pick_tables(question)
276
+ print(f" → Tables: {', '.join(tables)}")
277
+
278
+ schema_ctx = "\n".join(
279
+ f"Table '{t}': {', '.join(self.schema_info[t])}"
280
+ for t in tables if t in self.schema_info
281
+ )
282
+
283
+ sql = self._generate_sql(question, schema_ctx)
284
+
285
+ responses = self.prompts.get("responses", {})
286
+ if sql == "NOT_A_QUERY":
287
+ return responses.get("not_a_query", "I'm DataBot. Ask me about your business data.")
288
+ if sql == "SECURITY_BLOCK":
289
+ return responses.get("security_block", "Access denied: sensitive data requested.")
290
+
291
+ print(f" → SQL: {sql}")
292
+
293
+ ok, reason = self._validate_security(sql)
294
+ if not ok:
295
+ print(f" → BLOCKED: {reason}")
296
+ return responses.get("security_check_fail", "Query blocked: {reason}").format(reason=reason)
297
+
298
+ ok, reason = self._validate_complexity(sql)
299
+ if not ok:
300
+ print(f" → BLOCKED: {reason}")
301
+ return responses.get("complexity_fail", "Query too complex: {reason}").format(reason=reason)
302
+
303
+ sql = self._enforce_limit(sql)
304
+ print(f" → Final: {sql}")
305
+
306
+ cols, rows = self._execute(sql)
307
+ return self._summarize(question, sql, cols, rows)
308
+
309
+ except Exception as e:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
  return f"DataBot Error: {str(e)}"