Melika Kheirieh committed on
Commit
b794494
·
1 Parent(s): db1d448

feat(core): refine pipeline & verifier; improve Spider benchmark accuracy

Browse files
adapters/llm/openai_provider.py CHANGED
@@ -1,24 +1,16 @@
1
  from __future__ import annotations
2
- import os
3
  import json
 
 
 
 
4
  from adapters.llm.base import LLMProvider
5
  from openai import OpenAI
6
 
7
- # NOTE:
8
- # - Prefer proxy if PROXY_API_KEY and PROXY_BASE_URL are set.
9
- # - Otherwise, fallback to OPENAI_API_KEY (+ OPENAI_BASE_URL defaulting to https://api.openai.com/v1).
10
- # - Do NOT pass base_url/api_key in the constructor; rely on env vars.
11
-
12
 
13
  def _resolve_api_config() -> tuple[str, str, str]:
14
- """
15
- Returns (api_key, base_url, model_id) according to env.
16
- Resolution order:
17
- 1) Proxy: PROXY_API_KEY + PROXY_BASE_URL [+ PROXY_MODEL_ID]
18
- 2) Direct: OPENAI_API_KEY [+ OPENAI_BASE_URL] [+ OPENAI_MODEL_ID]
19
- Additionally, LLM_MODEL_ID (if set) overrides model choice.
20
- """
21
- # Optional global override for model id
22
  override_model = os.getenv("LLM_MODEL_ID")
23
 
24
  proxy_key = os.getenv("PROXY_API_KEY")
@@ -43,74 +35,146 @@ def _resolve_api_config() -> tuple[str, str, str]:
43
 
44
 
45
  class OpenAIProvider(LLMProvider):
 
 
46
  provider_id = "openai"
47
 
48
  def __init__(self) -> None:
49
- # Resolve and export to env so we don't pass into constructor.
50
  api_key, base_url, model = _resolve_api_config()
51
  os.environ["OPENAI_API_KEY"] = api_key
52
  os.environ["OPENAI_BASE_URL"] = base_url
53
- # Create client using env only
54
  self.client = OpenAI()
55
  self.model = model
56
 
57
- def plan(self, *, user_query, schema_preview):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  completion = self.client.chat.completions.create(
59
  model=self.model,
60
  messages=[
61
- {"role": "system", "content": "You create SQL query plans."},
62
- {
63
- "role": "user",
64
- "content": f"Query: {user_query}\nSchema:\n{schema_preview}",
65
- },
66
  ],
67
- temperature=0,
68
  )
69
- msg = completion.choices[0].message.content
 
70
  usage = completion.usage
71
- return (
72
- msg,
73
- usage.prompt_tokens,
74
- usage.completion_tokens,
75
- self._estimate_cost(usage),
76
- )
 
 
77
 
78
  def generate_sql(
79
- self, *, user_query, schema_preview, plan_text, clarify_answers=None
80
- ):
81
- prompt = f"""
82
- You are a precise SQL generator.
83
- Return ONLY valid JSON with two keys: "sql" and "rationale".
84
- Do not include any markdown, backticks, or extra text.
85
-
86
- Example:
87
- {{
88
- "sql": "SELECT * FROM singer;",
89
- "rationale": "The user requested to list all singers."
90
- }}
91
-
92
- Now generate JSON for this input:
93
-
94
- User query: {user_query}
95
- Schema preview:
96
- {schema_preview}
97
- Plan: {plan_text}
98
- Clarifications: {clarify_answers}
99
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  completion = self.client.chat.completions.create(
101
  model=self.model,
102
  messages=[
103
- {"role": "system", "content": "You convert natural language to SQL."},
104
- {"role": "user", "content": prompt},
105
  ],
106
- temperature=0,
 
107
  )
108
- content = completion.choices[0].message.content.strip()
 
 
109
  usage = completion.usage
110
- t_in = usage.prompt_tokens if usage else None
111
- t_out = usage.completion_tokens if usage else None
112
- cost = self._estimate_cost(usage) if usage else None
113
 
 
114
  try:
115
  parsed = json.loads(content)
116
  except json.JSONDecodeError:
@@ -126,35 +190,199 @@ class OpenAIProvider(LLMProvider):
126
 
127
  sql = (parsed.get("sql") or "").strip()
128
  rationale = parsed.get("rationale") or ""
 
 
 
 
129
  if not sql:
130
  raise ValueError("LLM returned empty 'sql'")
131
 
132
- return sql, rationale, t_in, t_out, cost
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
 
134
- def repair(self, *, sql, error_msg, schema_preview):
135
  completion = self.client.chat.completions.create(
136
  model=self.model,
137
  messages=[
138
- {
139
- "role": "system",
140
- "content": "You fix SQL queries keeping them SELECT-only.",
141
- },
142
- {
143
- "role": "user",
144
- "content": f"SQL:\n{sql}\nError:\n{error_msg}\nSchema:\n{schema_preview}",
145
- },
146
  ],
147
- temperature=0,
148
  )
149
- msg = completion.choices[0].message.content
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  usage = completion.usage
151
- return (
152
- msg,
153
- usage.prompt_tokens,
154
- usage.completion_tokens,
155
- self._estimate_cost(usage),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  )
157
 
158
- def _estimate_cost(self, usage):
159
- total = usage.prompt_tokens + usage.completion_tokens
160
- return total * 0.000001
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
+
3
  import json
4
+ import os
5
+ import re
6
+ from typing import Any, List, Tuple
7
+
8
  from adapters.llm.base import LLMProvider
9
  from openai import OpenAI
10
 
 
 
 
 
 
11
 
12
  def _resolve_api_config() -> tuple[str, str, str]:
13
+ """Returns (api_key, base_url, model_id) according to env."""
 
 
 
 
 
 
 
14
  override_model = os.getenv("LLM_MODEL_ID")
15
 
16
  proxy_key = os.getenv("PROXY_API_KEY")
 
35
 
36
 
37
  class OpenAIProvider(LLMProvider):
38
+ """OpenAI LLM provider implementation."""
39
+
40
  provider_id = "openai"
41
 
42
def __init__(self) -> None:
    """Initialize the OpenAI client with configuration taken from the environment."""
    api_key, base_url, model = _resolve_api_config()
    # Export the resolved credentials so OpenAI() reads them from the
    # environment rather than receiving them as constructor arguments.
    os.environ.update(
        OPENAI_API_KEY=api_key,
        OPENAI_BASE_URL=base_url,
    )
    self.model = model
    self.client = OpenAI()
49
 
50
def plan(
    self, *, user_query: str, schema_preview: str
) -> Tuple[str, int, int, float]:
    """Produce a step-by-step query plan for later SQL generation.

    Args:
        user_query: The user's natural language question
        schema_preview: Database schema information

    Returns:
        Tuple of (plan_text, prompt_tokens, completion_tokens, cost)
    """
    system_prompt = """You are a SQL query planning expert. Analyze the user's question and database schema to create a clear execution plan.

Your plan should:
1. Identify the tables and columns needed
2. Determine any JOINs required
3. Specify filtering conditions (WHERE)
4. Identify aggregations (GROUP BY, COUNT, etc.)
5. Note sorting requirements (ORDER BY)
6. Check for special cases (DISTINCT, LIMIT, etc.)

Be concise but thorough."""

    user_prompt = f"""Question: {user_query}

Database Schema:
{schema_preview}

Create a step-by-step plan to answer this question with SQL."""

    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    completion = self.client.chat.completions.create(
        model=self.model,
        messages=chat_messages,
        temperature=0.1,
    )

    # The API may return None content; normalize to an empty string.
    plan_text = completion.choices[0].message.content or ""
    usage = completion.usage

    # Without usage data we cannot report token counts or cost.
    if not usage:
        return (plan_text, 0, 0, 0.0)
    return (
        plan_text,
        usage.prompt_tokens,
        usage.completion_tokens,
        self._estimate_cost(usage),
    )
100
 
101
  def generate_sql(
102
+ self,
103
+ *,
104
+ user_query: str,
105
+ schema_preview: str,
106
+ plan_text: str,
107
+ clarify_answers: dict[str, Any] | None = None,
108
+ ) -> Tuple[str, str, int, int, float]:
109
+ """Generate SQL with improved prompt for Spider benchmark.
110
+
111
+ Args:
112
+ user_query: The user's natural language question
113
+ schema_preview: Database schema information
114
+ plan_text: Query execution plan
115
+ clarify_answers: Optional additional context
116
+
117
+ Returns:
118
+ Tuple of (sql, rationale, prompt_tokens, completion_tokens, cost)
 
 
 
119
  """
120
+ system_prompt = """You are an expert SQL query generator for SQLite databases.
121
+ You must follow these STRICT rules to generate clean, simple SQL:
122
+
123
+ CRITICAL RULES:
124
+ 1. Write the SIMPLEST possible SQL that answers the question
125
+ 2. NEVER use table prefixes unless absolutely necessary for disambiguation
126
+ 3. NEVER add aliases (AS) unless specifically requested
127
+ 4. NEVER add LIMIT unless the question asks for a specific number of results
128
+ 5. NEVER use DISTINCT with COUNT(*) unless explicitly needed
129
+ 6. Use lowercase for SQL keywords (select, from, where, etc.)
130
+ 7. Do not add unnecessary parentheses or formatting
131
+ 8. Match exact column and table names from the schema (case-sensitive)
132
+
133
+ IMPORTANT:
134
+ - For counting all rows: Use COUNT(*) not COUNT(column_name)
135
+ - For ordering: Only add ORDER BY if the question asks for sorted results
136
+ - Keep the SQL as close as possible to the minimal required syntax
137
+
138
+ You must return ONLY valid JSON with exactly two keys: "sql" and "rationale".
139
+ The SQL should be a single line without unnecessary spaces."""
140
+
141
+ user_prompt = f"""Based on this information, generate a simple SQL query:
142
+
143
+ Question: {user_query}
144
+
145
+ Database Schema:
146
+ {schema_preview}
147
+
148
+ Query Plan:
149
+ {plan_text}
150
+
151
+ Remember: Generate the SIMPLEST possible SQL. Avoid table prefixes, aliases, and unnecessary clauses.
152
+
153
+ Example of what we want:
154
+ Question: "How many singers are there?"
155
+ Correct: {{"sql": "select count(*) from singer", "rationale": "Count all rows in singer table"}}
156
+ Wrong: {{"sql": "SELECT COUNT(singer.singer_id) AS total_singers FROM singer", "rationale": "..."}}
157
+
158
+ Now generate the SQL for the given question:"""
159
+
160
+ if clarify_answers:
161
+ user_prompt += f"\n\nAdditional context: {clarify_answers}"
162
+
163
  completion = self.client.chat.completions.create(
164
  model=self.model,
165
  messages=[
166
+ {"role": "system", "content": system_prompt},
167
+ {"role": "user", "content": user_prompt},
168
  ],
169
+ temperature=0.1,
170
+ max_tokens=500,
171
  )
172
+
173
+ text = completion.choices[0].message.content
174
+ content = text.strip() if text else ""
175
  usage = completion.usage
 
 
 
176
 
177
+ # Parse JSON response
178
  try:
179
  parsed = json.loads(content)
180
  except json.JSONDecodeError:
 
190
 
191
  sql = (parsed.get("sql") or "").strip()
192
  rationale = parsed.get("rationale") or ""
193
+
194
+ # Post-process SQL to ensure simplicity
195
+ sql = self._simplify_sql(sql)
196
+
197
  if not sql:
198
  raise ValueError("LLM returned empty 'sql'")
199
 
200
+ if usage:
201
+ prompt_tokens = usage.prompt_tokens
202
+ completion_tokens = usage.completion_tokens
203
+ cost = self._estimate_cost(usage)
204
+ return (sql, rationale, prompt_tokens, completion_tokens, cost)
205
+ else:
206
+ return (sql, rationale, 0, 0, 0.0)
207
+
208
+ def _simplify_sql(self, sql: str) -> str:
209
+ """Post-process SQL to remove common unnecessary additions."""
210
+ if not sql:
211
+ return sql
212
+
213
+ # Remove trailing semicolon
214
+ sql = sql.rstrip(";")
215
+
216
+ # Remove unnecessary table prefixes in simple queries
217
+ # e.g., "singer.name" -> "name" when there's only one table
218
+ if sql.lower().count(" from ") == 1 and " join " not in sql.lower():
219
+ match = re.search(r"\bfrom\s+(\w+)", sql, re.IGNORECASE)
220
+ if match:
221
+ table = match.group(1)
222
+ sql = re.sub(rf"\b{table}\.(\w+)\b", r"\1", sql)
223
+
224
+ # Remove unnecessary DISTINCT in COUNT(*)
225
+ sql = re.sub(
226
+ r"count\s*\(\s*distinct\s+\*\s*\)",
227
+ "count(*)",
228
+ sql,
229
+ flags=re.IGNORECASE,
230
+ )
231
+
232
+ # Remove big default LIMITs that weren't requested
233
+ sql = re.sub(
234
+ r"\s+limit\s+(100|1000|10000)\b",
235
+ "",
236
+ sql,
237
+ flags=re.IGNORECASE,
238
+ )
239
+
240
+ return sql
241
+
242
def repair(
    self,
    *,
    sql: str,
    error_msg: str,
    schema_preview: str,
) -> Tuple[str, int, int, float]:
    """Ask the model to fix a failing SQL query with minimal changes.

    Args:
        sql: Broken SQL query
        error_msg: Error message from execution
        schema_preview: Database schema information

    Returns:
        Tuple of (fixed_sql, prompt_tokens, completion_tokens, cost)
    """
    system_prompt = """You are a SQL repair expert. Fix the given SQL query to resolve the error.

IMPORTANT RULES:
1. Keep the fix as minimal as possible
2. Don't add complexity - keep it simple
3. Preserve the original intent of the query
4. Follow SQLite syntax rules
5. Don't add aliases or table prefixes unless necessary

Return ONLY the corrected SQL query, nothing else."""

    user_prompt = f"""Fix this SQL query:

Original SQL: {sql}

Error: {error_msg}

Database Schema:
{schema_preview}

Return the corrected SQL (keep it simple):"""

    completion = self.client.chat.completions.create(
        model=self.model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.1,
    )

    raw = completion.choices[0].message.content
    candidate = raw.strip() if raw else ""

    # Strip accidental markdown code fences around the SQL.
    candidate = candidate.removeprefix("```sql")
    candidate = candidate.removeprefix("```")
    candidate = candidate.removesuffix("```")
    fixed_sql = self._simplify_sql(candidate.strip())

    usage = completion.usage
    # Without usage data we cannot report token counts or cost.
    if not usage:
        return (fixed_sql, 0, 0, 0.0)
    return (
        fixed_sql,
        usage.prompt_tokens,
        usage.completion_tokens,
        self._estimate_cost(usage),
    )
313
+
314
+ def _estimate_cost(self, usage: Any) -> float:
315
+ """Estimate cost based on token usage.
316
+
317
+ Args:
318
+ usage: OpenAI usage object with token counts
319
+
320
+ Returns:
321
+ Estimated cost in USD
322
+ """
323
+ if not usage:
324
+ return 0.0
325
+
326
+ # Pricing per 1K tokens (adjust based on model)
327
+ pricing = {
328
+ "gpt-4": {"input": 0.03, "output": 0.06},
329
+ "gpt-4-turbo": {"input": 0.01, "output": 0.03},
330
+ "gpt-4o": {"input": 0.005, "output": 0.015},
331
+ "gpt-4o-mini": {"input": 0.00015, "output": 0.0006},
332
+ "gpt-3.5-turbo": {"input": 0.0005, "output": 0.0015},
333
+ }
334
+
335
+ model_pricing = pricing.get(self.model, pricing["gpt-4o-mini"])
336
+
337
+ input_cost = (usage.prompt_tokens / 1000) * model_pricing["input"]
338
+ output_cost = (usage.completion_tokens / 1000) * model_pricing["output"]
339
+
340
+ return input_cost + output_cost
341
+
342
def clarify(
    self,
    *,
    user_query: str,
    schema_preview: str,
    questions: List[str],
) -> Tuple[str, int, int, float]:
    """Answer clarification questions about an ambiguous user query.

    Args:
        user_query: The user's natural language question
        schema_preview: Database schema information
        questions: List of clarification questions

    Returns:
        Tuple of (answers, prompt_tokens, completion_tokens, cost)
    """
    system_prompt = """You are a helpful assistant that clarifies SQL query requirements.
Answer the questions clearly and concisely based on the user's query and database schema."""

    # Number the questions 1., 2., ... one per line.
    numbered_questions = "\n".join(
        f"{idx}. {q}" for idx, q in enumerate(questions, start=1)
    )
    user_prompt = f"""User Query: {user_query}

Database Schema:
{schema_preview}

Please answer these clarification questions:
{numbered_questions}"""

    completion = self.client.chat.completions.create(
        model=self.model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.3,
    )

    # The API may return None content; normalize to an empty string.
    answers = completion.choices[0].message.content or ""
    usage = completion.usage

    # Without usage data we cannot report token counts or cost.
    if not usage:
        return (answers, 0, 0, 0.0)
    return (
        answers,
        usage.prompt_tokens,
        usage.completion_tokens,
        self._estimate_cost(usage),
    )
benchmarks/evaluate_spider_pro.py CHANGED
@@ -1,490 +1,446 @@
 
1
  """
2
- Pro evaluation runner with two modes:
3
- Extension of `evaluate_spider.py` with additional metrics (EM, SM, ExecAcc) and richer logging for research-style benchmarking.
4
-
5
- 1) Single-DB demo mode (default)
6
- - Runs a list of questions against one SQLite DB
7
- - Reports latency/ok (no EM/SM/ExecAcc because there's no gold SQL)
8
-
9
- 2) Spider mode (--spider)
10
- - Loads a subset of the Spider dataset via SPIDER_ROOT
11
- - For each item, builds a per-DB pipeline and computes:
12
- * EM (exact SQL string match, case-insensitive)
13
- * SM (structural match via sqlglot AST)
14
- * ExecAcc (result equivalence by executing gold vs. predicted SQL)
15
- - Also logs latency, (optional) traces, and aggregates a summary
16
-
17
- Works with:
18
- - Real LLM (OPENAI_API_KEY set)
19
- - Stub mode (PYTEST_CURRENT_TEST=1) for zero-cost offline runs
20
-
21
- Outputs:
22
- benchmarks/results_pro/<timestamp>/
23
- - eval.jsonl # per-sample rows
24
- - summary.json # aggregate metrics
25
- - results.csv # human-friendly table
26
-
27
- Examples:
28
- # Demo (single DB), stub mode
29
- PYTHONPATH=$PWD PYTEST_CURRENT_TEST=1 \
30
- python benchmarks/evaluate_spider_pro.py --db-path demo.db
31
-
32
- # Spider subset (20 items), stub mode
33
- export SPIDER_ROOT=$PWD/data/spider
34
- PYTHONPATH=$PWD PYTEST_CURRENT_TEST=1 \
35
- python benchmarks/evaluate_spider_pro.py --spider --split dev --limit 20
36
  """
37
 
38
  from __future__ import annotations
39
 
40
  import argparse
41
- import csv
42
  import json
43
- import os
 
44
  import time
 
 
45
  from pathlib import Path
46
- from typing import Any, Dict, List, Optional
47
-
48
- import sqlglot
49
- from sqlglot.errors import ParseError
50
 
51
  from nl2sql.pipeline_factory import pipeline_from_config_with_adapter
52
  from adapters.db.sqlite_adapter import SQLiteAdapter
 
 
 
53
 
54
- # Only needed for Spider mode
55
- try:
56
- from benchmarks.spider_loader import load_spider_sqlite, open_readonly_connection
57
- except Exception:
58
- load_spider_sqlite = None # type: ignore[assignment]
59
- open_readonly_connection = None # type: ignore[assignment]
60
-
61
- # Resolve repo root and default config path relative to this file (not CWD)
62
- THIS_DIR = Path(__file__).resolve().parent # .../benchmarks
63
- REPO_ROOT = THIS_DIR.parent # repo root
64
- CONFIG_PATH = str(REPO_ROOT / "configs" / "sqlite_pipeline.yaml")
65
-
66
-
67
- # Default demo questions for single-DB mode
68
- DEFAULT_DATASET: List[str] = [
69
- "list all customers",
70
- "show total invoices per country",
71
- "top 3 albums by total sales",
72
- "artists with more than 3 albums",
73
- "number of employees per city",
74
- ]
75
-
76
- RESULT_ROOT = Path("benchmarks") / "results_pro"
77
  TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
78
  RESULT_DIR = RESULT_ROOT / TIMESTAMP
79
 
80
 
81
- # -------------------- Utilities --------------------
82
 
83
 
84
- def _int_ms(start: float) -> int:
85
- """Convert elapsed seconds to integer milliseconds."""
86
- return int((time.perf_counter() - start) * 1000)
 
87
 
 
 
 
88
 
89
- def _derive_schema_preview_safe(pipeline_obj: Any) -> Optional[str]:
90
- """Safely call derive_schema_preview() if available on adapter/executor."""
91
- try:
92
- for c in (
93
- getattr(pipeline_obj, "executor", None),
94
- getattr(pipeline_obj, "adapter", None),
95
- ):
96
- if c and hasattr(c, "derive_schema_preview"):
97
- return c.derive_schema_preview() # type: ignore[no-any-return]
98
- except Exception:
99
- pass
100
- return None
101
-
102
-
103
- def _to_stage_list(trace_obj: Any) -> List[Dict[str, Any]]:
104
- """Normalize pipeline trace into a list of dicts for logging/export."""
105
- out: List[Dict[str, Any]] = []
106
- if not isinstance(trace_obj, list):
107
- return out
108
- for t in trace_obj:
109
- if isinstance(t, dict):
110
- stage = t.get("stage", "?")
111
- ms = t.get("duration_ms", 0)
112
- else:
113
- stage = getattr(t, "stage", "?")
114
- ms = getattr(t, "duration_ms", 0)
115
- try:
116
- out.append({"stage": str(stage), "ms": int(ms)})
117
- except Exception:
118
- out.append({"stage": str(stage), "ms": 0})
119
- return out
120
 
 
 
121
 
122
- def _parse_sql(sql: str):
123
- try:
124
- return sqlglot.parse_one(sql, read="sqlite")
125
- except ParseError:
126
- return None
127
-
128
-
129
- def _structural_match(pred: str, gold: str) -> bool:
130
- """AST-level equality via sqlglot; returns False if either side can't be parsed."""
131
- a, b = _parse_sql(pred), _parse_sql(gold)
132
- return (a == b) if (a is not None and b is not None) else False
133
-
134
-
135
- def _load_dataset_from_file(path: Optional[str]) -> List[str]:
136
- """Load questions from a JSON file: list[str] or list[{question: str}]."""
137
- if not path:
138
- return DEFAULT_DATASET
139
- p = Path(path)
140
- if not p.exists():
141
- raise FileNotFoundError(f"dataset file not found: {p}")
142
- data = json.loads(p.read_text(encoding="utf-8"))
143
- if isinstance(data, list):
144
- if all(isinstance(x, str) for x in data):
145
- return list(data)
146
- if all(isinstance(x, dict) and "question" in x for x in data):
147
- return [str(x["question"]) for x in data]
148
- raise ValueError(
149
- "Dataset file must be a JSON array of strings or objects with 'question' field."
150
  )
 
 
 
 
151
 
152
 
153
- def _extract_sql(result: Any) -> str:
154
- """
155
- Extract SQL from pipeline result in a mypy-friendly way.
156
- Supports both result.sql and result.data.sql shapes.
157
- """
158
- sql_pred: Optional[str] = getattr(result, "sql", None)
159
- if not sql_pred:
160
- data = getattr(result, "data", None)
161
- if data is not None:
162
- sql_pred = getattr(data, "sql", None)
163
- return (sql_pred or "").strip()
164
 
 
 
 
 
 
165
 
166
- def _save_outputs(rows: List[Dict[str, Any]], summary: Dict[str, Any]) -> None:
167
- """Persist JSONL + JSON summary + CSV for pro runner."""
168
- RESULT_DIR.mkdir(parents=True, exist_ok=True)
169
 
170
- jsonl_path = RESULT_DIR / "eval.jsonl"
171
- with jsonl_path.open("w", encoding="utf-8") as f:
172
- for r in rows:
173
- f.write(json.dumps(r, ensure_ascii=False) + "\n")
174
-
175
- with (RESULT_DIR / "summary.json").open("w", encoding="utf-8") as f:
176
- json.dump(summary, f, indent=2)
177
-
178
- csv_path = RESULT_DIR / "results.csv"
179
- # For pro, include pro columns when present (Spider mode)
180
- fieldnames = [
181
- "source",
182
- "db_id",
183
- "query",
184
- "em",
185
- "sm",
186
- "exec_acc",
187
- "ok",
188
- "latency_ms",
189
- ]
190
- with csv_path.open("w", newline="", encoding="utf-8") as f:
191
- wr = csv.DictWriter(f, fieldnames=fieldnames)
192
- wr.writeheader()
193
- for r in rows:
194
- wr.writerow(
195
- {
196
- "source": r.get("source", "demo"),
197
- "db_id": r.get("db_id", ""),
198
- "query": r.get("query", ""),
199
- "em": "✅" if r.get("em") else "❌" if "em" in r else "",
200
- "sm": "✅" if r.get("sm") else "❌" if "sm" in r else "",
201
- "exec_acc": "✅"
202
- if r.get("exec_acc")
203
- else "❌"
204
- if "exec_acc" in r
205
- else "",
206
- "ok": "✅" if r.get("ok") else "❌",
207
- "latency_ms": int(r.get("latency_ms", 0)),
208
- }
209
- )
210
 
211
- print(
212
- "\n💾 Saved outputs:\n"
213
- f"- {jsonl_path}\n- {RESULT_DIR / 'summary.json'}\n- {csv_path}\n"
214
- f"📊 Avg latency: {summary.get('avg_latency_ms', 0.0)} ms "
215
- f"| EM: {summary.get('EM', 0.0):.3f} "
216
- f"| SM: {summary.get('SM', 0.0):.3f} "
217
- f"| ExecAcc: {summary.get('ExecAcc', 0.0):.3f} "
218
- f"| Success: {summary.get('success_rate', 0.0):.0%}\n"
219
- )
220
 
 
 
221
 
222
- # -------------------- Runners --------------------
 
223
 
 
 
 
224
 
225
- def _run_single_db_mode(db_path: Path, questions: List[str], config_path: str) -> None:
226
- """
227
- Single-DB demo mode.
228
- Only latency/ok is reported (no EM/SM/ExecAcc, because we don't have gold SQL).
229
- """
230
- adapter = SQLiteAdapter(str(db_path))
231
- pipeline = pipeline_from_config_with_adapter(config_path, adapter=adapter)
232
 
233
- schema_preview = _derive_schema_preview_safe(pipeline)
234
- if schema_preview:
235
- print("📄 Derived schema preview ✓")
236
- else:
237
- print("ℹ️ No schema preview (adapter does not expose it or not needed)")
238
 
239
- rows: List[Dict[str, Any]] = []
240
- for q in questions:
241
- print(f"\n🧠 Query: {q}")
242
- t0 = time.perf_counter()
243
- try:
244
- result = pipeline.run(user_query=q, schema_preview=schema_preview or "")
245
- latency_ms = _int_ms(t0) or 1 # clamp to 1ms for nicer CSV in stub mode
246
- stages = _to_stage_list(
247
- getattr(result, "traces", getattr(result, "trace", []))
248
- )
249
- rows.append(
250
- {
251
- "source": "demo",
252
- "db_id": Path(db_path).stem,
253
- "query": q,
254
- "ok": bool(getattr(result, "ok", True)),
255
- "latency_ms": latency_ms,
256
- "trace": stages,
257
- "error": None,
258
- }
259
- )
260
- print(f"✅ Success ({latency_ms} ms)")
261
- except Exception as exc:
262
- latency_ms = _int_ms(t0) or 1
263
- rows.append(
264
- {
265
- "source": "demo",
266
- "db_id": Path(db_path).stem,
267
- "query": q,
268
- "ok": False,
269
- "latency_ms": latency_ms,
270
- "trace": [],
271
- "error": str(exc),
272
- }
273
- )
274
- print(f"❌ Failed: {exc!s} ({latency_ms} ms)")
275
 
276
- success_rate = (
277
- (sum(1 for r in rows if r.get("ok")) / max(len(rows), 1)) if rows else 0.0
278
- )
279
- avg_latency = (
280
- round(sum(int(r.get("latency_ms", 0)) for r in rows) / max(len(rows), 1), 1)
281
- if rows
282
- else 0.0
283
- )
284
- summary = {
285
- "mode": "single-db",
286
- "db_path": str(db_path),
287
- "config": config_path,
288
- "provider_hint": ("STUBS" if os.getenv("PYTEST_CURRENT_TEST") else "REAL"),
289
- "total": len(rows),
290
- "EM": 0.0,
291
- "SM": 0.0,
292
- "ExecAcc": 0.0, # not applicable in demo
293
- "success_rate": success_rate,
294
- "avg_latency_ms": avg_latency,
295
- "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
296
- }
297
- _save_outputs(rows, summary)
298
 
 
 
 
 
 
 
 
 
 
299
 
300
- def _run_spider_mode(split: str, limit: int, config_path: str) -> None:
301
- """
302
- Spider mode: compute EM/SM/ExecAcc with per-DB pipelines.
303
- Requires SPIDER_ROOT pointing to a folder that contains dev.json/train_spider.json and database/.
304
- """
305
- if load_spider_sqlite is None or open_readonly_connection is None:
306
- raise RuntimeError(
307
- "Spider utilities are not available. Ensure benchmarks/spider_loader.py exists."
308
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
 
310
- items = load_spider_sqlite(split=split, limit=limit)
311
- print(f"🗂 Loaded {len(items)} Spider items (split={split}).")
 
312
 
313
- rows: List[Dict[str, Any]] = []
 
 
 
 
 
 
 
 
314
 
315
- for i, ex in enumerate(items, 1):
316
- print(f"\n[{i}] {ex.db_id} :: {ex.question}")
317
- adapter = SQLiteAdapter(ex.db_path)
318
- pipeline = pipeline_from_config_with_adapter(config_path, adapter=adapter)
319
 
320
- # Optional schema preview per DB
321
- schema_preview = _derive_schema_preview_safe(pipeline)
322
 
323
- # Open read-only connection for ExecAcc computation
324
- conn = open_readonly_connection(ex.db_path)
325
 
326
- t0 = time.perf_counter()
327
- try:
328
- result = pipeline.run(
329
- user_query=ex.question, schema_preview=schema_preview or ""
330
- )
331
- latency_ms = _int_ms(t0) or 1
332
- stages = _to_stage_list(
333
- getattr(result, "traces", getattr(result, "trace", []))
334
- )
335
 
336
- # Extract predicted SQL from result (support both .sql and .data.sql)
337
- sql_pred = _extract_sql(result)
338
-
339
- # Pro metrics
340
- gold_sql = ex.gold_sql.strip()
341
- em = (sql_pred.lower() == gold_sql.lower()) if sql_pred else False
342
- sm = _structural_match(sql_pred, gold_sql) if sql_pred else False
343
-
344
- try:
345
- gold_exec = conn.execute(gold_sql).fetchall()
346
- except Exception:
347
- gold_exec = []
348
- try:
349
- pred_exec = conn.execute(sql_pred).fetchall() if sql_pred else []
350
- except Exception:
351
- pred_exec = []
352
- exec_acc = gold_exec == pred_exec
353
-
354
- rows.append(
355
- {
356
- "source": "spider",
357
- "db_id": ex.db_id,
358
- "query": ex.question,
359
- "sql_pred": sql_pred,
360
- "sql_gold": gold_sql,
361
- "em": em,
362
- "sm": sm,
363
- "exec_acc": exec_acc,
364
- "ok": bool(getattr(result, "ok", True)),
365
- "latency_ms": latency_ms,
366
- "trace": stages,
367
- "error": None,
368
- }
369
- )
370
- print(f"✅ OK | EM={em} | SM={sm} | Exec={exec_acc} | {latency_ms} ms")
371
- except Exception as exc:
372
- latency_ms = _int_ms(t0) or 1
373
- rows.append(
374
- {
375
- "source": "spider",
376
- "db_id": ex.db_id,
377
- "query": ex.question,
378
- "sql_pred": None,
379
- "sql_gold": ex.gold_sql,
380
- "em": False,
381
- "sm": False,
382
- "exec_acc": False,
383
- "ok": False,
384
- "latency_ms": latency_ms,
385
- "trace": [],
386
- "error": str(exc),
387
- }
388
- )
389
- print(f"❌ Fail: {exc!s} ({latency_ms} ms)")
390
- finally:
391
- try:
392
- conn.close()
393
- except Exception:
394
- pass
395
-
396
- # Aggregate pro metrics
397
- total = len(rows)
398
- em_rate = (sum(1 for r in rows if r.get("em")) / max(total, 1)) if rows else 0.0
399
- sm_rate = (sum(1 for r in rows if r.get("sm")) / max(total, 1)) if rows else 0.0
400
- exec_rate = (
401
- (sum(1 for r in rows if r.get("exec_acc")) / max(total, 1)) if rows else 0.0
402
- )
403
- success_rate = (
404
- (sum(1 for r in rows if r.get("ok")) / max(total, 1)) if rows else 0.0
405
- )
406
- avg_latency = (
407
- round(sum(int(r.get("latency_ms", 0)) for r in rows) / max(total, 1), 1)
408
- if rows
409
- else 0.0
410
- )
411
 
412
- summary = {
413
- "mode": "spider",
414
- "split": split,
415
- "limit": limit,
416
- "config": config_path,
417
- "provider_hint": ("STUBS" if os.getenv("PYTEST_CURRENT_TEST") else "REAL"),
418
- "spider_root": os.getenv("SPIDER_ROOT", ""),
419
- "total": total,
420
- "EM": round(em_rate, 3),
421
- "SM": round(sm_rate, 3),
422
- "ExecAcc": round(exec_rate, 3),
423
- "success_rate": success_rate,
424
- "avg_latency_ms": avg_latency,
425
- "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
426
- }
427
- _save_outputs(rows, summary)
428
 
 
 
429
 
430
- # -------------------- CLI --------------------
 
 
 
 
 
 
431
 
 
432
 
433
- def main() -> None:
434
- ap = argparse.ArgumentParser()
435
- ap.add_argument(
436
- "--spider",
437
- action="store_true",
438
- help="Enable Spider mode (reads from SPIDER_ROOT; ignores --db-path).",
439
- )
440
- ap.add_argument(
441
- "--split",
442
- type=str,
443
- default="dev",
444
- choices=["dev", "train"],
445
- help="Spider split to use (default: dev).",
446
- )
447
- ap.add_argument(
448
- "--limit",
449
- type=int,
450
- default=20,
451
- help="Number of Spider items to evaluate (default: 20).",
452
- )
453
 
454
- ap.add_argument(
455
- "--db-path",
456
- type=str,
457
- default="demo.db",
458
- help="Path to SQLite database file (single-DB mode).",
459
- )
460
- ap.add_argument(
461
- "--dataset-file",
462
- type=str,
463
- default=None,
464
- help="Optional JSON file with questions (single-DB mode).",
465
- )
466
- ap.add_argument(
467
- "--config",
468
- type=str,
469
- default=CONFIG_PATH,
470
- help=f"Pipeline YAML config (default: {CONFIG_PATH})",
471
- )
472
- args = ap.parse_args()
473
 
474
- if args.spider:
475
- if not os.getenv("SPIDER_ROOT"):
476
- raise RuntimeError(
477
- "SPIDER_ROOT is not set. It must point to the folder that directly contains "
478
- "dev.json/train_spider.json and the database/ directory."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
479
  )
480
- _run_spider_mode(args.split, args.limit, args.config)
481
- else:
482
- db_path = Path(args.db_path).resolve()
483
- if not db_path.exists():
484
- raise FileNotFoundError(f"SQLite DB not found: {db_path}")
485
- questions = _load_dataset_from_file(args.dataset_file)
486
- _run_single_db_mode(db_path, questions, args.config)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
 
489
  if __name__ == "__main__":
 
490
  main()
 
1
+ #!/usr/bin/env python3
2
  """
3
+ Enhanced Spider benchmark evaluator for NL2SQL pipeline.
4
+ No external dependencies - uses internal evaluation logic.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  """
6
 
7
  from __future__ import annotations
8
 
9
  import argparse
 
10
  import json
11
+ import re
12
+ import sqlite3
13
  import time
14
+ from dataclasses import dataclass
15
+ from datetime import datetime
16
  from pathlib import Path
17
+ from typing import Any, Dict, List, Tuple
 
 
 
18
 
19
  from nl2sql.pipeline_factory import pipeline_from_config_with_adapter
20
  from adapters.db.sqlite_adapter import SQLiteAdapter
21
+ from benchmarks.spider_loader import load_spider_sqlite
22
+
23
+ # ==================== Configuration ====================
24
 
25
+ RESULT_ROOT = Path("benchmarks/results_pro")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
27
  RESULT_DIR = RESULT_ROOT / TIMESTAMP
28
 
29
 
30
+ # ==================== SQL Processing ====================
31
 
32
 
33
+ def extract_clean_sql(text: str | None) -> str:
34
+ """Safely extract a clean SQL string from input text possibly containing markdown fences or JSON."""
35
+ # Always initialize variable to empty string
36
+ sql = text or ""
37
 
38
+ # Remove markdown code fences
39
+ sql = re.sub(r"```(?:sql)?\s*\n?", "", sql, flags=re.IGNORECASE)
40
+ sql = re.sub(r"```\s*$", "", sql)
41
 
42
+ # Try JSON pattern like {"sql": "..."}
43
+ m_json = re.search(r'"sql"\s*:\s*"([^"]+)"', sql)
44
+ if m_json:
45
+ sql = m_json.group(1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
+ # Clean escaped characters
48
+ sql = sql.replace('\\"', '"').replace("\\n", " ").replace("\\t", " ")
49
 
50
+ # Try to locate SQL statement keywords
51
+ m_sql = re.search(
52
+ r"\b(select|with|insert|update|delete)\b[\s\S]+", sql, re.IGNORECASE
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  )
54
+ if m_sql:
55
+ sql = m_sql.group(0)
56
+ sql = re.sub(r"\s+", " ", sql).strip().rstrip(";")
57
+ return sql
58
 
59
 
60
+ def normalize_sql(sql: str) -> str:
61
+ """Enhanced SQL normalization for better matching."""
62
+ if not sql:
63
+ return ""
 
 
 
 
 
 
 
64
 
65
+ sql = sql.strip().upper()
66
+ # Remove all whitespace variations
67
+ sql = re.sub(r"\s+", " ", sql)
68
+ # Remove trailing semicolon
69
+ sql = sql.rstrip(";")
70
 
71
+ # Remove table prefixes (e.g., singer.name -> name)
72
+ sql = re.sub(r"\b\w+\.(\w+)\b", r"\1", sql)
 
73
 
74
+ # Remove AS aliases
75
+ sql = re.sub(r"\s+AS\s+\w+", "", sql, flags=re.IGNORECASE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
 
77
+ # Remove DISTINCT if used with COUNT(*)
78
+ sql = re.sub(r"COUNT\s*\(\s*DISTINCT\s+", "COUNT(", sql)
 
 
 
 
 
 
 
79
 
80
+ # Normalize COUNT variations
81
+ sql = re.sub(r"COUNT\s*\(\s*\w+\s*\)", "COUNT(*)", sql)
82
 
83
+ # Remove LIMIT at end
84
+ sql = re.sub(r"\s+LIMIT\s+\d+$", "", sql)
85
 
86
+ # Normalize quotes
87
+ sql = re.sub(r'"(\w+)"', r"\1", sql)
88
+ sql = re.sub(r"`(\w+)`", r"\1", sql)
89
 
90
+ return sql
 
 
 
 
 
 
91
 
 
 
 
 
 
92
 
93
+ # ==================== Schema Extraction ====================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
+ def get_database_schema(db_path: Path) -> Dict[str, Any]:
97
+ """Extract complete schema from SQLite database."""
98
+ if not db_path.exists():
99
+ return {}
100
+
101
+ conn = sqlite3.connect(str(db_path))
102
+ cursor = conn.cursor()
103
+
104
+ schema: dict[str, Any] = {"tables": {}}
105
 
106
+ try:
107
+ # Get all tables
108
+ cursor.execute(
109
+ "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
 
 
 
 
110
  )
111
+ tables = cursor.fetchall()
112
+
113
+ for (table_name,) in tables:
114
+ # Get columns
115
+ cursor.execute(f"PRAGMA table_info('{table_name}')")
116
+ columns = cursor.fetchall()
117
+
118
+ col_info = []
119
+ for col in columns:
120
+ col_name = col[1]
121
+ col_type = col[2]
122
+ is_pk = col[5]
123
+
124
+ col_dict = {
125
+ "name": col_name,
126
+ "type": col_type,
127
+ "primary_key": bool(is_pk),
128
+ }
129
+ col_info.append(col_dict)
130
 
131
+ # Get foreign keys
132
+ cursor.execute(f"PRAGMA foreign_key_list('{table_name}')")
133
+ fks = cursor.fetchall()
134
 
135
+ fk_info = []
136
+ for fk in fks:
137
+ fk_info.append(
138
+ {
139
+ "column": fk[3],
140
+ "referenced_table": fk[2],
141
+ "referenced_column": fk[4],
142
+ }
143
+ )
144
 
145
+ schema["tables"][table_name] = {
146
+ "columns": col_info,
147
+ "foreign_keys": fk_info,
148
+ }
149
 
150
+ finally:
151
+ conn.close()
152
 
153
+ return schema
 
154
 
 
 
 
 
 
 
 
 
 
155
 
156
+ def format_schema_for_prompt(schema: Dict[str, Any]) -> str:
157
+ """Format schema for LLM prompt."""
158
+ if not schema or not schema.get("tables"):
159
+ return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
160
 
161
+ lines = []
162
+ for table_name, table_info in schema["tables"].items():
163
+ cols = []
164
+ for col in table_info["columns"]:
165
+ col_str = f"{col['name']} {col['type']}"
166
+ if col.get("primary_key"):
167
+ col_str += " PRIMARY KEY"
168
+ cols.append(col_str)
 
 
 
 
 
 
 
 
169
 
170
+ lines.append(f"Table: {table_name}")
171
+ lines.append(f"Columns: {', '.join(cols)}")
172
 
173
+ if table_info.get("foreign_keys"):
174
+ fks = []
175
+ for fk in table_info["foreign_keys"]:
176
+ fks.append(
177
+ f"{fk['column']} -> {fk['referenced_table']}.{fk['referenced_column']}"
178
+ )
179
+ lines.append(f"Foreign Keys: {', '.join(fks)}")
180
 
181
+ lines.append("") # Empty line between tables
182
 
183
+ return "\n".join(lines).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
+ # ==================== SQL Evaluation ====================
187
+
188
+
189
+ def execute_sql(db_path: Path, sql: str) -> Tuple[bool, List[Tuple]]:
190
+ """Execute SQL and return success flag and results."""
191
+ if not sql:
192
+ return False, []
193
+
194
+ try:
195
+ conn = sqlite3.connect(str(db_path))
196
+ cursor = conn.cursor()
197
+ cursor.execute(sql)
198
+ results = cursor.fetchall()
199
+ conn.close()
200
+ return True, results
201
+ except Exception:
202
+ return False, []
203
+
204
+
205
+ def compare_sql_results(gold_results: List[Tuple], pred_results: List[Tuple]) -> bool:
206
+ """Compare SQL execution results."""
207
+ if len(gold_results) != len(pred_results):
208
+ return False
209
+
210
+ # Convert to sets for comparison (order independent)
211
+ gold_set = set(gold_results)
212
+ pred_set = set(pred_results)
213
+
214
+ return gold_set == pred_set
215
+
216
+
217
+ def evaluate_sql_match(pred_sql: str, gold_sql: str, db_path: Path) -> Dict[str, float]:
218
+ """Evaluate predicted SQL against gold SQL."""
219
+ metrics = {"exact_match": 0.0, "set_match": 0.0, "exec_accuracy": 0.0}
220
+
221
+ if not pred_sql:
222
+ return metrics
223
+
224
+ # Exact match
225
+ if normalize_sql(pred_sql) == normalize_sql(gold_sql):
226
+ metrics["exact_match"] = 1.0
227
+
228
+ # Execution-based evaluation
229
+ gold_success, gold_results = execute_sql(db_path, gold_sql)
230
+ pred_success, pred_results = execute_sql(db_path, pred_sql)
231
+
232
+ if gold_success and pred_success:
233
+ # Set match (results match)
234
+ if compare_sql_results(gold_results, pred_results):
235
+ metrics["set_match"] = 1.0
236
+ metrics["exec_accuracy"] = 1.0
237
+ else:
238
+ # Partial credit for successful execution
239
+ metrics["exec_accuracy"] = 0.5
240
+
241
+ return metrics
242
+
243
+
244
+ # ==================== Pipeline Runner ====================
245
+
246
+
247
+ @dataclass
248
+ class SpiderSample:
249
+ """Spider dataset sample."""
250
+
251
+ question: str
252
+ db_id: str
253
+ db_path: Path
254
+ gold_sql: str
255
+
256
+
257
+ def run_pipeline_on_sample(
258
+ pipeline: Any,
259
+ sample: SpiderSample,
260
+ schema_cache: Dict[str, str],
261
+ debug: bool = False,
262
+ ) -> Dict[str, Any]:
263
+ """Run NL2SQL pipeline on a single sample."""
264
+
265
+ # Get/cache schema
266
+ if sample.db_id not in schema_cache:
267
+ schema_dict = get_database_schema(sample.db_path)
268
+ schema_str = format_schema_for_prompt(schema_dict)
269
+ schema_cache[sample.db_id] = schema_str
270
+ if debug:
271
+ print(f" [schema] Loaded {len(schema_str)} chars for {sample.db_id}")
272
+
273
+ schema: str = schema_cache[sample.db_id]
274
+
275
+ # Run pipeline
276
+ try:
277
+ result = pipeline.run(user_query=sample.question, schema_preview=schema)
278
+
279
+ # Extract SQL from result
280
+ if hasattr(result, "sql") and result.sql:
281
+ pred_sql = extract_clean_sql(result.sql)
282
+ else:
283
+ # Try to extract from various fields
284
+ for attr in ["final_sql", "generated_sql", "answer"]:
285
+ if hasattr(result, attr):
286
+ val = getattr(result, attr)
287
+ if val:
288
+ pred_sql = extract_clean_sql(str(val))
289
+ if pred_sql:
290
+ break
291
+ else:
292
+ pred_sql = ""
293
+
294
+ return {
295
+ "ok": bool(getattr(result, "ok", True)),
296
+ "sql": pred_sql,
297
+ "raw_response": getattr(result, "sql", ""),
298
+ "traces": getattr(result, "traces", []),
299
+ "error": None,
300
+ }
301
+
302
+ except Exception as e:
303
+ if debug:
304
+ import traceback
305
+
306
+ traceback.print_exc()
307
+ return {
308
+ "ok": False,
309
+ "sql": "",
310
+ "raw_response": "",
311
+ "traces": [],
312
+ "error": str(e),
313
+ }
314
+
315
+
316
+ # ==================== Main Evaluation ====================
317
+
318
+
319
+ def main():
320
+ parser = argparse.ArgumentParser(description="Evaluate NL2SQL on Spider")
321
+ parser.add_argument("--spider", action="store_true", help="Run Spider evaluation")
322
+ parser.add_argument("--split", default="dev", choices=["dev", "train"])
323
+ parser.add_argument("--limit", type=int, help="Limit number of samples")
324
+ parser.add_argument("--debug", action="store_true", help="Enable debug output")
325
+ parser.add_argument("--config", default="configs/sqlite_pipeline.yaml")
326
+
327
+ args = parser.parse_args()
328
+
329
+ if not args.spider:
330
+ print("Please use --spider flag to run Spider evaluation")
331
+ return
332
+
333
+ # Load Spider samples
334
+ print(f"Loading Spider {args.split} split...")
335
+ samples = load_spider_sqlite(split=args.split, limit=args.limit)
336
+
337
+ if not samples:
338
+ print("❌ No samples loaded. Check SPIDER_ROOT environment variable.")
339
+ return
340
+
341
+ print(f"✔ Loaded {len(samples)} samples")
342
+
343
+ # Prepare results directory
344
+ RESULT_DIR.mkdir(parents=True, exist_ok=True)
345
+
346
+ # Initialize schema cache
347
+ schema_cache = {}
348
+
349
+ # Process each sample
350
+ results = []
351
+ for i, spider_item in enumerate(samples, 1):
352
+ # Convert to our sample format
353
+ sample = SpiderSample(
354
+ question=spider_item.question,
355
+ db_id=spider_item.db_id,
356
+ db_path=Path(spider_item.db_path),
357
+ gold_sql=spider_item.gold_sql,
358
+ )
359
+
360
+ print(f"\n🧠 [{i}/{len(samples)}] [{sample.db_id}] {sample.question}")
361
+
362
+ # Create adapter and pipeline for this database
363
+ adapter = SQLiteAdapter(sample.db_path)
364
+ pipeline = pipeline_from_config_with_adapter(args.config, adapter=adapter)
365
+
366
+ # Run pipeline
367
+ t0 = time.perf_counter()
368
+ result = run_pipeline_on_sample(pipeline, sample, schema_cache, args.debug)
369
+ latency_ms = int((time.perf_counter() - t0) * 1000)
370
+
371
+ # Evaluate
372
+ metrics = evaluate_sql_match(result["sql"], sample.gold_sql, sample.db_path)
373
+
374
+ # Store result
375
+ eval_result = {
376
+ "source": "spider",
377
+ "db_id": sample.db_id,
378
+ "query": sample.question,
379
+ "gold_sql": sample.gold_sql,
380
+ "pred_sql": result["sql"],
381
+ "ok": result["ok"],
382
+ "latency_ms": latency_ms,
383
+ "em": metrics["exact_match"],
384
+ "sm": metrics["set_match"],
385
+ "exec_acc": metrics["exec_accuracy"],
386
+ "error": result.get("error"),
387
+ "trace": result.get("traces", []),
388
+ }
389
+ results.append(eval_result)
390
+
391
+ # Debug output
392
+ if args.debug:
393
+ status = "✅" if result["ok"] and metrics["exact_match"] == 1 else "⚠️"
394
+ print(
395
+ f"{status} ({latency_ms} ms) | EM={metrics['exact_match']:.0f} SM={metrics['set_match']:.0f} ExecAcc={metrics['exec_accuracy']:.1f}"
396
  )
397
+ if metrics["exact_match"] < 1:
398
+ print(f" gold: {sample.gold_sql[:100]}")
399
+ print(f" pred: {result['sql'][:100] if result['sql'] else 'EMPTY'}")
400
+
401
+ # Calculate aggregates
402
+ total = len(results)
403
+ successful = sum(1 for r in results if r["ok"])
404
+ avg_em = sum(r["em"] for r in results) / total if total > 0 else 0
405
+ avg_sm = sum(r["sm"] for r in results) / total if total > 0 else 0
406
+ avg_ea = sum(r["exec_acc"] for r in results) / total if total > 0 else 0
407
+ avg_latency = sum(r["latency_ms"] for r in results) / total if total > 0 else 0
408
+
409
+ # Save results
410
+ eval_jsonl = RESULT_DIR / "eval.jsonl"
411
+ with open(eval_jsonl, "w") as f:
412
+ for r in results:
413
+ json.dump(r, f, ensure_ascii=False)
414
+ f.write("\n")
415
+
416
+ summary = {
417
+ "timestamp": datetime.now().isoformat(timespec="seconds"),
418
+ "total": total,
419
+ "success": successful,
420
+ "success_rate": round(successful / total, 3) if total else 0,
421
+ "avg_latency_ms": round(avg_latency, 1),
422
+ "EM": round(avg_em, 3),
423
+ "SM": round(avg_sm, 3),
424
+ "ExecAcc": round(avg_ea, 3),
425
+ "split": args.split,
426
+ "config": args.config,
427
+ }
428
+
429
+ (RESULT_DIR / "summary.json").write_text(
430
+ json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8"
431
+ )
432
+
433
+ print("\n================== Evaluation Summary ==================")
434
+ print(f"Total samples: {total}")
435
+ print(f"Successful runs: {successful} ({summary['success_rate'] * 100:.1f}%)")
436
+ print(f"Avg EM: {summary['EM']}")
437
+ print(f"Avg SM: {summary['SM']}")
438
+ print(f"Avg ExecAcc: {summary['ExecAcc']}")
439
+ print(f"Avg Latency: {summary['avg_latency_ms']} ms")
440
+ print(f"Results saved to {RESULT_DIR}")
441
+ print("========================================================")
442
 
443
 
444
  if __name__ == "__main__":
445
+ RESULT_DIR.mkdir(parents=True, exist_ok=True)
446
  main()
benchmarks/results_pro/20251108-123204/eval.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"source": "demo", "db_id": "demo", "query": "list all customers", "ok": false, "latency_ms": 8406, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 3768}, {"stage": "generator", "ms": 1616}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 3}, {"stage": "repair", "ms": 1639}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1367}, {"stage": "safety", "ms": 3}, {"stage": "executor", "ms": 1}, {"stage": "pipeline", "ms": 0}], "error": null}
2
- {"source": "demo", "db_id": "demo", "query": "show total invoices per country", "ok": true, "latency_ms": 11003, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 5021}, {"stage": "generator", "ms": 1605}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1437}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 2929}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "pipeline", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
3
- {"source": "demo", "db_id": "demo", "query": "top 3 albums by total sales", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}], "error": null}
4
- {"source": "demo", "db_id": "demo", "query": "artists with more than 3 albums", "ok": false, "latency_ms": 14409, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 8377}, {"stage": "generator", "ms": 2525}, {"stage": "safety", "ms": 4}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1618}, {"stage": "safety", "ms": 4}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1874}, {"stage": "safety", "ms": 3}, {"stage": "executor", "ms": 1}, {"stage": "pipeline", "ms": 0}], "error": null}
5
- {"source": "demo", "db_id": "demo", "query": "number of employees per city", "ok": true, "latency_ms": 8938, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 4402}, {"stage": "generator", "ms": 1846}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1397}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1283}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "pipeline", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
 
 
 
 
 
 
benchmarks/results_pro/20251108-123204/latency_per_stage.png DELETED
Binary file (34.7 kB)
 
benchmarks/results_pro/20251108-123204/metrics_overview.png DELETED
Binary file (22.7 kB)
 
benchmarks/results_pro/20251108-123204/results.csv DELETED
@@ -1,6 +0,0 @@
1
- source,db_id,query,em,sm,exec_acc,ok,latency_ms
2
- demo,demo,list all customers,,,,❌,8406
3
- demo,demo,show total invoices per country,,,,✅,11003
4
- demo,demo,top 3 albums by total sales,,,,✅,1
5
- demo,demo,artists with more than 3 albums,,,,❌,14409
6
- demo,demo,number of employees per city,,,,✅,8938
 
 
 
 
 
 
 
benchmarks/results_pro/20251108-123204/summary.json DELETED
@@ -1,13 +0,0 @@
1
- {
2
- "mode": "single-db",
3
- "db_path": "/Users/melikakheirieh/Desktop/my/career-developement/LLM/nl2sql-copilot/demo.db",
4
- "config": "/Users/melikakheirieh/Desktop/my/career-developement/LLM/nl2sql-copilot/configs/sqlite_pipeline.yaml",
5
- "provider_hint": "REAL",
6
- "total": 5,
7
- "EM": 0.0,
8
- "SM": 0.0,
9
- "ExecAcc": 0.0,
10
- "success_rate": 0.6,
11
- "avg_latency_ms": 8551.4,
12
- "timestamp": "2025-11-08 12:32:47"
13
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarks/results_pro/20251108-124153/eval.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"source": "demo", "db_id": "demo", "query": "list all customers", "ok": false, "latency_ms": 6756, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 2729}, {"stage": "generator", "ms": 1343}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 2}, {"stage": "repair", "ms": 911}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1763}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "pipeline", "ms": 0}], "error": null}
2
- {"source": "demo", "db_id": "demo", "query": "show total invoices per country", "ok": true, "latency_ms": 8901, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 4799}, {"stage": "generator", "ms": 1075}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1092}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1924}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "pipeline", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
3
- {"source": "demo", "db_id": "demo", "query": "top 3 albums by total sales", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}], "error": null}
4
- {"source": "demo", "db_id": "demo", "query": "artists with more than 3 albums", "ok": false, "latency_ms": 12342, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 4882}, {"stage": "generator", "ms": 2684}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 2630}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 2135}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "pipeline", "ms": 0}], "error": null}
5
- {"source": "demo", "db_id": "demo", "query": "number of employees per city", "ok": true, "latency_ms": 7547, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 4083}, {"stage": "generator", "ms": 1269}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1149}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1035}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "pipeline", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
 
 
 
 
 
 
benchmarks/results_pro/20251108-124153/latency_per_stage.png DELETED
Binary file (34.7 kB)
 
benchmarks/results_pro/20251108-124153/metrics_overview.png DELETED
Binary file (22.7 kB)
 
benchmarks/results_pro/20251108-124153/results.csv DELETED
@@ -1,6 +0,0 @@
1
- source,db_id,query,em,sm,exec_acc,ok,latency_ms
2
- demo,demo,list all customers,,,,❌,6756
3
- demo,demo,show total invoices per country,,,,✅,8901
4
- demo,demo,top 3 albums by total sales,,,,✅,1
5
- demo,demo,artists with more than 3 albums,,,,❌,12342
6
- demo,demo,number of employees per city,,,,✅,7547
 
 
 
 
 
 
 
benchmarks/results_pro/20251108-124153/summary.json DELETED
@@ -1,13 +0,0 @@
1
- {
2
- "mode": "single-db",
3
- "db_path": "/Users/melikakheirieh/Desktop/my/career-developement/LLM/nl2sql-copilot/demo.db",
4
- "config": "/Users/melikakheirieh/Desktop/my/career-developement/LLM/nl2sql-copilot/configs/sqlite_pipeline.yaml",
5
- "provider_hint": "REAL",
6
- "total": 5,
7
- "EM": 0.0,
8
- "SM": 0.0,
9
- "ExecAcc": 0.0,
10
- "success_rate": 0.6,
11
- "avg_latency_ms": 7109.4,
12
- "timestamp": "2025-11-08 12:42:29"
13
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarks/results_pro/20251108-125829/eval.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"source": "demo", "db_id": "demo", "query": "list all customers", "ok": false, "latency_ms": 6652, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 2554}, {"stage": "generator", "ms": 1370}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 1}, {"stage": "repair", "ms": 1295}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "repair", "ms": 1426}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
2
- {"source": "demo", "db_id": "demo", "query": "show total invoices per country", "ok": true, "latency_ms": 7375, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 3866}, {"stage": "generator", "ms": 1265}, {"stage": "safety", "ms": 4}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 0}, {"stage": "repair", "ms": 1126}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 0}, {"stage": "repair", "ms": 1106}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
3
- {"source": "demo", "db_id": "demo", "query": "top 3 albums by total sales", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}], "error": null}
4
- {"source": "demo", "db_id": "demo", "query": "artists with more than 3 albums", "ok": false, "latency_ms": 8629, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 4110}, {"stage": "generator", "ms": 1969}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 0}, {"stage": "repair", "ms": 1296}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1244}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
5
- {"source": "demo", "db_id": "demo", "query": "number of employees per city", "ok": true, "latency_ms": 5630, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 2602}, {"stage": "generator", "ms": 1097}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "repair", "ms": 1018}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 0}, {"stage": "repair", "ms": 906}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
 
 
 
 
 
 
benchmarks/results_pro/20251108-125829/latency_per_stage.png DELETED
Binary file (22.4 kB)
 
benchmarks/results_pro/20251108-125829/metrics_overview.png DELETED
Binary file (12.9 kB)
 
benchmarks/results_pro/20251108-125829/results.csv DELETED
@@ -1,6 +0,0 @@
1
- source,db_id,query,em,sm,exec_acc,ok,latency_ms
2
- demo,demo,list all customers,,,,❌,6652
3
- demo,demo,show total invoices per country,,,,✅,7375
4
- demo,demo,top 3 albums by total sales,,,,✅,1
5
- demo,demo,artists with more than 3 albums,,,,❌,8629
6
- demo,demo,number of employees per city,,,,✅,5630
 
 
 
 
 
 
 
benchmarks/results_pro/20251108-125829/summary.json DELETED
@@ -1,13 +0,0 @@
1
- {
2
- "mode": "single-db",
3
- "db_path": "/Users/melikakheirieh/Desktop/my/career-developement/LLM/nl2sql-copilot/demo.db",
4
- "config": "/Users/melikakheirieh/Desktop/my/career-developement/LLM/nl2sql-copilot/configs/sqlite_pipeline.yaml",
5
- "provider_hint": "REAL",
6
- "total": 5,
7
- "EM": 0.0,
8
- "SM": 0.0,
9
- "ExecAcc": 0.0,
10
- "success_rate": 0.6,
11
- "avg_latency_ms": 5657.4,
12
- "timestamp": "2025-11-08 12:58:58"
13
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarks/results_pro/20251109-092540/eval.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 9423, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6884, "summary": "ok", "notes": {"len_plan": 1313}, "token_in": 270, "token_out": 313, "cost_usd": 0.0002283}, {"stage": "generator", "duration_ms": 891, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 801, "token_out": 19, "cost_usd": 0.00013155}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 673, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 962, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": 
null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
2
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 9382, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6936, "summary": "ok", "notes": {"len_plan": 1501}, "token_in": 271, "token_out": 351, "cost_usd": 0.00025124999999999995}, {"stage": "generator", "duration_ms": 1014, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 840, "token_out": 19, "cost_usd": 0.00013739999999999998}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 2, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 710, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 710, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 1, 
"summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
3
+ {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
4
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 11380, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7152, "summary": "ok", "notes": {"len_plan": 1281}, "token_in": 281, "token_out": 295, "cost_usd": 0.00021914999999999996}, {"stage": "generator", "duration_ms": 2189, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 794, "token_out": 37, "cost_usd": 0.0001413}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 954, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 325, "token_out": 21, "cost_usd": 6.135e-05}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1074, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 328, 
"token_out": 21, "cost_usd": 6.18e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
5
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 10894, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7383, "summary": "ok", "notes": {"len_plan": 1579}, "token_in": 279, "token_out": 421, "cost_usd": 0.00029445}, {"stage": "generator", "duration_ms": 1242, "summary": "ok", "notes": {"rationale_len": 67}, "token_in": 918, "token_out": 42, "cost_usd": 0.00016289999999999998}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1078, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 333, "token_out": 24, "cost_usd": 6.435e-05}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 3, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1173, 
"summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 337, "token_out": 28, "cost_usd": 6.735e-05}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
benchmarks/results_pro/20251109-092540/summary.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2025-11-09T09:26:21",
3
+ "total": 5,
4
+ "success": 5,
5
+ "success_rate": 1.0,
6
+ "avg_latency_ms": 8215.8,
7
+ "EM": 0.4,
8
+ "SM": 0.8,
9
+ "ExecAcc": 0.8,
10
+ "split": "dev",
11
+ "config": "configs/sqlite_pipeline.yaml"
12
+ }
benchmarks/results_pro/20251109-092823/eval.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 7982, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5384, "summary": "ok", "notes": {"len_plan": 1287}, "token_in": 270, "token_out": 306, "cost_usd": 0.0002241}, {"stage": "generator", "duration_ms": 900, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 794, "token_out": 19, "cost_usd": 0.0001305}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 888, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 797, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": 
null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
2
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 9717, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6881, "summary": "ok", "notes": {"len_plan": 1352}, "token_in": 271, "token_out": 319, "cost_usd": 0.00023204999999999998}, {"stage": "generator", "duration_ms": 1162, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 808, "token_out": 19, "cost_usd": 0.0001326}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 716, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 950, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", 
"notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
3
+ {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
4
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 8523, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5311, "summary": "ok", "notes": {"len_plan": 1449}, "token_in": 281, "token_out": 343, "cost_usd": 0.00024795}, {"stage": "generator", "duration_ms": 1306, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 842, "token_out": 37, "cost_usd": 0.00014849999999999998}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 996, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 325, "token_out": 21, "cost_usd": 6.135e-05}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 900, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 328, 
"token_out": 21, "cost_usd": 6.18e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
5
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 12291, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8346, "summary": "ok", "notes": {"len_plan": 1363}, "token_in": 279, "token_out": 334, "cost_usd": 0.00024225}, {"stage": "generator", "duration_ms": 1636, "summary": "ok", "notes": {"rationale_len": 87}, "token_in": 831, "token_out": 46, "cost_usd": 0.00015225}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 2, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1137, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 333, "token_out": 25, "cost_usd": 6.495e-05}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 3, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1151, "summary": "ok", 
"notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 337, "token_out": 28, "cost_usd": 6.735e-05}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
benchmarks/results_pro/20251109-092823/summary.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2025-11-09T09:29:01",
3
+ "total": 5,
4
+ "success": 5,
5
+ "success_rate": 1.0,
6
+ "avg_latency_ms": 7702.6,
7
+ "EM": 0.4,
8
+ "SM": 0.8,
9
+ "ExecAcc": 0.8,
10
+ "split": "dev",
11
+ "config": "configs/sqlite_pipeline.yaml"
12
+ }
benchmarks/results_pro/20251109-093743/eval.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10480, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8010, "summary": "ok", "notes": {"len_plan": 1445}, "token_in": 270, "token_out": 337, "cost_usd": 0.00024270000000000002}, {"stage": "generator", "duration_ms": 1029, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 825, "token_out": 19, "cost_usd": 0.00013514999999999998}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 678, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 750, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 1, "summary": 
"ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
2
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10687, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6978, "summary": "ok", "notes": {"len_plan": 1512}, "token_in": 271, "token_out": 355, "cost_usd": 0.00025364999999999996}, {"stage": "generator", "duration_ms": 2192, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 844, "token_out": 19, "cost_usd": 0.000138}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 652, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 863, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", 
"notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
3
+ {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
4
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 16736, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 13205, "summary": "ok", "notes": {"len_plan": 1758}, "token_in": 281, "token_out": 409, "cost_usd": 0.00028754999999999997}, {"stage": "generator", "duration_ms": 1537, "summary": "ok", "notes": {"rationale_len": 83}, "token_in": 908, "token_out": 37, "cost_usd": 0.0001584}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1019, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 325, "token_out": 21, "cost_usd": 6.135e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 968, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 328, 
"token_out": 21, "cost_usd": 6.18e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
5
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 12440, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7973, "summary": "ok", "notes": {"len_plan": 1377}, "token_in": 279, "token_out": 345, "cost_usd": 0.00024884999999999995}, {"stage": "generator", "duration_ms": 1827, "summary": "ok", "notes": {"rationale_len": 94}, "token_in": 841, "token_out": 47, "cost_usd": 0.00015434999999999998}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1312, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 333, "token_out": 24, "cost_usd": 6.435e-05}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 2, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", 
"duration_ms": 1313, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 337, "token_out": 21, "cost_usd": 6.315e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
benchmarks/results_pro/20251109-093743/summary.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2025-11-09T09:38:33",
3
+ "total": 5,
4
+ "success": 5,
5
+ "success_rate": 1.0,
6
+ "avg_latency_ms": 10068.6,
7
+ "EM": 0.4,
8
+ "SM": 0.8,
9
+ "ExecAcc": 0.8,
10
+ "split": "dev",
11
+ "config": "configs/sqlite_pipeline.yaml"
12
+ }
nl2sql/pipeline.py CHANGED
@@ -31,9 +31,8 @@ class FinalResult:
31
 
32
  class Pipeline:
33
  """
34
- NL2SQL Copilot pipeline.
35
- Stages return StageResult; final result is a type-safe FinalResult.
36
- DI-ready: all dependencies are injected via __init__.
37
  """
38
 
39
  def __init__(
@@ -54,22 +53,21 @@ class Pipeline:
54
  self.executor = executor or NoOpExecutor()
55
  self.verifier = verifier or NoOpVerifier()
56
  self.repair = repair or NoOpRepair()
 
 
57
 
58
- # ------------------------------------------------------------
59
  @staticmethod
60
  def _trace_list(*stages: Optional[StageResult]) -> List[dict]:
61
- """Collect .trace objects (as dict) from StageResult items if present."""
62
  traces: List[dict] = []
63
  for s in stages:
64
  if not s:
65
  continue
66
  t = getattr(s, "trace", None)
67
  if t is not None:
68
- # t is likely a dataclass – expose as plain dict for JSON safety
69
  traces.append(getattr(t, "__dict__", t))
70
  return traces
71
 
72
- # ------------------------------------------------------------
73
  @staticmethod
74
  def _mk_trace(
75
  stage: str,
@@ -77,7 +75,6 @@ class Pipeline:
77
  summary: str,
78
  notes: Optional[Dict[str, Any]] = None,
79
  ) -> dict:
80
- """Create a normalized trace dict (internal: duration may be float)."""
81
  return {
82
  "stage": stage,
83
  "duration_ms": float(duration_ms),
@@ -87,11 +84,6 @@ class Pipeline:
87
 
88
  @staticmethod
89
  def _normalize_traces(traces: List[dict]) -> List[dict]:
90
- """
91
- Normalize trace list for API/UI:
92
- - coerce duration_ms to int
93
- - ensure `summary` exists (fallback to a minimal one)
94
- """
95
  norm: List[dict] = []
96
  for t in traces:
97
  stage = str(t.get("stage", "unknown"))
@@ -100,37 +92,24 @@ class Pipeline:
100
  dur_int = int(round(float(dur)))
101
  except Exception:
102
  dur_int = 0
103
- summary = t.get("summary")
104
- if not summary:
105
- # fallback summary if not provided by stage
106
- notes = t.get("notes") or {}
107
- failed = bool(notes.get("error") or notes.get("errors"))
108
- summary = "failed" if failed else "ok"
109
  notes = t.get("notes") or {}
110
- # preserve any accounting fields if present (token_in/out, cost_usd, ...)
 
 
111
  payload = {
112
  "stage": stage,
113
  "duration_ms": dur_int,
114
  "summary": summary,
115
  "notes": notes,
116
  }
117
- # keep extra accounting if exists
118
- if "token_in" in t:
119
- payload["token_in"] = t["token_in"]
120
- if "token_out" in t:
121
- payload["token_out"] = t["token_out"]
122
- if "cost_usd" in t:
123
- payload["cost_usd"] = t["cost_usd"]
124
  norm.append(payload)
125
  return norm
126
 
127
- # ------------------------------------------------------------
128
  @staticmethod
129
  def _safe_stage(fn, **kwargs) -> StageResult:
130
- """
131
- Run a stage safely; if it throws, return a StageResult(ok=False, error=[...]).
132
- If fn returns a non-StageResult (e.g., dict), coerce to StageResult(ok=True, data=...).
133
- """
134
  try:
135
  r = fn(**kwargs)
136
  if isinstance(r, StageResult):
@@ -140,7 +119,7 @@ class Pipeline:
140
  tb = traceback.format_exc()
141
  return StageResult(ok=False, data=None, trace=None, error=[f"{e}", tb])
142
 
143
- # ------------------------------------------------------------
144
  def run(
145
  self,
146
  *,
@@ -152,7 +131,6 @@ class Pipeline:
152
  traces: List[dict] = []
153
  details: List[str] = []
154
 
155
- # Always push a normalized per-stage timing, even if StageResult.trace is empty
156
  def _fallback_trace(stage_name: str, dt_ms: float, ok: bool) -> None:
157
  traces.append(
158
  self._mk_trace(
@@ -162,26 +140,24 @@ class Pipeline:
162
  )
163
  )
164
 
165
- # Normalize inputs
166
  schema_preview = schema_preview or ""
167
  clarify_answers = clarify_answers or {}
168
 
169
  try:
170
  # --- 1) detector ---
171
- t_det0 = time.perf_counter()
172
  questions = self.detector.detect(user_query, schema_preview)
173
- det_ms = (time.perf_counter() - t_det0) * 1000.0
174
  is_amb = bool(questions)
175
- stage_duration_ms.labels("detector").observe(det_ms)
176
  traces.append(
177
  self._mk_trace(
178
  stage="detector",
179
- duration_ms=det_ms,
180
  summary=("ambiguous" if is_amb else "clear"),
181
  notes={"ambiguous": is_amb, "questions_len": len(questions or [])},
182
  )
183
  )
184
-
185
  if questions:
186
  pipeline_runs_total.labels(status="ambiguous").inc()
187
  return FinalResult(
@@ -197,15 +173,15 @@ class Pipeline:
197
  )
198
 
199
  # --- 2) planner ---
200
- t_pln0 = time.perf_counter()
201
  r_plan = self._safe_stage(
202
  self.planner.run, user_query=user_query, schema_preview=schema_preview
203
  )
204
- pln_ms = (time.perf_counter() - t_pln0) * 1000.0
205
- stage_duration_ms.labels("planner").observe(pln_ms)
206
  traces.extend(self._trace_list(r_plan))
207
  if not getattr(r_plan, "trace", None):
208
- _fallback_trace("planner", pln_ms, r_plan.ok)
209
  if not r_plan.ok:
210
  pipeline_runs_total.labels(status="error").inc()
211
  return FinalResult(
@@ -221,7 +197,7 @@ class Pipeline:
221
  )
222
 
223
  # --- 3) generator ---
224
- t_gen0 = time.perf_counter()
225
  r_gen = self._safe_stage(
226
  self.generator.run,
227
  user_query=user_query,
@@ -229,11 +205,11 @@ class Pipeline:
229
  plan_text=(r_plan.data or {}).get("plan"),
230
  clarify_answers=clarify_answers,
231
  )
232
- gen_ms = (time.perf_counter() - t_gen0) * 1000.0
233
- stage_duration_ms.labels("generator").observe(gen_ms)
234
  traces.extend(self._trace_list(r_gen))
235
  if not getattr(r_gen, "trace", None):
236
- _fallback_trace("generator", gen_ms, r_gen.ok)
237
  if not r_gen.ok:
238
  pipeline_runs_total.labels(status="error").inc()
239
  return FinalResult(
@@ -251,14 +227,32 @@ class Pipeline:
251
  sql = (r_gen.data or {}).get("sql")
252
  rationale = (r_gen.data or {}).get("rationale")
253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  # --- 4) safety ---
255
- t_saf0 = time.perf_counter()
256
  r_safe = self._safe_stage(self.safety.run, sql=sql)
257
- saf_ms = (time.perf_counter() - t_saf0) * 1000.0
258
- stage_duration_ms.labels("safety").observe(saf_ms)
259
  traces.extend(self._trace_list(r_safe))
260
  if not getattr(r_safe, "trace", None):
261
- _fallback_trace("safety", saf_ms, r_safe.ok)
262
  if not r_safe.ok:
263
  pipeline_runs_total.labels(status="error").inc()
264
  return FinalResult(
@@ -273,99 +267,112 @@ class Pipeline:
273
  traces=self._normalize_traces(traces),
274
  )
275
 
 
 
 
276
  # --- 5) executor ---
277
- t_exe0 = time.perf_counter()
278
- r_exec = self._safe_stage(
279
- self.executor.run, sql=(r_safe.data or {}).get("sql", sql)
280
- )
281
- exe_ms = (time.perf_counter() - t_exe0) * 1000.0
282
- stage_duration_ms.labels("executor").observe(exe_ms)
283
  traces.extend(self._trace_list(r_exec))
284
  if not getattr(r_exec, "trace", None):
285
- _fallback_trace("executor", exe_ms, r_exec.ok)
286
  if not r_exec.ok and r_exec.error:
287
- # executor failure is soft; collect for repair/verifier context
288
- details.extend(r_exec.error)
289
 
290
  # --- 6) verifier ---
291
- t_ver0 = time.perf_counter()
292
  r_ver = self._safe_stage(
293
- self.verifier.run, sql=sql, exec_result=(r_exec.data or {})
 
 
 
 
 
294
  )
295
- ver_ms = (time.perf_counter() - t_ver0) * 1000.0
296
- stage_duration_ms.labels("verifier").observe(ver_ms)
297
  traces.extend(self._trace_list(r_ver))
298
  if not getattr(r_ver, "trace", None):
299
- _fallback_trace("verifier", ver_ms, r_ver.ok)
300
  verified = bool(r_ver.data and r_ver.data.get("verified")) or r_ver.ok
301
 
302
- # --- 7) repair loop if verification failed ---
 
 
 
 
303
  if not verified:
304
  for _attempt in range(2):
305
  # repair
306
- t_fix0 = time.perf_counter()
307
  r_fix = self._safe_stage(
308
  self.repair.run,
309
  sql=sql,
310
  error_msg="; ".join(details or ["unknown"]),
311
  schema_preview=schema_preview,
312
  )
313
- fix_ms = (time.perf_counter() - t_fix0) * 1000.0
314
- stage_duration_ms.labels("repair").observe(fix_ms)
315
  traces.extend(self._trace_list(r_fix))
316
  if not getattr(r_fix, "trace", None):
317
- _fallback_trace("repair", fix_ms, r_fix.ok)
318
  if not r_fix.ok:
319
- break # give up on repair
320
 
321
- # fixed SQL
322
  sql = (r_fix.data or {}).get("sql", sql)
323
 
324
- # safety
325
- t_saf0 = time.perf_counter()
326
- r_safe = self._safe_stage(self.safety.run, sql=sql)
327
- saf_ms2 = (time.perf_counter() - t_saf0) * 1000.0
328
- stage_duration_ms.labels("safety").observe(saf_ms2)
329
- traces.extend(self._trace_list(r_safe))
330
- if not getattr(r_safe, "trace", None):
331
- _fallback_trace("safety", saf_ms2, r_safe.ok)
332
- if not r_safe.ok:
333
- if r_safe.error:
334
- details.extend(r_safe.error)
335
  continue
336
-
337
- # executor
338
- t_exe0 = time.perf_counter()
339
- r_exec = self._safe_stage(
340
- self.executor.run, sql=(r_safe.data or {}).get("sql", sql)
341
- )
342
- exe_ms2 = (time.perf_counter() - t_exe0) * 1000.0
343
- stage_duration_ms.labels("executor").observe(exe_ms2)
344
- traces.extend(self._trace_list(r_exec))
345
- if not getattr(r_exec, "trace", None):
346
- _fallback_trace("executor", exe_ms2, r_exec.ok)
347
- if not r_exec.ok:
348
- if r_exec.error:
349
- details.extend(r_exec.error)
350
  continue
351
 
352
- # verifier
353
- t_ver0 = time.perf_counter()
354
- r_ver = self._safe_stage(
355
- self.verifier.run, sql=sql, exec_result=(r_exec.data or {})
 
 
 
356
  )
357
- ver_ms2 = (time.perf_counter() - t_ver0) * 1000.0
358
- stage_duration_ms.labels("verifier").observe(ver_ms2)
359
- traces.extend(self._trace_list(r_ver))
360
- if not getattr(r_ver, "trace", None):
361
- _fallback_trace("verifier", ver_ms2, r_ver.ok)
362
  verified = (
363
- bool(r_ver.data and r_ver.data.get("verified")) or r_ver.ok
364
  )
 
 
365
  if verified:
366
  break
367
 
368
- # --- 8) fallback: verifier silent but executor succeeded ---
369
  if (verified is None or not verified) and not details:
370
  any_exec_ok = any(
371
  t.get("stage") == "executor"
@@ -385,13 +392,24 @@ class Pipeline:
385
 
386
  # --- 9) finalize ---
387
  has_errors = bool(details)
388
- ok = bool(verified) and not has_errors
389
- err = has_errors and not bool(verified)
 
 
 
 
 
 
 
390
 
391
- if ok:
392
- pipeline_runs_total.labels(status="ok").inc()
 
 
393
  else:
394
- pipeline_runs_total.labels(status="error").inc()
 
 
395
 
396
  traces.append(
397
  self._mk_trace(
@@ -399,8 +417,9 @@ class Pipeline:
399
  duration_ms=0.0,
400
  summary="finalize",
401
  notes={
402
- "final_verified": bool(verified),
403
  "details_len": len(details),
 
404
  },
405
  )
406
  )
@@ -412,18 +431,18 @@ class Pipeline:
412
  details=details or None,
413
  sql=sql,
414
  rationale=rationale,
415
- verified=verified,
416
  questions=None,
417
  traces=self._normalize_traces(traces),
418
  )
419
 
420
  except Exception:
421
- # Any unexpected crash
422
  pipeline_runs_total.labels(status="error").inc()
 
423
  raise
424
 
425
  finally:
426
- # Always record total latency even on early-return / exceptions
427
  stage_duration_ms.labels("pipeline_total").observe(
428
  (time.perf_counter() - t_all0) * 1000.0
429
  )
 
31
 
32
  class Pipeline:
33
  """
34
+ NL2SQL Copilot pipeline:
35
+ detector planner generator safety executor → verifier → (optional repair loop).
 
36
  """
37
 
38
  def __init__(
 
53
  self.executor = executor or NoOpExecutor()
54
  self.verifier = verifier or NoOpVerifier()
55
  self.repair = repair or NoOpRepair()
56
+ # If the verifier explicitly requires verification, enforce it in finalize.
57
+ self.require_verification = bool(getattr(self.verifier, "required", False))
58
 
59
+ # ---------------------------- helpers ----------------------------
60
  @staticmethod
61
  def _trace_list(*stages: Optional[StageResult]) -> List[dict]:
 
62
  traces: List[dict] = []
63
  for s in stages:
64
  if not s:
65
  continue
66
  t = getattr(s, "trace", None)
67
  if t is not None:
 
68
  traces.append(getattr(t, "__dict__", t))
69
  return traces
70
 
 
71
  @staticmethod
72
  def _mk_trace(
73
  stage: str,
 
75
  summary: str,
76
  notes: Optional[Dict[str, Any]] = None,
77
  ) -> dict:
 
78
  return {
79
  "stage": stage,
80
  "duration_ms": float(duration_ms),
 
84
 
85
  @staticmethod
86
  def _normalize_traces(traces: List[dict]) -> List[dict]:
 
 
 
 
 
87
  norm: List[dict] = []
88
  for t in traces:
89
  stage = str(t.get("stage", "unknown"))
 
92
  dur_int = int(round(float(dur)))
93
  except Exception:
94
  dur_int = 0
 
 
 
 
 
 
95
  notes = t.get("notes") or {}
96
+ summary = t.get("summary") or (
97
+ "failed" if (notes.get("error") or notes.get("errors")) else "ok"
98
+ )
99
  payload = {
100
  "stage": stage,
101
  "duration_ms": dur_int,
102
  "summary": summary,
103
  "notes": notes,
104
  }
105
+ for k in ("token_in", "token_out", "cost_usd"):
106
+ if k in t:
107
+ payload[k] = t[k]
 
 
 
 
108
  norm.append(payload)
109
  return norm
110
 
 
111
  @staticmethod
112
  def _safe_stage(fn, **kwargs) -> StageResult:
 
 
 
 
113
  try:
114
  r = fn(**kwargs)
115
  if isinstance(r, StageResult):
 
119
  tb = traceback.format_exc()
120
  return StageResult(ok=False, data=None, trace=None, error=[f"{e}", tb])
121
 
122
+ # ------------------------------ run ------------------------------
123
  def run(
124
  self,
125
  *,
 
131
  traces: List[dict] = []
132
  details: List[str] = []
133
 
 
134
  def _fallback_trace(stage_name: str, dt_ms: float, ok: bool) -> None:
135
  traces.append(
136
  self._mk_trace(
 
140
  )
141
  )
142
 
 
143
  schema_preview = schema_preview or ""
144
  clarify_answers = clarify_answers or {}
145
 
146
  try:
147
  # --- 1) detector ---
148
+ t0 = time.perf_counter()
149
  questions = self.detector.detect(user_query, schema_preview)
150
+ dt = (time.perf_counter() - t0) * 1000.0
151
  is_amb = bool(questions)
152
+ stage_duration_ms.labels("detector").observe(dt)
153
  traces.append(
154
  self._mk_trace(
155
  stage="detector",
156
+ duration_ms=dt,
157
  summary=("ambiguous" if is_amb else "clear"),
158
  notes={"ambiguous": is_amb, "questions_len": len(questions or [])},
159
  )
160
  )
 
161
  if questions:
162
  pipeline_runs_total.labels(status="ambiguous").inc()
163
  return FinalResult(
 
173
  )
174
 
175
  # --- 2) planner ---
176
+ t0 = time.perf_counter()
177
  r_plan = self._safe_stage(
178
  self.planner.run, user_query=user_query, schema_preview=schema_preview
179
  )
180
+ dt = (time.perf_counter() - t0) * 1000.0
181
+ stage_duration_ms.labels("planner").observe(dt)
182
  traces.extend(self._trace_list(r_plan))
183
  if not getattr(r_plan, "trace", None):
184
+ _fallback_trace("planner", dt, r_plan.ok)
185
  if not r_plan.ok:
186
  pipeline_runs_total.labels(status="error").inc()
187
  return FinalResult(
 
197
  )
198
 
199
  # --- 3) generator ---
200
+ t0 = time.perf_counter()
201
  r_gen = self._safe_stage(
202
  self.generator.run,
203
  user_query=user_query,
 
205
  plan_text=(r_plan.data or {}).get("plan"),
206
  clarify_answers=clarify_answers,
207
  )
208
+ dt = (time.perf_counter() - t0) * 1000.0
209
+ stage_duration_ms.labels("generator").observe(dt)
210
  traces.extend(self._trace_list(r_gen))
211
  if not getattr(r_gen, "trace", None):
212
+ _fallback_trace("generator", dt, r_gen.ok)
213
  if not r_gen.ok:
214
  pipeline_runs_total.labels(status="error").inc()
215
  return FinalResult(
 
227
  sql = (r_gen.data or {}).get("sql")
228
  rationale = (r_gen.data or {}).get("rationale")
229
 
230
+ # Guard: empty SQL
231
+ if not sql or not str(sql).strip():
232
+ pipeline_runs_total.labels(status="error").inc()
233
+ traces.append(
234
+ self._mk_trace("generator", 0.0, "failed", {"reason": "empty_sql"})
235
+ )
236
+ return FinalResult(
237
+ ok=False,
238
+ ambiguous=False,
239
+ error=True,
240
+ details=["empty_sql"],
241
+ questions=None,
242
+ sql=None,
243
+ rationale=rationale,
244
+ verified=None,
245
+ traces=self._normalize_traces(traces),
246
+ )
247
+
248
  # --- 4) safety ---
249
+ t0 = time.perf_counter()
250
  r_safe = self._safe_stage(self.safety.run, sql=sql)
251
+ dt = (time.perf_counter() - t0) * 1000.0
252
+ stage_duration_ms.labels("safety").observe(dt)
253
  traces.extend(self._trace_list(r_safe))
254
  if not getattr(r_safe, "trace", None):
255
+ _fallback_trace("safety", dt, r_safe.ok)
256
  if not r_safe.ok:
257
  pipeline_runs_total.labels(status="error").inc()
258
  return FinalResult(
 
267
  traces=self._normalize_traces(traces),
268
  )
269
 
270
+ # Use sanitized SQL from safety
271
+ sql = (r_safe.data or {}).get("sql", sql)
272
+
273
  # --- 5) executor ---
274
+ t0 = time.perf_counter()
275
+ r_exec = self._safe_stage(self.executor.run, sql=sql)
276
+ dt = (time.perf_counter() - t0) * 1000.0
277
+ stage_duration_ms.labels("executor").observe(dt)
 
 
278
  traces.extend(self._trace_list(r_exec))
279
  if not getattr(r_exec, "trace", None):
280
+ _fallback_trace("executor", dt, r_exec.ok)
281
  if not r_exec.ok and r_exec.error:
282
+ details.extend(r_exec.error) # soft: keep for repair/verifier context
 
283
 
284
  # --- 6) verifier ---
285
+ t0 = time.perf_counter()
286
  r_ver = self._safe_stage(
287
+ self.verifier.run,
288
+ sql=sql,
289
+ exec_result=(r_exec.data or {}),
290
+ adapter=getattr(
291
+ self.executor, "adapter", None
292
+ ), # let verifier use adapter
293
  )
294
+ dt = (time.perf_counter() - t0) * 1000.0
295
+ stage_duration_ms.labels("verifier").observe(dt)
296
  traces.extend(self._trace_list(r_ver))
297
  if not getattr(r_ver, "trace", None):
298
+ _fallback_trace("verifier", dt, r_ver.ok)
299
  verified = bool(r_ver.data and r_ver.data.get("verified")) or r_ver.ok
300
 
301
+ # consume repaired SQL from verifier if any
302
+ if r_ver.data and "sql" in r_ver.data and r_ver.data["sql"]:
303
+ sql = r_ver.data["sql"]
304
+
305
+ # --- 7) repair loop (if not verified) ---
306
  if not verified:
307
  for _attempt in range(2):
308
  # repair
309
+ t0 = time.perf_counter()
310
  r_fix = self._safe_stage(
311
  self.repair.run,
312
  sql=sql,
313
  error_msg="; ".join(details or ["unknown"]),
314
  schema_preview=schema_preview,
315
  )
316
+ dt = (time.perf_counter() - t0) * 1000.0
317
+ stage_duration_ms.labels("repair").observe(dt)
318
  traces.extend(self._trace_list(r_fix))
319
  if not getattr(r_fix, "trace", None):
320
+ _fallback_trace("repair", dt, r_fix.ok)
321
  if not r_fix.ok:
322
+ break
323
 
324
+ # update SQL
325
  sql = (r_fix.data or {}).get("sql", sql)
326
 
327
+ # safety again
328
+ t0 = time.perf_counter()
329
+ r_safe2 = self._safe_stage(self.safety.run, sql=sql)
330
+ dt2 = (time.perf_counter() - t0) * 1000.0
331
+ stage_duration_ms.labels("safety").observe(dt2)
332
+ traces.extend(self._trace_list(r_safe2))
333
+ if not getattr(r_safe2, "trace", None):
334
+ _fallback_trace("safety", dt2, r_safe2.ok)
335
+ if not r_safe2.ok:
336
+ if r_safe2.error:
337
+ details.extend(r_safe2.error)
338
  continue
339
+ sql = (r_safe2.data or {}).get("sql", sql)
340
+
341
+ # executor again
342
+ t0 = time.perf_counter()
343
+ r_exec2 = self._safe_stage(self.executor.run, sql=sql)
344
+ dt2 = (time.perf_counter() - t0) * 1000.0
345
+ stage_duration_ms.labels("executor").observe(dt2)
346
+ traces.extend(self._trace_list(r_exec2))
347
+ if not getattr(r_exec2, "trace", None):
348
+ _fallback_trace("executor", dt2, r_exec2.ok)
349
+ if not r_exec2.ok:
350
+ if r_exec2.error:
351
+ details.extend(r_exec2.error)
 
352
  continue
353
 
354
+ # verifier again
355
+ t0 = time.perf_counter()
356
+ r_ver2 = self._safe_stage(
357
+ self.verifier.run,
358
+ sql=sql,
359
+ exec_result=(r_exec2.data or {}),
360
+ adapter=getattr(self.executor, "adapter", None),
361
  )
362
+ dt2 = (time.perf_counter() - t0) * 1000.0
363
+ stage_duration_ms.labels("verifier").observe(dt2)
364
+ traces.extend(self._trace_list(r_ver2))
365
+ if not getattr(r_ver2, "trace", None):
366
+ _fallback_trace("verifier", dt2, r_ver2.ok)
367
  verified = (
368
+ bool(r_ver2.data and r_ver2.data.get("verified")) or r_ver2.ok
369
  )
370
+ if r_ver2.data and "sql" in r_ver2.data and r_ver2.data["sql"]:
371
+ sql = r_ver2.data["sql"]
372
  if verified:
373
  break
374
 
375
+ # --- 8) optional soft auto-verify (executor success, no details) ---
376
  if (verified is None or not verified) and not details:
377
  any_exec_ok = any(
378
  t.get("stage") == "executor"
 
392
 
393
  # --- 9) finalize ---
394
  has_errors = bool(details)
395
+ need_ver = bool(self.require_verification)
396
+
397
+ # base success condition
398
+ final_ok_by_verifier = bool(verified)
399
+ base_ok = (
400
+ bool(sql) and not has_errors and (final_ok_by_verifier or not need_ver)
401
+ )
402
+ ok = base_ok
403
+ err = (not ok) and has_errors
404
 
405
+ # align `verified` with baseline semantics:
406
+ # if verification is NOT required and pipeline is ok, report verified=True
407
+ if not need_ver and ok and not final_ok_by_verifier:
408
+ verified_final = True
409
  else:
410
+ verified_final = bool(verified)
411
+
412
+ pipeline_runs_total.labels(status=("ok" if ok else "error")).inc()
413
 
414
  traces.append(
415
  self._mk_trace(
 
417
  duration_ms=0.0,
418
  summary="finalize",
419
  notes={
420
+ "final_verified": bool(verified_final),
421
  "details_len": len(details),
422
+ "need_verification": need_ver,
423
  },
424
  )
425
  )
 
431
  details=details or None,
432
  sql=sql,
433
  rationale=rationale,
434
+ verified=verified_final,
435
  questions=None,
436
  traces=self._normalize_traces(traces),
437
  )
438
 
439
  except Exception:
 
440
  pipeline_runs_total.labels(status="error").inc()
441
+ # bubble up to make failures visible in tests and logs
442
  raise
443
 
444
  finally:
445
+ # Always record total latency, even on early return/exception
446
  stage_duration_ms.labels("pipeline_total").observe(
447
  (time.perf_counter() - t_all0) * 1000.0
448
  )
nl2sql/verifier.py CHANGED
@@ -1,8 +1,7 @@
1
  from __future__ import annotations
2
-
3
  import re
4
  import time
5
- from typing import Any, Iterable, List, Optional
6
 
7
  import sqlglot
8
  from sqlglot import expressions as exp
@@ -10,24 +9,65 @@ from sqlglot import expressions as exp
10
  from nl2sql.types import StageResult, StageTrace
11
  from nl2sql.metrics import (
12
  verifier_checks_total,
13
- stage_duration_ms,
14
  verifier_failures_total,
15
  )
16
 
17
 
18
  def _ms(t0: float) -> int:
 
19
  return int((time.perf_counter() - t0) * 1000)
20
 
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  class Verifier:
23
  name = "verifier"
24
 
25
- # Textual fallback: scan for common aggregate calls
26
  _AGG_CALL_RE = re.compile(r"\b(count|sum|avg|min|max)\s*\(", re.IGNORECASE)
27
 
28
- # ----------------------- AST helpers (version-friendly) --------------------
 
 
 
 
29
  def _walk(self, node: exp.Expression) -> Iterable[exp.Expression]:
30
- """Non-recursive DFS over sqlglot Expression tree (avoid private APIs)."""
31
  stack = [node]
32
  while stack:
33
  cur = stack.pop()
@@ -43,6 +83,7 @@ class Verifier:
43
  stack.append(it)
44
 
45
  def _first_select(self, tree: exp.Expression) -> Optional[exp.Select]:
 
46
  for n in self._walk(tree):
47
  if isinstance(n, exp.Select):
48
  return n
@@ -50,27 +91,22 @@ class Verifier:
50
 
51
  def _has_group_by(self, tree: exp.Expression) -> bool:
52
  sel = self._first_select(tree)
53
- if not sel:
54
- return False
55
- # sqlglot stores GROUP BY on Select.group
56
- return bool(getattr(sel, "group", None))
57
 
58
  def _is_distinct_projection(self, tree: exp.Expression) -> bool:
59
  sel = self._first_select(tree)
60
  if not sel:
61
  return False
62
- # DISTINCT may appear as Select.distinct or a Distinct node
63
  if getattr(sel, "distinct", None):
64
  return True
65
  return any(isinstance(n, exp.Distinct) for n in self._walk(sel))
66
 
67
  def _has_windowed_aggregate(self, tree: exp.Expression) -> bool:
68
- # If there is any OVER(...) window, aggregates without GROUP BY can be legitimate
69
  return any(isinstance(n, exp.Window) for n in self._walk(tree))
70
 
71
  def _expr_contains_agg(self, node: exp.Expression) -> bool:
72
- """True if subtree contains an aggregate call (robust across sqlglot versions)."""
73
- # Build aggregate classes dynamically to avoid attr errors and fixed-length tuples
74
  agg_type_names = (
75
  "Count",
76
  "Sum",
@@ -81,26 +117,24 @@ class Verifier:
81
  "ArrayAgg",
82
  "StringAgg",
83
  )
84
- agg_types_list: list[type] = []
85
- for name in agg_type_names:
86
- t = getattr(exp, name, None)
87
- if isinstance(t, type):
88
- agg_types_list.append(t)
89
- AGG_TYPES: tuple[type, ...] = tuple(agg_types_list)
90
-
91
- # 1) Class-based check (if we found any known aggregate classes)
92
- if AGG_TYPES and any(isinstance(n, AGG_TYPES) for n in self._walk(node)):
93
  return True
94
 
95
- # 2) Fallback: generic function nodes with aggregate names
96
  Anonymous = getattr(exp, "Anonymous", None)
97
  func_like = (exp.Func,) + ((Anonymous,) if isinstance(Anonymous, type) else ())
98
- AGG_NAMES = {"count", "sum", "avg", "min", "max"}
99
 
100
- def _func_name(n: exp.Expression) -> str:
101
- name = getattr(n, "name", None)
102
- if isinstance(name, str) and name:
103
- return name.lower()
104
  this = getattr(n, "this", None)
105
  if isinstance(this, str):
106
  return this.lower()
@@ -110,82 +144,138 @@ class Verifier:
110
  return (str(this) or "").lower()
111
 
112
  for n in self._walk(node):
113
- if isinstance(n, func_like) and _func_name(n) in AGG_NAMES:
114
- return True
115
-
116
- return False
117
-
118
- def _has_nonagg_column(self, node: exp.Expression) -> bool:
119
- """Subtree contains a column reference that is NOT inside an aggregate."""
120
- # Check if there are any columns in this expression
121
- columns = [n for n in self._walk(node) if isinstance(n, exp.Column)]
122
- if not columns:
123
- return False
124
-
125
- # Check if all columns are inside aggregates
126
- for col in columns:
127
- # Walk up from column to see if it's inside an aggregate
128
- # is_in_agg = False
129
- # For simplicity, check if the entire expression contains both column and aggregate
130
- # A more precise check would require parent tracking
131
- if self._expr_contains_agg(node):
132
- # This is a simplified check - if the node has both columns and aggregates,
133
- # we need more complex logic to determine if columns are outside aggregates
134
- return True
135
- else:
136
- # No aggregates, so if there are columns, they're non-aggregate
137
  return True
138
  return False
139
 
140
- # ----------------------- Textual fallback helpers -------------------------
141
  def _clean_sql_for_fn_scan(self, sql: str) -> str:
142
- """Remove comments/strings so regex won't be fooled."""
143
  s = re.sub(r"/\*.*?\*/", " ", sql, flags=re.DOTALL) # block comments
144
  s = re.sub(r"--.*?$", " ", s, flags=re.MULTILINE) # line comments
145
  s = re.sub(
146
  r"('([^']|'')*'|\"([^\"]|\"\")*\"|`[^`]*`)", " ", s
147
- ) # quoted strings / idents
148
  s = re.sub(r"\s+", " ", s).strip()
149
  return s
150
 
151
- # ----------------------- Adapter result helpers ---------------------------
152
- def _extract_ok(self, exec_result: Any) -> Optional[bool]:
153
- if isinstance(exec_result, dict):
154
- v = exec_result.get("ok")
155
- if isinstance(v, bool):
156
- return v
 
 
 
 
 
 
 
157
  return None
158
 
159
- def _extract_error(self, exec_result: Any) -> Optional[str]:
160
- if isinstance(exec_result, dict):
161
- for k in ("error", "message", "detail"):
162
- if k in exec_result and exec_result[k]:
163
- return str(exec_result[k])
164
- return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
- # ----------------------------- Main entry ---------------------------------
167
- def verify(self, sql: str, *, adapter: Any) -> StageResult:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  t0 = time.perf_counter()
169
  issues: List[str] = []
 
170
 
171
- # 1) Parse - Check for errors in the parsed result
172
- try:
173
- tree = sqlglot.parse_one(sql, read=None) # autodetect dialect
 
 
 
 
 
 
 
174
 
175
- # Check if the parse actually succeeded
 
 
176
  if tree is None:
177
  return StageResult(
178
  ok=False,
179
  error=["parse_error"],
180
  trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
181
  )
182
-
183
- # sqlglot may parse broken SQL as an "Unknown" or "Command" type
184
- # Check if we got a proper SQL statement type
185
  tree_type = type(tree).__name__
186
-
187
- # Check for common sqlglot error indicators
188
- # When sqlglot can't parse properly, it often creates Command or Unknown nodes
189
  if tree_type in ("Command", "Unknown"):
190
  verifier_checks_total.labels(ok="false").inc()
191
  verifier_failures_total.labels(reason="parse_error").inc()
@@ -194,36 +284,6 @@ class Verifier:
194
  error=["parse_error"],
195
  trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
196
  )
197
-
198
- # Also check if the tree has errors attribute (some versions of sqlglot)
199
- if hasattr(tree, "errors") and tree.errors:
200
- verifier_checks_total.labels(ok="false").inc()
201
- verifier_failures_total.labels(reason="parse_error").inc()
202
- return StageResult(
203
- ok=False,
204
- error=["parse_error"],
205
- trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
206
- )
207
-
208
- # Additional check: if it's not a recognized DML/DQL statement
209
- valid_types = ("Select", "With", "Union", "Intersect", "Except", "Values")
210
- if tree_type not in valid_types:
211
- # This might be a parse error disguised as a different statement type
212
- # Let's check if it looks like it should be a SELECT
213
- sql_lower = sql.lower().strip()
214
- if any(
215
- sql_lower.startswith(kw)
216
- for kw in ["selct", "slect", "selet", "seelct"]
217
- ):
218
- # Common misspellings of SELECT
219
- verifier_checks_total.labels(ok="false").inc()
220
- verifier_failures_total.labels(reason="parse_error").inc()
221
- return StageResult(
222
- ok=False,
223
- error=["parse_error"],
224
- trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
225
- )
226
-
227
  except Exception:
228
  verifier_checks_total.labels(ok="false").inc()
229
  verifier_failures_total.labels(reason="parse_error").inc()
@@ -233,29 +293,22 @@ class Verifier:
233
  trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
234
  )
235
 
236
- # 2) Semantic checks (AST-first)
237
  try:
238
  sel = self._first_select(tree)
239
  if sel:
240
  has_group = self._has_group_by(tree)
241
  has_window = self._has_windowed_aggregate(tree)
242
  is_distinct = self._is_distinct_projection(tree)
243
-
244
  select_items = list(getattr(sel, "expressions", []) or [])
245
  any_agg = any(self._expr_contains_agg(it) for it in select_items)
246
-
247
- # More precise check for non-aggregate columns
248
- any_nonagg_col = False
249
- for item in select_items:
250
- # Check if this select item has columns but no aggregates
251
- has_cols = any(isinstance(n, exp.Column) for n in self._walk(item))
252
- has_aggs = self._expr_contains_agg(item)
253
- if has_cols and not has_aggs:
254
- any_nonagg_col = True
255
- break
256
-
257
- # Core rule: aggregate + non-aggregate column without GROUP BY is an issue,
258
- # unless DISTINCT or windowed aggregate makes it legitimate.
259
  if (
260
  any_agg
261
  and any_nonagg_col
@@ -264,72 +317,111 @@ class Verifier:
264
  verifier_failures_total.labels(reason="semantic_error").inc()
265
  issues.append("aggregation_without_group_by")
266
  except Exception as e:
267
- # Don't crash the verifier; surface a soft issue and let fallback run
268
  verifier_failures_total.labels(reason="semantic_error").inc()
269
  issues.append(f"semantic_check_error:{e!s}")
270
-
271
- # 3) Fallback textual scan only if AST didn't already flag
272
- if not any("aggregation_without_group_by" in i for i in issues):
273
- try:
274
- cleaned = self._clean_sql_for_fn_scan(sql)
275
- has_agg_call = bool(self._AGG_CALL_RE.search(cleaned))
276
- has_group_kw = re.search(r"\bgroup\s+by\b", cleaned, re.IGNORECASE)
277
- has_over_kw = re.search(r"\bover\s*\(", cleaned, re.IGNORECASE)
278
- has_distinct_kw = re.search(
279
- r"\bselect\s+distinct\b", cleaned, re.IGNORECASE
280
  )
281
-
282
- if has_agg_call and not (
283
- has_group_kw or has_over_kw or has_distinct_kw
284
- ):
285
- m_sel = re.search(
286
- r"\bselect\s+(?P<sel>.+?)\s+\bfrom\b",
287
- cleaned,
288
- re.IGNORECASE | re.DOTALL,
289
- )
290
- if m_sel:
291
- select_list = m_sel.group("sel")
292
- # a comma strongly suggests mixing aggregate and non-aggregate in projection
293
- if "," in select_list:
 
 
 
 
 
 
 
 
294
  verifier_failures_total.labels(
295
- reason="agg_without_group_by"
296
  ).inc()
297
  issues.append("aggregation_without_group_by")
298
- except Exception:
299
- # ignore fallback errors
300
- pass
 
 
 
 
 
 
 
 
 
 
 
301
 
302
- # 4) Optional: cheap preview execution (adapter may be a stub in tests)
 
 
303
  try:
304
- exec_result = adapter.execute_preview(sql) if adapter else {"ok": True}
305
- ok_val = self._extract_ok(exec_result)
306
- if ok_val is False:
307
- err = self._extract_error(exec_result)
 
 
 
 
 
 
 
 
 
 
 
 
 
308
  verifier_failures_total.labels(reason="preview_exec_error").inc()
309
- issues.append(f"exec_error:{err}" if err else "exec_error")
310
  except Exception as e:
311
  verifier_failures_total.labels(reason="preview_exec_error").inc()
312
  issues.append(f"exec_exception:{e!s}")
313
 
314
- # 5) Final decision AFTER all checks (note: no early return before fallback)
315
- if issues:
316
- verifier_checks_total.labels(ok="false").inc()
317
- stage_duration_ms.labels("verifier").observe(_ms(t0) / 1.0)
 
 
 
 
 
 
 
 
 
 
 
318
  return StageResult(
319
  ok=False,
320
- error=issues,
321
  trace=StageTrace(
322
  stage=self.name, duration_ms=_ms(t0), notes={"issues": issues}
323
  ),
324
  )
325
 
326
- verifier_checks_total.labels(ok="true").inc()
327
- stage_duration_ms.labels("verifier").observe(_ms(t0) / 1.0)
328
- return StageResult(
329
- ok=True,
330
- data={"verified": True},
331
- trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
332
- )
333
-
334
- def run(self, *, sql: str, adapter: Any) -> StageResult:
335
- return self.verify(sql, adapter=adapter)
 
1
  from __future__ import annotations
 
2
  import re
3
  import time
4
+ from typing import Any, Iterable, List, Optional, Dict, Tuple
5
 
6
  import sqlglot
7
  from sqlglot import expressions as exp
 
9
  from nl2sql.types import StageResult, StageTrace
10
  from nl2sql.metrics import (
11
  verifier_checks_total,
 
12
  verifier_failures_total,
13
  )
14
 
15
 
16
  def _ms(t0: float) -> int:
17
+ """Return elapsed milliseconds since t0, as int."""
18
  return int((time.perf_counter() - t0) * 1000)
19
 
20
 
21
+ # ---------------- Small Levenshtein distance for schema matching ----------------
22
+ def _lev(a: str, b: str) -> int:
23
+ n = len(b)
24
+
25
+ dp = list(range(n + 1))
26
+ for i, ca in enumerate(a, 1):
27
+ prev, dp[0] = dp[0], i
28
+ for j, cb in enumerate(b, 1):
29
+ cur = min(
30
+ dp[j] + 1, # delete
31
+ dp[j - 1] + 1, # insert
32
+ prev + (0 if ca == cb else 1), # replace
33
+ )
34
+ prev, dp[j] = dp[j], cur
35
+ return dp[n]
36
+
37
+
38
def _closest(name: str, candidates: List[str]) -> Tuple[str, int]:
    """Return ``(candidate, distance)`` with the smallest edit distance to *name*.

    Matching is case-insensitive; ties keep the earliest candidate.  With an
    empty candidate list, returns (*name*, a huge sentinel distance).
    """
    target = name.lower()
    best_name, best_dist = name, 10**9
    for cand in candidates:
        d = _lev(target, cand.lower())
        if d < best_dist:
            best_name, best_dist = cand, d
    return best_name, best_dist
46
+
47
+
48
+ def _maybe_singular(plural: str, tables: List[str]) -> Optional[str]:
49
+ """Simple singularization heuristic: 'singers' -> 'singer'."""
50
+ if plural.endswith("s"):
51
+ cand = plural[:-1]
52
+ if cand in tables:
53
+ return cand
54
+ return None
55
+
56
+
57
+ # ---------------- Verifier with schema-aware repair ----------------
58
  class Verifier:
59
  name = "verifier"
60
 
61
+ # Aggregate call detector used by both AST and regex fallbacks
62
  _AGG_CALL_RE = re.compile(r"\b(count|sum|avg|min|max)\s*\(", re.IGNORECASE)
63
 
64
+ # Fast token sanity: require SELECT and FROM to exist in the cleaned SQL
65
+ _REQ_SELECT = re.compile(r"\bselect\b", re.IGNORECASE)
66
+ _REQ_FROM = re.compile(r"\bfrom\b", re.IGNORECASE)
67
+
68
+ # ---------- AST helpers ----------
69
  def _walk(self, node: exp.Expression) -> Iterable[exp.Expression]:
70
+ """Depth-first traversal of a SQLGlot AST."""
71
  stack = [node]
72
  while stack:
73
  cur = stack.pop()
 
83
  stack.append(it)
84
 
85
  def _first_select(self, tree: exp.Expression) -> Optional[exp.Select]:
86
+ """Return the first SELECT node from the AST (if any)."""
87
  for n in self._walk(tree):
88
  if isinstance(n, exp.Select):
89
  return n
 
91
 
92
  def _has_group_by(self, tree: exp.Expression) -> bool:
93
  sel = self._first_select(tree)
94
+ return bool(getattr(sel, "group", None)) if sel else False
 
 
 
95
 
96
  def _is_distinct_projection(self, tree: exp.Expression) -> bool:
97
  sel = self._first_select(tree)
98
  if not sel:
99
  return False
 
100
  if getattr(sel, "distinct", None):
101
  return True
102
  return any(isinstance(n, exp.Distinct) for n in self._walk(sel))
103
 
104
  def _has_windowed_aggregate(self, tree: exp.Expression) -> bool:
 
105
  return any(isinstance(n, exp.Window) for n in self._walk(tree))
106
 
107
  def _expr_contains_agg(self, node: exp.Expression) -> bool:
108
+ """Return True if an expression contains an aggregate function."""
109
+ agg_names = {"count", "sum", "avg", "min", "max"}
110
  agg_type_names = (
111
  "Count",
112
  "Sum",
 
117
  "ArrayAgg",
118
  "StringAgg",
119
  )
120
+ agg_types = tuple(
121
+ t
122
+ for t in (getattr(exp, n, None) for n in agg_type_names)
123
+ if isinstance(t, type)
124
+ )
125
+
126
+ # AST type-based check (preferred)
127
+ if agg_types and any(isinstance(n, agg_types) for n in self._walk(node)):
 
128
  return True
129
 
130
+ # Fallback: function-like name check
131
  Anonymous = getattr(exp, "Anonymous", None)
132
  func_like = (exp.Func,) + ((Anonymous,) if isinstance(Anonymous, type) else ())
 
133
 
134
+ def _fname(n: exp.Expression) -> str:
135
+ nm = getattr(n, "name", None)
136
+ if isinstance(nm, str) and nm:
137
+ return nm.lower()
138
  this = getattr(n, "this", None)
139
  if isinstance(this, str):
140
  return this.lower()
 
144
  return (str(this) or "").lower()
145
 
146
  for n in self._walk(node):
147
+ if isinstance(n, func_like) and _fname(n) in agg_names:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  return True
149
  return False
150
 
 
151
  def _clean_sql_for_fn_scan(self, sql: str) -> str:
152
+ """Normalize SQL before scanning for function names or keywords."""
153
  s = re.sub(r"/\*.*?\*/", " ", sql, flags=re.DOTALL) # block comments
154
  s = re.sub(r"--.*?$", " ", s, flags=re.MULTILINE) # line comments
155
  s = re.sub(
156
  r"('([^']|'')*'|\"([^\"]|\"\")*\"|`[^`]*`)", " ", s
157
+ ) # quoted strings
158
  s = re.sub(r"\s+", " ", s).strip()
159
  return s
160
 
161
+ # ---------------- Schema-Guard Repair ----------------
162
+ def _schema_dict(self, adapter: Any) -> Optional[Dict[str, List[str]]]:
163
+ """Fetch schema dict {table: [columns]} from adapter if available."""
164
+ if not adapter:
165
+ return None
166
+ get = getattr(adapter, "schema_dict", None)
167
+ if callable(get):
168
+ try:
169
+ d = get()
170
+ if isinstance(d, dict):
171
+ return {str(k): list(v) for k, v in d.items()}
172
+ except Exception:
173
+ return None
174
  return None
175
 
176
+ def _repair_with_schema(
177
+ self, sql: str, schema: Dict[str, List[str]]
178
+ ) -> Tuple[str, bool, List[str]]:
179
+ """Try to fix table/column names using schema similarity (singularize + closest edit-distance <= 2)."""
180
+ notes: List[str] = []
181
+ try:
182
+ ast = sqlglot.parse_one(sql)
183
+ except Exception as e:
184
+ return sql, False, [f"parse_error:{e!s}"]
185
+
186
+ tables = list(schema.keys())
187
+ changed = False
188
+
189
+ # Fix table names
190
+ def _fix_table(node: exp.Expression) -> exp.Expression:
191
+ nonlocal changed
192
+ if isinstance(node, exp.Table):
193
+ orig = node.name
194
+ if orig in schema:
195
+ return node
196
+ s1 = _maybe_singular(orig, tables)
197
+ if s1:
198
+ changed = True
199
+ return exp.Table(this=sqlglot.to_identifier(s1))
200
+ best, dist = _closest(orig, tables)
201
+ if dist <= 2:
202
+ changed = True
203
+ return exp.Table(this=sqlglot.to_identifier(best))
204
+ return node
205
+
206
+ ast = ast.transform(_fix_table)
207
+
208
+ # Fix column names
209
+ def _fix_col(node: exp.Expression) -> exp.Expression:
210
+ nonlocal changed
211
+ if isinstance(node, exp.Column):
212
+ name = node.name
213
+ if not name:
214
+ return node
215
+ tbl = node.table
216
+ if tbl and tbl in schema:
217
+ candidates = schema[tbl]
218
+ else:
219
+ candidates = [c for cols in schema.values() for c in cols]
220
+ if name in candidates:
221
+ return node
222
+ best, dist = _closest(name, candidates) if candidates else (name, 99)
223
+ if dist <= 2:
224
+ changed = True
225
+ node.set("this", sqlglot.to_identifier(best))
226
+ return node
227
+
228
+ ast = ast.transform(_fix_col)
229
+
230
+ if not changed:
231
+ return sql, True, notes
232
 
233
+ try:
234
+ repaired = ast.sql(dialect="sqlite")
235
+ except Exception as e:
236
+ return sql, False, notes + [f"rebuild_error:{e!s}"]
237
+
238
+ notes.append("schema_guard_repair")
239
+ return repaired, True, notes
240
+
241
+ # ---------------- Main verifier logic ----------------
242
+ def verify(
243
+ self, sql: str, *, exec_result: Any = None, adapter: Any = None
244
+ ) -> StageResult:
245
+ """
246
+ Verify syntax, basic semantics, and optionally schema correctness and preview-execution.
247
+
248
+ Returns:
249
+ StageResult with:
250
+ - ok: boolean
251
+ - data: may include {"verified": True, "sql": <repaired_sql>}
252
+ - trace: StageTrace(stage="verifier", duration_ms=...)
253
+ """
254
  t0 = time.perf_counter()
255
  issues: List[str] = []
256
+ repaired_sql = None
257
 
258
+ # 0) Fast token sanity: must contain SELECT and FROM (handles typos like SELCT/FRM).
259
+ sql_scan = self._clean_sql_for_fn_scan(sql)
260
+ if not self._REQ_SELECT.search(sql_scan) or not self._REQ_FROM.search(sql_scan):
261
+ verifier_checks_total.labels(ok="false").inc()
262
+ verifier_failures_total.labels(reason="parse_error").inc()
263
+ return StageResult(
264
+ ok=False,
265
+ error=["parse_error"],
266
+ trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
267
+ )
268
 
269
+ # 1) Syntax validation via sqlglot
270
+ try:
271
+ tree = sqlglot.parse_one(sql, read=None)
272
  if tree is None:
273
  return StageResult(
274
  ok=False,
275
  error=["parse_error"],
276
  trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
277
  )
 
 
 
278
  tree_type = type(tree).__name__
 
 
 
279
  if tree_type in ("Command", "Unknown"):
280
  verifier_checks_total.labels(ok="false").inc()
281
  verifier_failures_total.labels(reason="parse_error").inc()
 
284
  error=["parse_error"],
285
  trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
286
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
  except Exception:
288
  verifier_checks_total.labels(ok="false").inc()
289
  verifier_failures_total.labels(reason="parse_error").inc()
 
293
  trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
294
  )
295
 
296
+ # 2) Semantic rule: avoid aggregate + non-aggregate mix without GROUP BY (unless DISTINCT/window)
297
  try:
298
  sel = self._first_select(tree)
299
  if sel:
300
  has_group = self._has_group_by(tree)
301
  has_window = self._has_windowed_aggregate(tree)
302
  is_distinct = self._is_distinct_projection(tree)
 
303
  select_items = list(getattr(sel, "expressions", []) or [])
304
  any_agg = any(self._expr_contains_agg(it) for it in select_items)
305
+ any_nonagg_col = any(
306
+ (
307
+ any(isinstance(n, exp.Column) for n in self._walk(it))
308
+ and not self._expr_contains_agg(it)
309
+ )
310
+ for it in select_items
311
+ )
 
 
 
 
 
 
312
  if (
313
  any_agg
314
  and any_nonagg_col
 
317
  verifier_failures_total.labels(reason="semantic_error").inc()
318
  issues.append("aggregation_without_group_by")
319
  except Exception as e:
 
320
  verifier_failures_total.labels(reason="semantic_error").inc()
321
  issues.append(f"semantic_check_error:{e!s}")
322
+ # 2b) Regex fallback for aggregate + non-aggregate without GROUP BY.
323
+ # Skip if DISTINCT or any WINDOW (OVER ...) is present in the SELECT list.
324
+ try:
325
+ low = sql_scan.lower()
326
+ if "group by" not in low and "distinct" not in low:
327
+ m = re.search(
328
+ r"select\s+(?P<sel>.+?)\s+from\b",
329
+ sql_scan,
330
+ flags=re.IGNORECASE | re.DOTALL,
 
331
  )
332
+ if m:
333
+ sel_clause = m.group("sel")
334
+ # If window functions are present, allow (COUNT(*) OVER (...), etc.)
335
+ if re.search(r"\bover\b", sel_clause, flags=re.IGNORECASE):
336
+ pass # windowed aggregates are acceptable without GROUP BY
337
+ else:
338
+ has_agg = bool(self._AGG_CALL_RE.search(sel_clause))
339
+ # Heuristic: presence of a comma OR a bare identifier besides pure aggregate-only select
340
+ has_bare_col = "," in sel_clause or (
341
+ bool(re.search(r"\b[a-zA-Z_][\w.]*\b", sel_clause))
342
+ and not re.fullmatch(
343
+ r"\s*(count|sum|avg|min|max)\s*\([^)]*\)\s*",
344
+ sel_clause,
345
+ flags=re.IGNORECASE,
346
+ )
347
+ )
348
+ if (
349
+ has_agg
350
+ and has_bare_col
351
+ and "aggregation_without_group_by" not in issues
352
+ ):
353
  verifier_failures_total.labels(
354
+ reason="semantic_error"
355
  ).inc()
356
  issues.append("aggregation_without_group_by")
357
+ except Exception:
358
+ # Non-fatal; AST path already attempted.
359
+ pass
360
+
361
+ # 3) Schema-based auto-repair (optional)
362
+ schema = self._schema_dict(adapter)
363
+ if schema:
364
+ fixed, ok_fix, notes = self._repair_with_schema(sql, schema)
365
+ if ok_fix is True and fixed != sql:
366
+ repaired_sql = fixed
367
+ if notes:
368
+ issues.extend(
369
+ [f"note:{n}" for n in notes if not n.startswith("parse_error")]
370
+ )
371
 
372
+ # 4) Preview execution check:
373
+ # - If exec_result is provided, use it directly
374
+ # - Otherwise, if adapter has execute_preview, run it
375
  try:
376
+ if exec_result is not None:
377
+ er = exec_result
378
+ elif adapter is not None and hasattr(adapter, "execute_preview"):
379
+ er = adapter.execute_preview(repaired_sql or sql)
380
+ else:
381
+ er = {"ok": True}
382
+
383
+ ok_val = (
384
+ isinstance(er, dict) and isinstance(er.get("ok"), bool) and er["ok"]
385
+ )
386
+ if not ok_val:
387
+ msg = None
388
+ if isinstance(er, dict):
389
+ for k in ("error", "message", "detail"):
390
+ if k in er and er[k]:
391
+ msg = str(er[k])
392
+ break
393
  verifier_failures_total.labels(reason="preview_exec_error").inc()
394
+ issues.append(f"exec_error:{msg or 'preview_failed'}")
395
  except Exception as e:
396
  verifier_failures_total.labels(reason="preview_exec_error").inc()
397
  issues.append(f"exec_exception:{e!s}")
398
 
399
+ # 5) Final result and trace
400
+ is_ok: bool = (not issues) or all(i.startswith("note:") for i in issues)
401
+ ok_label: str = "true" if is_ok else "false"
402
+ verifier_checks_total.labels(ok=ok_label).inc()
403
+
404
+ if is_ok:
405
+ data: Dict[str, Any] = {"verified": True}
406
+ if repaired_sql:
407
+ data["sql"] = repaired_sql
408
+ return StageResult(
409
+ ok=True,
410
+ data=data,
411
+ trace=StageTrace(stage=self.name, duration_ms=_ms(t0)),
412
+ )
413
+ else:
414
  return StageResult(
415
  ok=False,
416
+ error=[i for i in issues if not i.startswith("note:")],
417
  trace=StageTrace(
418
  stage=self.name, duration_ms=_ms(t0), notes={"issues": issues}
419
  ),
420
  )
421
 
422
+ # Public alias for backward compatibility
423
+ def run(
424
+ self, *, sql: str, exec_result: Any = None, adapter: Any = None
425
+ ) -> StageResult:
426
+ """Back-compat wrapper around verify()."""
427
+ return self.verify(sql, exec_result=exec_result, adapter=adapter)