Melika Kheirieh committed · Commit 296a94d
Parent(s): b794494
feat(bench): gold-aware EM/SM/ExecAcc + p50/p95; write per-stage means; richer plots
Files changed:
- benchmarks/evaluate_spider_pro.py +276 -279
- benchmarks/plot_results.py +129 -89
- benchmarks/results_pro/20251109-095552/eval.jsonl +5 -0
- benchmarks/results_pro/20251109-095552/summary.json +12 -0
- benchmarks/results_pro/20251109-100021/eval.jsonl +5 -0
- benchmarks/results_pro/20251109-100021/results.csv +6 -0
- benchmarks/results_pro/20251109-100021/summary.json +21 -0
benchmarks/evaluate_spider_pro.py
CHANGED

@@ -1,7 +1,9 @@

"""
Spider benchmark evaluator (pro):
- Computes EM / SM / ExecAcc vs. gold SQL
- Records per-sample latency and (if present) per-stage timings from pipeline traces
- Persists eval.jsonl (per-sample), summary.json (aggregates incl. p50/p95, per-stage means), results.csv
- No external deps; percentile and normalization are implemented locally.
"""

from __future__ import annotations

@@ -20,423 +22,418 @@ from nl2sql.pipeline_factory import pipeline_from_config_with_adapter

from adapters.db.sqlite_adapter import SQLiteAdapter
from benchmarks.spider_loader import load_spider_sqlite

# -------------------------- Config --------------------------

RESULT_ROOT = Path("benchmarks/results_pro")
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
RESULT_DIR = RESULT_ROOT / TIMESTAMP
STAGES = [
    "detector",
    "planner",
    "generator",
    "safety",
    "executor",
    "verifier",
    "repair",
]

# -------------------------- SQL utils -----------------------


def extract_clean_sql(text: str | None) -> str:
    """Extract a clean SQL string from LLM-ish output (may include fences/JSON)."""
    sql = (text or "").strip()

    # strip ```sql fences
    sql = re.sub(r"```(?:sql)?\s*", "", sql, flags=re.I)
    sql = sql.replace("```", "")

    # JSON-like {"sql": "..."}
    m = re.search(r'"sql"\s*:\s*"([^"]+)"', sql)
    if m:
        sql = m.group(1)

    # unescape
    sql = sql.replace('\\"', '"').replace("\\n", " ").replace("\\t", " ")

    # find first SQL-ish keyword
    m2 = re.search(r"\b(select|with|insert|update|delete)\b[\s\S]+", sql, re.I)
    if m2:
        sql = m2.group(0)

    sql = re.sub(r"\s+", " ", sql).strip().rstrip(";")
    return sql
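
# Example (illustrative, not part of the commit): a fenced, JSON-wrapped model
# reply reduces to bare SQL under the rules above:
#   extract_clean_sql('```sql\n{"sql": "SELECT * FROM singer;"}\n```')
#   -> 'SELECT * FROM singer'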


def normalize_sql(sql: str) -> str:
    """Light normalization to make EM stricter-but-fair."""
    if not sql:
        return ""
    s = sql.strip()
    # unify whitespace but keep literals recognizable
    s = re.sub(r"\s+", " ", s).strip()
    s = s.rstrip(";")

    # drop table prefixes a.b -> b
    s = re.sub(r"\b\w+\.(\w+)\b", r"\1", s)

    # collapse quotes around identifiers
    s = re.sub(r"`([A-Za-z_]\w*)`", r"\1", s)
    s = re.sub(r'"([A-Za-z_]\w*)"', r"\1", s)

    # COUNT(foo) -> COUNT(*), DISTINCT inside COUNT -> COUNT(*)
    s = re.sub(r"(?i)COUNT\s*\(\s*DISTINCT\s+[^)]+\)", "COUNT(*)", s)
    s = re.sub(r"(?i)COUNT\s*\(\s*[A-Za-z_]\w*\s*\)", "COUNT(*)", s)

    # strip trailing LIMIT n
    s = re.sub(r"(?i)\s+LIMIT\s+\d+\s*$", "", s)

    # canonical whitespace + upper keywords for stability
    s = re.sub(r"\s+", " ", s).strip()
    # keyword upper (a bit heuristic)
    for kw in [
        "select",
        "from",
        "where",
        "group by",
        "order by",
        "having",
        "limit",
        "join",
        "on",
        "and",
        "or",
        "asc",
        "desc",
    ]:
        s = re.sub(rf"(?i)\b{kw}\b", kw.upper(), s)
    return s
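
# Example (illustrative): alias prefixes, trailing LIMIT, COUNT arguments, and
# keyword case are all canonicalized, so superficially different strings can
# still match for EM:
#   normalize_sql("select T1.name from singer as T1 limit 5;")
#   -> 'SELECT name FROM singer as T1'
#   normalize_sql("SELECT count(name) FROM singer")
#   -> 'SELECT COUNT(*) FROM singer'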


# ---------------------- Schema extraction -------------------


def get_database_schema(db_path: Path) -> Dict[str, Any]:
    """Extract schema from SQLite database (tables, columns, FKs)."""
    schema: Dict[str, Any] = {"tables": {}}
    if not db_path.exists():
        return schema

    conn = sqlite3.connect(str(db_path))
    cur = conn.cursor()
    try:
        cur.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
        )
        for (table,) in cur.fetchall():
            cur.execute(f"PRAGMA table_info('{table}')")
            cols = [
                {"name": c[1], "type": c[2], "primary_key": bool(c[5])}
                for c in cur.fetchall()
            ]
            cur.execute(f"PRAGMA foreign_key_list('{table}')")
            fks = [
                {"column": fk[3], "referenced_table": fk[2], "referenced_column": fk[4]}
                for fk in cur.fetchall()
            ]
            schema["tables"][table] = {"columns": cols, "foreign_keys": fks}
    finally:
        conn.close()
    return schema


def format_schema_for_prompt(schema: Dict[str, Any]) -> str:
    """Plain-text schema for prompt (minimal but helpful)."""
    if not schema.get("tables"):
        return ""
    lines: List[str] = []
    for t, info in schema["tables"].items():
        cols = [
            f"{c['name']} {c['type']}{' PK' if c.get('primary_key') else ''}"
            for c in info.get("columns", [])
        ]
        lines.append(f"Table: {t}")
        lines.append(f"Columns: {', '.join(cols)}")
        fks = info.get("foreign_keys") or []
        if fks:
            lines.append(
                "FKs: "
                + ", ".join(
                    f"{fk['column']} -> {fk['referenced_table']}.{fk['referenced_column']}"
                    for fk in fks
                )
            )
        lines.append("")  # empty line between tables
    return "\n".join(lines).strip()


# ---------------------- Exec/eval metrics -------------------


def _exec_sql(db: Path, sql: str) -> Tuple[bool, List[Tuple]]:
    """Execute SQL and return success flag and results."""
    if not sql:
        return False, []
    try:
        conn = sqlite3.connect(str(db))
        cur = conn.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        conn.close()
        return True, rows
    except Exception:
        return False, []


def _same_rows(a: List[Tuple], b: List[Tuple]) -> bool:
    return set(a) == set(b) and len(a) == len(b)


def evaluate_sql(pred: str, gold: str, db: Path) -> Dict[str, float]:
    """Return {'em', 'sm', 'exec'}; em/sm in {0.0, 1.0}, exec in {0.0, 0.5, 1.0} (sm ~ set-match)."""
    em = 1.0 if normalize_sql(pred) == normalize_sql(gold) else 0.0

    gold_ok, gold_rows = _exec_sql(db, gold)
    pred_ok, pred_rows = _exec_sql(db, pred)

    sm = 0.0
    exec_acc = 0.0
    if gold_ok and pred_ok:
        if _same_rows(gold_rows, pred_rows):
            sm = 1.0
            exec_acc = 1.0
        else:
            exec_acc = 0.5  # partial credit for executing but mismatched rows
    return {"em": em, "sm": sm, "exec": exec_acc}


# ---------------------- Dataclass + runner ------------------


@dataclass
class SpiderSample:
    question: str
    db_id: str
    db_path: Path
    gold_sql: str


def _percentile(values: List[float], p: float) -> float:
    """Compute p-th percentile (0..100) without numpy."""
    if not values:
        return 0.0
    vals = sorted(values)
    k = (len(vals) - 1) * (p / 100.0)
    f = int(k)
    c = min(f + 1, len(vals) - 1)
    if f == c:
        return float(vals[int(k)])
    return float(vals[f] * (c - k) + vals[c] * (k - f))
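
# Example (illustrative): ranks interpolate linearly, so
#   _percentile([10, 20, 30, 40], 50.0) -> 25.0   (k = 1.5, between 20 and 30)
#   _percentile([10, 20, 30, 40], 95.0) -> 38.5   (k = 2.85, between 30 and 40)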


def _stage_ms_from_trace(trace_item: Dict[str, Any]) -> float:
    """Accepts {'stage': ..., 'ms': ...} OR {'stage': ..., 'duration_ms': ...}."""
    if not trace_item:
        return 0.0
    if "ms" in trace_item:
        try:
            return float(trace_item["ms"])
        except Exception:
            return 0.0
    if "duration_ms" in trace_item:
        try:
            return float(trace_item["duration_ms"])
        except Exception:
            return 0.0
    return 0.0


def _collect_stage_means(eval_rows: List[Dict[str, Any]]) -> Dict[str, float]:
    """Average per-stage ms across all records (0 if absent)."""
    totals = {s: 0.0 for s in STAGES}
    counts = {s: 0 for s in STAGES}
    for r in eval_rows:
        trace_list = r.get("trace") or r.get("traces") or []
        for t in trace_list:
            s = t.get("stage")
            if s in totals:
                ms = _stage_ms_from_trace(t)
                totals[s] += ms
                counts[s] += 1
    return {s: round(totals[s] / counts[s], 2) if counts[s] else 0.0 for s in STAGES}
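
# Example (illustrative): two trace items {"stage": "planner", "duration_ms": 8000}
# and {"stage": "planner", "duration_ms": 6000} average to {"planner": 7000.0, ...};
# stages never seen in any trace report 0.0.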


def run_pipeline_on_sample(
    pipeline: Any,
    sample: SpiderSample,
    schema_cache: Dict[str, str],
    debug: bool = False,
) -> Dict[str, Any]:
    """Run pipeline on one sample and extract normalized prediction + traces."""
    # cache schema
    if sample.db_id not in schema_cache:
        schema_dict = get_database_schema(sample.db_path)
        schema_cache[sample.db_id] = format_schema_for_prompt(schema_dict)
        if debug:
            print(
                f" [schema] Loaded {len(schema_cache[sample.db_id])} chars for {sample.db_id}"
            )

    schema = schema_cache[sample.db_id]

    try:
        res = pipeline.run(user_query=sample.question, schema_preview=schema)
        # extract SQL
        pred_sql = ""
        if hasattr(res, "sql") and res.sql:
            pred_sql = extract_clean_sql(res.sql)
        else:
            for attr in ("final_sql", "generated_sql", "answer"):
                if getattr(res, attr, None):
                    pred_sql = extract_clean_sql(str(getattr(res, attr)))
                    if pred_sql:
                        break
        return {
            "ok": bool(getattr(res, "ok", True)),
            "sql": pred_sql,
            "trace": getattr(res, "traces", []) or getattr(res, "trace", []),
            "error": None,
        }
    except Exception as e:
        if debug:
            import traceback

            traceback.print_exc()
        return {"ok": False, "sql": "", "trace": [], "error": str(e)}


# --------------------------- Main --------------------------


def main() -> None:
    ap = argparse.ArgumentParser(description="Evaluate NL2SQL on Spider (pro)")
    ap.add_argument("--spider", action="store_true", help="Use Spider dataset loader")
    ap.add_argument("--split", default="dev", choices=["dev", "train"])
    ap.add_argument("--limit", type=int, default=20)
    ap.add_argument("--debug", action="store_true")
    ap.add_argument("--config", default="configs/sqlite_pipeline.yaml")
    args = ap.parse_args()

    if not args.spider:
        print("Use --spider to run Spider evaluation.")
        return

    # load items
    print(f"Loading Spider {args.split} split...")
    items = load_spider_sqlite(split=args.split, limit=args.limit)
    if not items:
        print("❌ No samples loaded. Check SPIDER_ROOT.")
        return
    print(f"✅ Loaded {len(items)} samples")

    RESULT_DIR.mkdir(parents=True, exist_ok=True)
    schema_cache: Dict[str, str] = {}
    eval_rows: List[Dict[str, Any]] = []

    for i, it in enumerate(items, 1):
        sample = SpiderSample(
            question=it.question,
            db_id=it.db_id,
            db_path=Path(it.db_path),
            gold_sql=it.gold_sql,
        )
        print(f"\n🔧 [{i}/{len(items)}] [{sample.db_id}] {sample.question}")

        adapter = SQLiteAdapter(str(sample.db_path))
        pipeline = pipeline_from_config_with_adapter(args.config, adapter=adapter)

        t0 = time.perf_counter()
        out = run_pipeline_on_sample(pipeline, sample, schema_cache, args.debug)
        latency_ms = int((time.perf_counter() - t0) * 1000)

        metrics = evaluate_sql(out["sql"], sample.gold_sql, sample.db_path)
        row = {
            "source": "spider",
            "db_id": sample.db_id,
            "query": sample.question,
            "gold_sql": sample.gold_sql,
            "pred_sql": out["sql"],
            "ok": out["ok"],
            "latency_ms": latency_ms,
            "em": metrics["em"],
            "sm": metrics["sm"],
            "exec_acc": metrics["exec"],
            "error": out.get("error"),
            "trace": out.get("trace", []),
        }
        eval_rows.append(row)

        if args.debug:
            status = "✅" if row["ok"] and row["em"] == 1.0 else "⚠️"
            print(
                f"{status} ({latency_ms} ms) | EM={row['em']} SM={row['sm']} ExecAcc={row['exec_acc']}"
            )
            if row["em"] < 1.0:
                print(f" gold: {sample.gold_sql}")
                print(f" pred: {out['sql'] or 'EMPTY'}")

    # persist eval.jsonl
    RESULT_ROOT.mkdir(parents=True, exist_ok=True)
    RESULT_DIR.mkdir(parents=True, exist_ok=True)
    with (RESULT_DIR / "eval.jsonl").open("w", encoding="utf-8") as f:
        for r in eval_rows:
            json.dump(r, f, ensure_ascii=False)
            f.write("\n")

    # aggregates
    total = len(eval_rows)
    success = sum(1 for r in eval_rows if r["ok"])
    avg_em = sum(r["em"] for r in eval_rows) / total if total else 0.0
    avg_sm = sum(r["sm"] for r in eval_rows) / total if total else 0.0
    avg_exec = sum(r["exec_acc"] for r in eval_rows) / total if total else 0.0
    avg_lat = sum(r["latency_ms"] for r in eval_rows) / total if total else 0.0
    p50 = _percentile([r["latency_ms"] for r in eval_rows], 50.0)
    p95 = _percentile([r["latency_ms"] for r in eval_rows], 95.0)

    stage_means = _collect_stage_means(eval_rows)

    summary = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "split": args.split,
        "config": args.config,
        "total": total,
        "success": success,
        "success_rate": round(success / total, 3) if total else 0.0,
        "avg_latency_ms": round(avg_lat, 1),
        "p50_latency_ms": round(p50, 1),
        "p95_latency_ms": round(p95, 1),
        "EM": round(avg_em, 3),
        "SM": round(avg_sm, 3),
        "ExecAcc": round(avg_exec, 3),
        **{f"{s}_avg_ms": stage_means[s] for s in STAGES},
    }

    (RESULT_DIR / "summary.json").write_text(
        json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    # CSV
    with (RESULT_DIR / "results.csv").open("w", encoding="utf-8") as f:
        f.write("db_id,query,ok,em,sm,exec_acc,latency_ms\n")
        for r in eval_rows:
            f.write(
                f"{r['db_id']},{json.dumps(r['query'])},{'✅' if r['ok'] else '❌'},"
                f"{r['em']},{r['sm']},{r['exec_acc']},{r['latency_ms']}\n"
            )

    print("\n================== Evaluation Summary ==================")
    print(f"Total samples: {total}")
    print(f"Successful runs: {success} ({summary['success_rate'] * 100:.1f}%)")
    print(f"Avg EM: {summary['EM']}")
    print(f"Avg SM: {summary['SM']}")
    print(f"Avg ExecAcc: {summary['ExecAcc']}")
    print(
        f"Avg Latency: {summary['avg_latency_ms']} ms | p50={summary['p50_latency_ms']} ms | p95={summary['p95_latency_ms']} ms"
    )
    print(f"Results saved to {RESULT_DIR}")
    print("========================================================")
benchmarks/plot_results.py
CHANGED

@@ -1,101 +1,141 @@

"""
Plot latest Spider benchmark results.

Outputs in the latest folder under benchmarks/results_pro/:
- metrics_overview.png: EM/SM/ExecAcc + latency (avg, p50, p95)
- latency_per_stage.png: bar of average per-stage latency
- latency_histogram.png: latency distribution across samples

If summary.json lacks per-stage averages, they are derived from eval.jsonl traces.
"""

from __future__ import annotations

import json
from pathlib import Path

import matplotlib.pyplot as plt

ROOT = Path("benchmarks/results_pro")


def _latest_run_dir() -> Path:
    summaries = sorted(
        ROOT.glob("*/summary.json"), key=lambda p: p.stat().st_mtime, reverse=True
    )
    if not summaries:
        raise SystemExit("❌ No benchmark results found under benchmarks/results_pro/")
    return summaries[0].parent


def _load_summary(run: Path) -> dict:
    return json.loads((run / "summary.json").read_text(encoding="utf-8"))


def _load_eval_rows(run: Path) -> list[dict]:
    lines = (run / "eval.jsonl").read_text(encoding="utf-8").splitlines()
    return [json.loads(x) for x in lines]


def plot_metrics_overview(run: Path, summary: dict) -> None:
    # EM/SM/ExecAcc on [0,1]; latency in ms (shown as seconds for scale)
    labels = ["EM", "SM", "ExecAcc", "avg(s)", "p50(s)", "p95(s)"]
    values = [
        summary.get("EM", 0.0),
        summary.get("SM", 0.0),
        summary.get("ExecAcc", 0.0),
        summary.get("avg_latency_ms", 0.0) / 1000.0,
        summary.get("p50_latency_ms", 0.0) / 1000.0,
        summary.get("p95_latency_ms", 0.0) / 1000.0,
    ]

    plt.figure(figsize=(9, 5))
    bars = plt.bar(labels, values)
    for b, v in zip(bars, values):
        plt.text(b.get_x() + b.get_width() / 2, v, f"{v:.2f}", ha="center", va="bottom")
    plt.title("Metrics Overview (Spider)")
    plt.ylim(0, max(1.0, max(values) * 1.15 if values else 1.0))
    plt.tight_layout()
    plt.savefig(run / "metrics_overview.png")
    plt.close()
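
# Note: accuracy (0-1) and latency bars share one axis above only because latency
# is rescaled to seconds; for much slower runs, a secondary axis (plt.twinx())
# would keep the small accuracy bars readable.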


def plot_latency_hist(run: Path, rows: list[dict]) -> None:
    latencies = [
        r.get("latency_ms", 0)
        for r in rows
        if isinstance(r.get("latency_ms"), (int, float))
    ]
    if not latencies:
        return
    plt.figure(figsize=(9, 4))
    plt.hist(latencies, bins=min(20, max(5, int(len(latencies) ** 0.5))))
    plt.title("Latency Distribution (ms)")
    plt.xlabel("Latency (ms)")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(run / "latency_histogram.png")
    plt.close()


def plot_latency_per_stage(run: Path, summary: dict, rows: list[dict]) -> None:
    stages = [
        "detector",
        "planner",
        "generator",
        "safety",
        "executor",
        "verifier",
        "repair",
    ]
    # prefer summary keys if available; else derive from traces
    raw_values = [summary.get(f"{s}_avg_ms") for s in stages]
    # convert Nones to 0.0
    values: list[float] = [float(v or 0.0) for v in raw_values]

    if not any(values):
        totals = {s: 0.0 for s in stages}
        counts = {s: 0 for s in stages}
        for r in rows:
            trace = r.get("trace") or r.get("traces") or []
            for t in trace:
                s = t.get("stage")
                if s in totals:
                    ms = t.get("ms", t.get("duration_ms", 0.0))
                    try:
                        totals[s] += float(ms)
                        counts[s] += 1
                    except Exception:
                        pass
        values = [round(totals[s] / counts[s], 2) if counts[s] else 0.0 for s in stages]

    plt.figure(figsize=(10, 5))
    bars = plt.bar(stages, values)
    for b, v in zip(bars, values):
        plt.text(
            b.get_x() + b.get_width() / 2,
            float(v),
            f"{v:.1f}",
            ha="center",
            va="bottom",
        )
    plt.title("Average Latency per Stage (ms)")
    plt.xlabel("Stage")
    plt.ylabel("Latency (ms)")
    plt.tight_layout()
    plt.savefig(run / "latency_per_stage.png")
    plt.close()


def main() -> None:
    run = _latest_run_dir()
    print(f"📂 Using latest run: {run.name}")
    summary = _load_summary(run)
    rows = _load_eval_rows(run)
    plot_metrics_overview(run, summary)
    plot_latency_hist(run, rows)
    plot_latency_per_stage(run, summary, rows)
    print(
        "✅ Saved: metrics_overview.png, latency_histogram.png, latency_per_stage.png"
    )


if __name__ == "__main__":
    main()
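
# Typical invocation (illustrative): after an evaluation run,
#   python benchmarks/plot_results.py
# picks the newest benchmarks/results_pro/<timestamp>/ and writes the three PNGs there.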
benchmarks/results_pro/20251109-095552/eval.jsonl
ADDED
@@ -0,0 +1,5 @@
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11661, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8989, "summary": "ok", "notes": {"len_plan": 1451}, "token_in": 270, "token_out": 347, "cost_usd": 0.0002487}, {"stage": "generator", "duration_ms": 977, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 834, "token_out": 19, "cost_usd": 0.00013649999999999998}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 745, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 937, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 9786, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6574, "summary": "ok", "notes": {"len_plan": 1479}, "token_in": 271, "token_out": 343, "cost_usd": 0.00024645}, {"stage": "generator", "duration_ms": 955, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 831, "token_out": 19, "cost_usd": 0.00013605}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 986, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1262, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 8674, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5293, "summary": "ok", "notes": {"len_plan": 1333}, "token_in": 281, "token_out": 305, "cost_usd": 0.00022514999999999997}, {"stage": "generator", "duration_ms": 1510, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 803, "token_out": 37, "cost_usd": 0.00014265}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 857, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 325, "token_out": 21, "cost_usd": 6.135e-05}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1004, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 328, "token_out": 21, "cost_usd": 6.18e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 11247, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7296, "summary": "ok", "notes": {"len_plan": 1578}, "token_in": 279, "token_out": 425, "cost_usd": 0.00029685}, {"stage": "generator", "duration_ms": 1552, "summary": "ok", "notes": {"rationale_len": 67}, "token_in": 921, "token_out": 42, "cost_usd": 0.00016334999999999999}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1222, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 333, "token_out": 24, "cost_usd": 6.435e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1163, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 337, "token_out": 28, "cost_usd": 6.735e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
benchmarks/results_pro/20251109-095552/summary.json
ADDED
@@ -0,0 +1,12 @@
{
  "timestamp": "2025-11-09T09:56:33",
  "total": 5,
  "success": 5,
  "success_rate": 1.0,
  "avg_latency_ms": 8273.6,
  "EM": 0.4,
  "SM": 0.8,
  "ExecAcc": 0.8,
  "split": "dev",
  "config": "configs/sqlite_pipeline.yaml"
}
benchmarks/results_pro/20251109-100021/eval.jsonl
ADDED
@@ -0,0 +1,5 @@
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 9656, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7138, "summary": "ok", "notes": {"len_plan": 1287}, "token_in": 265, "token_out": 303, "cost_usd": 0.00022154999999999996}, {"stage": "generator", "duration_ms": 875, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 785, "token_out": 19, "cost_usd": 0.00012915}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 803, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 829, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11252, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8353, "summary": "ok", "notes": {"len_plan": 1399}, "token_in": 266, "token_out": 330, "cost_usd": 0.00023789999999999998}, {"stage": "generator", "duration_ms": 1048, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 813, "token_out": 19, "cost_usd": 0.00013335}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 794, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1052, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 8517, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5263, "summary": "ok", "notes": {"len_plan": 1304}, "token_in": 276, "token_out": 300, "cost_usd": 0.0002214}, {"stage": "generator", "duration_ms": 1022, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 793, "token_out": 37, "cost_usd": 0.00014115}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 977, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 320, "token_out": 21, "cost_usd": 6.0599999999999996e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1249, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 323, "token_out": 21, "cost_usd": 6.104999999999999e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 15468, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 11390, "summary": "ok", "notes": {"len_plan": 1400}, "token_in": 274, "token_out": 348, "cost_usd": 0.0002499}, {"stage": "generator", "duration_ms": 1252, "summary": "ok", "notes": {"rationale_len": 95}, "token_in": 839, "token_out": 45, "cost_usd": 0.00015285}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1384, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1437, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 332, "token_out": 21, "cost_usd": 6.24e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
benchmarks/results_pro/20251109-100021/results.csv
ADDED
@@ -0,0 +1,6 @@
db_id,query,ok,em,sm,exec_acc,latency_ms
concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,9656
concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,11252
concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,8517
concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,15468
benchmarks/results_pro/20251109-100021/summary.json
ADDED
@@ -0,0 +1,21 @@
{
  "timestamp": "2025-11-09T10:01:06",
  "split": "dev",
  "config": "configs/sqlite_pipeline.yaml",
  "total": 5,
  "success": 5,
  "success_rate": 1.0,
  "avg_latency_ms": 8978.6,
  "p50_latency_ms": 9656.0,
  "p95_latency_ms": 14624.8,
  "EM": 0.4,
  "SM": 0.8,
  "ExecAcc": 0.8,
  "detector_avg_ms": 0.0,
  "planner_avg_ms": 8036.0,
  "generator_avg_ms": 1049.25,
  "safety_avg_ms": 0.33,
  "executor_avg_ms": 0.75,
  "verifier_avg_ms": 0.0,
  "repair_avg_ms": 1065.62
}