Melika Kheirieh committed on
Commit 454d146 · 1 Parent(s): 8103714

fix(grafana): move nl2sql.json into provisioning folder and fix dashboard mount path
benchmarks/evaluate_spider.py CHANGED
@@ -1,73 +1,93 @@
  from __future__ import annotations

  import csv
  import json
  import os
  import time
  from pathlib import Path
  from typing import Any, Dict, List, Optional

- # Reuse existing factories from your FastAPI router (no new DI needed)
- from app.routers.nl2sql import (  # type: ignore
-     _pipeline as DEFAULT_PIPELINE,
-     _build_pipeline,
-     _select_adapter,
- )

- # -------------------- Config --------------------

- DATASET: List[str] = [
      "list all customers",
      "show total invoices per country",
      "top 3 albums by total sales",
      "artists with more than 3 albums",
      "number of employees per city",
  ]

- # DB id/mode follows your router convention; adjust if needed
- DB_ID: str = os.getenv("DB_MODE", "sqlite")
-
- # Results directory with timestamped subfolder (keeps previous runs)
  RESULT_ROOT = Path("benchmarks") / "results"
  TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
  RESULT_DIR = RESULT_ROOT / TIMESTAMP


- # -------------------- Helpers --------------------
-
-
  def _int_ms(start: float) -> int:
      return int((time.perf_counter() - start) * 1000)


  def _derive_schema_preview_safe(pipeline_obj: Any) -> Optional[str]:
-     """
-     Try to derive schema preview from the adapter/executor if such a method exists.
-     Kept intentionally permissive to avoid tight coupling.
-     """
      try:
-         # common places the adapter might live
-         candidates: List[Any] = [
              getattr(pipeline_obj, "executor", None),
              getattr(pipeline_obj, "adapter", None),
          ]
          for c in candidates:
              if c and hasattr(c, "derive_schema_preview"):
-                 return c.derive_schema_preview()  # type: ignore[no-any-return, call-arg]
      except Exception:
          pass
      return None


  def _to_stage_list(trace_obj: Any) -> List[Dict[str, Any]]:
-     """
-     Normalize pipeline trace (list of dataclass or dict) to a list of dicts:
-     [{ "stage": str, "ms": int }, ...]
-     """
-     stages: List[Dict[str, Any]] = []
      if not isinstance(trace_obj, list):
-         return stages
-
      for t in trace_obj:
          if isinstance(t, dict):
              stage = t.get("stage", "?")
@@ -76,126 +96,303 @@ def _to_stage_list(trace_obj: Any) -> List[Dict[str, Any]]:
              stage = getattr(t, "stage", "?")
              ms = getattr(t, "duration_ms", 0)
          try:
-             stages.append({"stage": str(stage), "ms": int(ms)})
          except Exception:
-             stages.append({"stage": str(stage), "ms": 0})
-     return stages


- # -------------------- Main --------------------


- def main() -> None:
      RESULT_DIR.mkdir(parents=True, exist_ok=True)

-     # Build pipeline from router factories (no new DI required)
-     try:
-         adapter = _select_adapter(DB_ID)  # e.g., "sqlite" / "postgres"
-         pipeline = _build_pipeline(adapter)
-         using_default = False
-     except Exception:
-         pipeline = DEFAULT_PIPELINE
-         using_default = True

      print(
-         f"✅ Pipeline ready "
-         f"(db_id={DB_ID}, source={'default' if using_default else 'custom adapter'})"
      )

-     # Optional schema preview
      schema_preview = _derive_schema_preview_safe(pipeline)
      if schema_preview:
          print("📄 Derived schema preview ✓")
      else:
          print("ℹ️ No schema preview (adapter does not expose it or not needed)")

-     # Evaluate
-     records: List[Dict[str, Any]] = []
-     for q in DATASET:
          print(f"\n🧠 Query: {q}")
          t0 = time.perf_counter()
          try:
-             result = pipeline.run(
-                 user_query=q,
-                 schema_preview=schema_preview or "",  # <- force str
              )
-             latency_ms = _int_ms(t0)
-
-             # ok flag -> coerce to bool for mypy and consistency
-             ok_flag = bool(getattr(result, "ok", True))
-             stages = _to_stage_list(getattr(result, "trace", None))
-
-             rec: Dict[str, Any] = {
-                 "query": q,
-                 "ok": ok_flag,
-                 "latency_ms": latency_ms,
-                 "trace": stages,
-                 "error": None,
-             }
-             records.append(rec)
              print(f"✅ Success ({latency_ms} ms)")
          except Exception as exc:
-             latency_ms = _int_ms(t0)
-             rec = {
-                 "query": q,
-                 "ok": False,
-                 "latency_ms": latency_ms,
-                 "trace": [],
-                 "error": str(exc),
-             }
-             records.append(rec)
              print(f"❌ Failed: {exc!s} ({latency_ms} ms)")

-     # Aggregate metrics
-     avg_latency = (
-         round(sum(r["latency_ms"] for r in records) / max(len(records), 1), 1)
-         if records
-         else 0.0
-     )
-     success_rate = (
-         sum(1 for r in records if bool(r.get("ok"))) / max(len(records), 1)
-         if records
-         else 0.0
-     )
-
-     summary: Dict[str, Any] = {
-         "queries_total": len(records),
-         "success_rate": success_rate,
-         "avg_latency_ms": avg_latency,
-         "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-         "db_id": DB_ID,
-         "pipeline_source": "default" if using_default else "adapter",
      }

-     # Persist outputs
-     jsonl_path = RESULT_DIR / "spider_eval.jsonl"
-     with jsonl_path.open("w", encoding="utf-8") as f:
-         for r in records:
-             json.dump(r, f, ensure_ascii=False)
-             f.write("\n")

-     summary_path = RESULT_DIR / "metrics_summary.json"
-     with summary_path.open("w", encoding="utf-8") as f:
-         json.dump(summary, f, indent=2)

-     csv_path = RESULT_DIR / "results.csv"
-     with csv_path.open("w", newline="", encoding="utf-8") as f:
-         writer = csv.DictWriter(f, fieldnames=["query", "ok", "latency_ms"])
-         writer.writeheader()
-         for r in records:
-             writer.writerow(
                  {
-                     "query": r["query"],
-                     "ok": "✅" if bool(r["ok"]) else "❌",
-                     "latency_ms": int(r["latency_ms"]),
                  }
              )

-     print(
-         "\n💾 Saved outputs:\n"
-         f"- {jsonl_path}\n- {summary_path}\n- {csv_path}\n"
-         f"📊 Avg latency: {avg_latency} ms | Success rate: {success_rate:.0%}"
      )


  if __name__ == "__main__":
+ """
+ Lightweight eval runner for two modes:
+   1) Single-DB demo mode (default): run a list of questions against one SQLite DB.
+   2) Spider mode (--spider): load a subset of the Spider dataset and run each question
+      against its own database (resolved via SPIDER_ROOT).
+
+ - Uses your official pipeline factory (no app/router imports).
+ - Works with a real LLM (OPENAI_API_KEY) or stub mode (PYTEST_CURRENT_TEST=1).
+ - Produces JSONL + JSON summary + CSV under benchmarks/results/<timestamp>/
+
+ Examples:
+   # Demo (single DB), stub mode
+   PYTHONPATH=$PWD PYTEST_CURRENT_TEST=1 \
+   python benchmarks/evaluate_spider.py --db-path demo.db
+
+   # Spider subset (20 items), stub mode
+   export SPIDER_ROOT=$PWD/data/spider
+   PYTHONPATH=$PWD PYTEST_CURRENT_TEST=1 \
+   python benchmarks/evaluate_spider.py --spider --split dev --limit 20
+
+ Notes:
+   - In stub mode, all LLM calls are mocked for offline evaluation.
+   - Results are saved under benchmarks/results/<timestamp>/.
+ """
+
  from __future__ import annotations

+ import argparse
  import csv
  import json
  import os
  import time
  from pathlib import Path
  from typing import Any, Dict, List, Optional
+ import sqlite3
+
+ from nl2sql.pipeline_factory import pipeline_from_config_with_adapter
+ from adapters.db.sqlite_adapter import SQLiteAdapter

+ # Only needed in --spider mode
+ try:
+     from benchmarks.spider_loader import load_spider_sqlite, open_readonly_connection
+ except Exception:
+     load_spider_sqlite = None  # type: ignore[assignment]
+     open_readonly_connection = None  # type: ignore[assignment]

+ # Resolve repo root and default config path relative to this file (not CWD)
+ THIS_DIR = Path(__file__).resolve().parent  # .../benchmarks
+ REPO_ROOT = THIS_DIR.parent  # repo root
+ CONFIG_PATH = str(REPO_ROOT / "configs" / "sqlite_pipeline.yaml")

+ DEFAULT_DATASET: List[str] = [
      "list all customers",
      "show total invoices per country",
      "top 3 albums by total sales",
      "artists with more than 3 albums",
      "number of employees per city",
  ]
+ # Back-compat for tests: monkeypatchable dataset at module level
+ DATASET: List[str] = list(DEFAULT_DATASET)

  RESULT_ROOT = Path("benchmarks") / "results"
  TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
  RESULT_DIR = RESULT_ROOT / TIMESTAMP


  def _int_ms(start: float) -> int:
+     """Convert elapsed seconds to integer milliseconds."""
      return int((time.perf_counter() - start) * 1000)


  def _derive_schema_preview_safe(pipeline_obj: Any) -> Optional[str]:
+     """Safely call derive_schema_preview() if available on the adapter/executor."""
      try:
+         candidates = [
              getattr(pipeline_obj, "executor", None),
              getattr(pipeline_obj, "adapter", None),
          ]
          for c in candidates:
              if c and hasattr(c, "derive_schema_preview"):
+                 return c.derive_schema_preview()  # type: ignore[no-any-return]
      except Exception:
          pass
      return None


  def _to_stage_list(trace_obj: Any) -> List[Dict[str, Any]]:
+     """Normalize pipeline trace into a list of dicts for logging/CSV export."""
+     out: List[Dict[str, Any]] = []
      if not isinstance(trace_obj, list):
+         return out
      for t in trace_obj:
          if isinstance(t, dict):
              stage = t.get("stage", "?")
…
              stage = getattr(t, "stage", "?")
              ms = getattr(t, "duration_ms", 0)
          try:
+             out.append({"stage": str(stage), "ms": int(ms)})
          except Exception:
+             out.append({"stage": str(stage), "ms": 0})
+     return out
+
+
+ def _load_dataset_from_file(path: Optional[str]) -> List[str]:
+     """
+     Load dataset questions.
+     Accepts either a list of strings or a list of {"question": "..."} objects.
+     """
+     if not path:
+         # Use module-level DATASET so tests can monkeypatch it
+         return list(DATASET)
+
+     p = Path(path)
+     if not p.exists():
+         raise FileNotFoundError(f"dataset file not found: {p}")
+     data = json.loads(p.read_text(encoding="utf-8"))
+     if isinstance(data, list):
+         if all(isinstance(x, str) for x in data):
+             return list(data)
+         if all(isinstance(x, dict) and "question" in x for x in data):
+             return [str(x["question"]) for x in data]
+     raise ValueError(
+         "Dataset file must be a JSON array of strings or objects with a 'question' field."
+     )


+ def _ensure_demo_db(db_path: Path) -> None:
+     """Create an empty SQLite DB for demo runs if it doesn't exist."""
+     if db_path.exists():
+         return
+     db_path.parent.mkdir(parents=True, exist_ok=True)
+     conn = sqlite3.connect(str(db_path))
+     try:
+         # Keep it minimal; SELECT 1 works without any tables.
+         conn.execute("SELECT 1;")
+     finally:
+         conn.close()


+ def _save_outputs(rows: List[Dict[str, Any]], meta: Dict[str, Any]) -> None:
+     """Persist JSONL + JSON summary + CSV (write both new and legacy filenames)."""
      RESULT_DIR.mkdir(parents=True, exist_ok=True)

+     # Filenames (new + legacy for back-compat with tests)
+     jsonl_path = RESULT_DIR / "eval.jsonl"
+     summary_path = RESULT_DIR / "summary.json"
+     csv_path = RESULT_DIR / "results.csv"
+
+     jsonl_path_legacy = RESULT_DIR / "spider_eval.jsonl"
+     summary_path_legacy = RESULT_DIR / "metrics_summary.json"
+
+     # --- Write JSONL (both names) ---
+     with jsonl_path.open("w", encoding="utf-8") as f:
+         for r in rows:
+             json.dump(r, f, ensure_ascii=False)
+             f.write("\n")
+     # duplicate for the legacy name
+     with jsonl_path_legacy.open("w", encoding="utf-8") as f:
+         for r in rows:
+             json.dump(r, f, ensure_ascii=False)
+             f.write("\n")
+
+     # --- Build summary dict ---
+     summary = {
+         # keep both for compatibility with old tests/consumers
+         "queries_total": len(rows),
+         "total": len(rows),
+         "pipeline_source": meta.get(
+             "pipeline_source", "adapter"
+         ),  # for backward-compat with tests
+         "success_rate": (sum(1 for r in rows if r.get("ok")) / max(len(rows), 1))
+         if rows
+         else 0.0,
+         "avg_latency_ms": (
+             round(sum(int(r.get("latency_ms", 0)) for r in rows) / max(len(rows), 1), 1)
+         )
+         if rows
+         else 0.0,
+         **meta,
+         "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+     }
+
+     # --- Write summary (both names) ---
+     with summary_path.open("w", encoding="utf-8") as f:
+         json.dump(summary, f, indent=2)
+     with summary_path_legacy.open("w", encoding="utf-8") as f:
+         json.dump(summary, f, indent=2)
+
+     # --- Write CSV (single name) ---
+     with csv_path.open("w", newline="", encoding="utf-8") as f:
+         writer = csv.DictWriter(f, fieldnames=["query", "ok", "latency_ms"])
+         writer.writeheader()
+         for r in rows:
+             writer.writerow(
+                 {
+                     "query": r.get("query", ""),
+                     "ok": "✅" if r.get("ok") else "❌",
+                     "latency_ms": int(r.get("latency_ms", 0)),
+                 }
+             )

      print(
+         "\n💾 Saved outputs:\n"
+         f"- {jsonl_path} (and {jsonl_path_legacy})\n"
+         f"- {summary_path} (and {summary_path_legacy})\n"
+         f"- {csv_path}\n"
+         f"📊 Avg latency: {summary['avg_latency_ms']} ms | "
+         f"Success rate: {summary['success_rate']:.0%}\n"
      )

+
+ def _run_single_db_mode(db_path: Path, questions: List[str], config_path: str) -> None:
+     """Evaluate a list of questions against a single SQLite DB."""
+     adapter = SQLiteAdapter(str(db_path))
+     pipeline = pipeline_from_config_with_adapter(config_path, adapter=adapter)
+
      schema_preview = _derive_schema_preview_safe(pipeline)
      if schema_preview:
          print("📄 Derived schema preview ✓")
      else:
          print("ℹ️ No schema preview (adapter does not expose it or not needed)")

+     rows: List[Dict[str, Any]] = []
+     for q in questions:
          print(f"\n🧠 Query: {q}")
          t0 = time.perf_counter()
          try:
+             result = pipeline.run(user_query=q, schema_preview=schema_preview or "")
+             latency_ms = _int_ms(t0) or 1  # clamp to 1 ms for nicer CSV in stub mode
+             stages = _to_stage_list(
+                 getattr(result, "traces", getattr(result, "trace", []))
+             )
+             rows.append(
+                 {
+                     "source": "demo",
+                     "db_id": Path(db_path).stem,
+                     "query": q,
+                     "ok": bool(getattr(result, "ok", True)),
+                     "latency_ms": latency_ms,
+                     "trace": stages,
+                     "error": None,
+                 }
              )
              print(f"✅ Success ({latency_ms} ms)")
          except Exception as exc:
+             latency_ms = _int_ms(t0) or 1
+             rows.append(
+                 {
+                     "source": "demo",
+                     "db_id": Path(db_path).stem,
+                     "query": q,
+                     "ok": False,
+                     "latency_ms": latency_ms,
+                     "trace": [],
+                     "error": str(exc),
+                 }
+             )
              print(f"❌ Failed: {exc!s} ({latency_ms} ms)")

+     meta = {
+         "mode": "single-db",
+         "db_path": str(db_path),
+         "config": config_path,
+         "provider_hint": ("STUBS" if os.getenv("PYTEST_CURRENT_TEST") else "REAL"),
      }
+     _save_outputs(rows, meta)


+ def _run_spider_mode(split: str, limit: int, config_path: str) -> None:
+     """Evaluate a Spider subset. Each example points to its own DB under SPIDER_ROOT."""
+     if load_spider_sqlite is None or open_readonly_connection is None:
+         raise RuntimeError(
+             "Spider utilities are not available. Ensure benchmarks/spider_loader.py exists."
+         )

+     items = load_spider_sqlite(split=split, limit=limit)
+     print(f"🗂 Loaded {len(items)} Spider items (split={split}).")
+
+     rows: List[Dict[str, Any]] = []
+
+     for i, ex in enumerate(items, 1):
+         print(f"\n[{i}] {ex.db_id} :: {ex.question}")
+         adapter = SQLiteAdapter(ex.db_path)
+         pipeline = pipeline_from_config_with_adapter(config_path, adapter=adapter)
+
+         # derive schema per-DB (optional)
+         schema_preview = _derive_schema_preview_safe(pipeline)
+
+         t0 = time.perf_counter()
+         try:
+             result = pipeline.run(
+                 user_query=ex.question, schema_preview=schema_preview or ""
+             )
+             latency_ms = _int_ms(t0) or 1
+             stages = _to_stage_list(
+                 getattr(result, "traces", getattr(result, "trace", []))
+             )
+             rows.append(
+                 {
+                     "source": "spider",
+                     "db_id": ex.db_id,
+                     "query": ex.question,
+                     "ok": bool(getattr(result, "ok", True)),
+                     "latency_ms": latency_ms,
+                     "trace": stages,
+                     "error": None,
+                 }
+             )
+             print(f"✅ Success ({latency_ms} ms)")
+         except Exception as exc:
+             latency_ms = _int_ms(t0) or 1
+             rows.append(
                  {
+                     "source": "spider",
+                     "db_id": ex.db_id,
+                     "query": ex.question,
+                     "ok": False,
+                     "latency_ms": latency_ms,
+                     "trace": [],
+                     "error": str(exc),
                  }
              )
+             print(f"❌ Failed: {exc!s} ({latency_ms} ms)")

+     meta = {
+         "mode": "spider",
+         "split": split,
+         "limit": limit,
+         "config": config_path,
+         "provider_hint": ("STUBS" if os.getenv("PYTEST_CURRENT_TEST") else "REAL"),
+         "spider_root": os.getenv("SPIDER_ROOT", ""),
+     }
+     _save_outputs(rows, meta)
+
+
+ def main() -> None:
+     ap = argparse.ArgumentParser()
+     ap.add_argument(
+         "--spider",
+         action="store_true",
+         help="Enable Spider mode (reads from SPIDER_ROOT; ignores --db-path).",
      )
+     ap.add_argument(
+         "--split",
+         type=str,
+         default="dev",
+         choices=["dev", "train"],
+         help="Spider split to use (default: dev).",
+     )
+     ap.add_argument(
+         "--limit",
+         type=int,
+         default=20,
+         help="Number of Spider items to evaluate (default: 20).",
+     )
+
+     ap.add_argument(
+         "--db-path",
+         type=str,
+         default="demo.db",
+         help="Path to the SQLite database file (single-DB mode).",
+     )
+     ap.add_argument(
+         "--dataset-file",
+         type=str,
+         default=None,
+         help="Optional JSON file with questions (single-DB mode).",
+     )
+     ap.add_argument(
+         "--config",
+         type=str,
+         default=CONFIG_PATH,
+         help=f"Pipeline YAML config (default: {CONFIG_PATH})",
+     )
+     args, _unknown = ap.parse_known_args()
+
+     if args.spider:
+         # Spider mode: read items from SPIDER_ROOT and evaluate per-DB
+         if not os.getenv("SPIDER_ROOT"):
+             raise RuntimeError(
+                 "SPIDER_ROOT is not set. It must point to the folder that contains "
+                 "dev.json/train_spider.json and the database/ directory."
+             )
+         _run_spider_mode(args.split, args.limit, args.config)
+     else:
+         # Single-DB demo mode
+         db_path = Path(args.db_path).resolve()
+         # Auto-create the demo DB for test/smoke runs; otherwise keep the strict check
+         if db_path.name == "demo.db":
+             _ensure_demo_db(db_path)
+         elif not db_path.exists():
+             raise FileNotFoundError(f"SQLite DB not found: {db_path}")
+         questions = _load_dataset_from_file(args.dataset_file)
+         _run_single_db_mode(db_path, questions, args.config)


  if __name__ == "__main__":
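
A quick note on the dataset-file contract added above: `_load_dataset_from_file` accepts either a JSON array of strings or an array of objects with a "question" field. A minimal sketch of both shapes (the `questions.json` path is purely illustrative, and it assumes the repo root is on PYTHONPATH so `benchmarks.evaluate_spider` and its dependencies import cleanly):

    import json
    from pathlib import Path

    from benchmarks.evaluate_spider import _load_dataset_from_file

    # Shape 1: a plain JSON array of strings ("questions.json" is a hypothetical path)
    Path("questions.json").write_text(
        json.dumps(["list all customers", "number of employees per city"]),
        encoding="utf-8",
    )
    print(_load_dataset_from_file("questions.json"))
    # ['list all customers', 'number of employees per city']

    # Shape 2: an array of {"question": ...} objects
    Path("questions.json").write_text(
        json.dumps([{"question": "top 3 albums by total sales"}]), encoding="utf-8"
    )
    print(_load_dataset_from_file("questions.json"))
    # ['top 3 albums by total sales']

Passing the file through `--dataset-file questions.json` in single-DB mode would then evaluate exactly those questions instead of the module-level DATASET.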
benchmarks/evaluate_spider_pro.py CHANGED
@@ -1,18 +1,38 @@
  """
- Full benchmark for NL2SQL pipeline.

- Metrics:
- - EM (exact match)
- - Structural Match (sqlglot AST)
- - Execution Accuracy
- - Safety consistency (pipeline vs AST)
- - Latency (end-to-end) + per-stage trace (via pipeline if available)

- Outputs:
-   JSONL (logs), JSON (summary), CSV (compact table)

- Run example:
-   python benchmarks/evaluate_spider_pro.py --limit 10 --sleep 0.1 --db sqlite --adapter data/chinook.db
  """

  from __future__ import annotations
@@ -20,71 +40,71 @@ from __future__ import annotations
  import argparse
  import csv
  import json
- import sqlite3
  import time
  from pathlib import Path
- from typing import Any, Dict, List, Optional, cast

  import sqlglot
  from sqlglot.errors import ParseError

- # Reuse existing factories from FastAPI router (no new DI needed)
- from app.routers.nl2sql import (  # type: ignore
-     _pipeline as DEFAULT_PIPELINE,
-     _build_pipeline,
-     _select_adapter,
- )
- from nl2sql.safety import Safety


- # -------------------- Helpers --------------------


- def _int_ms(start: float) -> int:
-     return int((time.perf_counter() - start) * 1000)

-
- def _parse_sql(sql: str) -> Optional[sqlglot.Expression]:
-     try:
-         return sqlglot.parse_one(sql, read="sqlite")
-     except ParseError:
-         return None


- def _is_structural_match(sql1: str, sql2: str) -> bool:
-     a, b = _parse_sql(sql1), _parse_sql(sql2)
-     return (a == b) if (a is not None and b is not None) else False


- def _exec_sql(conn: sqlite3.Connection, sql: str) -> List[tuple]:
-     try:
-         cur = conn.execute(sql)
-         return [tuple(r) for r in cur.fetchall()]
-     except Exception:
-         return []


  def _derive_schema_preview_safe(pipeline_obj: Any) -> Optional[str]:
-     for attr in ("executor", "adapter"):
-         obj = getattr(pipeline_obj, attr, None)
-         if obj is not None and hasattr(obj, "derive_schema_preview"):
-             try:
-                 # type: ignore[no-any-return]
-                 return obj.derive_schema_preview()  # pragma: no cover
-             except Exception:
-                 pass
      return None


  def _to_stage_list(trace_obj: Any) -> List[Dict[str, Any]]:
-     """
-     Normalize pipeline trace (list of dataclass or dict) to:
-     [{'stage': str, 'ms': int}, ...]
-     """
-     stages: List[Dict[str, Any]] = []
      if not isinstance(trace_obj, list):
-         return stages
-
      for t in trace_obj:
          if isinstance(t, dict):
              stage = t.get("stage", "?")
@@ -93,216 +113,377 @@ def _to_stage_list(trace_obj: Any) -> List[Dict[str, Any]]:
              stage = getattr(t, "stage", "?")
              ms = getattr(t, "duration_ms", 0)
          try:
-             stages.append({"stage": str(stage), "ms": int(ms)})
          except Exception:
-             stages.append({"stage": str(stage), "ms": 0})
-     return stages


- # -------------------- Main --------------------


- def main() -> None:
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--limit", type=int, default=10, help="Max number of examples")
-     parser.add_argument("--resume", type=int, default=0, help="Skip first N examples")
-     parser.add_argument(
-         "--sleep", type=float, default=0.0, help="Delay (seconds) between queries"
      )
-     parser.add_argument(
-         "--split", type=str, default="test", help="Dataset split (placeholder)"
      )
-     parser.add_argument(
-         "--db", type=str, default="sqlite", help="Database ID (e.g., sqlite/postgres)"
      )
-     parser.add_argument(
-         "--adapter",
-         type=str,
-         default="data/chinook.db",
-         help="SQLite file path for local eval",
      )
-     args = parser.parse_args()

-     # SQLite connection for execution-accuracy
-     conn = sqlite3.connect(args.adapter)

-     # Build pipeline from router factories
-     try:
-         adapter = _select_adapter(args.db)
-         pipeline = _build_pipeline(adapter)
-         using_default = False
-     except Exception:
-         pipeline = DEFAULT_PIPELINE
-         using_default = True

-     safety = Safety()
-     schema_preview = _derive_schema_preview_safe(pipeline)
-     print(f"✅ Pipeline ready (db={args.db}, default={using_default})")
-
-     # Minimal sample dataset for demonstration; replace with real Spider subset if available
-     DATASET: List[Dict[str, Any]] = [
-         {
-             "id": 1,
-             "question": "list all customers",
-             "gold_sql": "SELECT * FROM customers;",
-         },
-         {
-             "id": 2,
-             "question": "top 3 albums by total sales",
-             "gold_sql": """
-                 SELECT a.Title, SUM(i.Quantity * i.UnitPrice) AS total
-                 FROM albums a
-                 JOIN tracks t ON a.AlbumId = t.AlbumId
-                 JOIN invoice_items i ON t.TrackId = i.TrackId
-                 GROUP BY a.AlbumId
-                 ORDER BY total DESC
-                 LIMIT 3;
-             """,
-         },
-         {
-             "id": 3,
-             "question": "number of employees per city",
-             "gold_sql": """
-                 SELECT City, COUNT(*) AS cnt
-                 FROM employees
-                 GROUP BY City
-                 ORDER BY cnt DESC;
-             """,
-         },
-     ]

-     sliced = DATASET[args.resume : args.resume + args.limit]

-     # Eval loop
-     results: List[Dict[str, Any]] = []
-     for idx, ex in enumerate(sliced, start=1):
-         qid = cast(int, ex.get("id", idx))
-         q: str = cast(str, ex.get("question", ""))
-         gold_sql: str = cast(str, ex.get("gold_sql", "")).strip()
-         print(f"\n[{idx}] {q}")

          t0 = time.perf_counter()
          try:
-             out = pipeline.run(user_query=q, schema_preview=(schema_preview or ""))  # type: ignore[misc]
-             latency = _int_ms(t0)
-
-             # Safely extract predicted SQL:
-             sql_pred_obj = getattr(out, "sql", None)
-             if sql_pred_obj is None:
-                 data_obj = getattr(out, "data", None)
-                 if data_obj is not None:
-                     sql_pred_obj = getattr(data_obj, "sql", None)
-
-             sql_pred: str = str(sql_pred_obj) if sql_pred_obj is not None else ""
-             if not sql_pred.strip():
-                 raise ValueError("No SQL generated")
-
-             # Metrics
-             em = sql_pred.strip().lower() == gold_sql.strip().lower()
-             sm = _is_structural_match(sql_pred, gold_sql)
-
-             safe_ast = safety.check(sql_pred)  # pipeline has its own safety as well
-             safe_pipeline = bool(getattr(out, "ok", True))
-             safety_consistent = safe_ast.ok == safe_pipeline
-
-             gold_exec = _exec_sql(conn, gold_sql)
-             pred_exec = _exec_sql(conn, sql_pred)
-             exec_acc = gold_exec == pred_exec

-             stages = _to_stage_list(getattr(out, "trace", None))

-             results.append(
                  {
-                     "id": qid,
-                     "question": q,
                      "sql_pred": sql_pred,
                      "sql_gold": gold_sql,
                      "em": em,
                      "sm": sm,
                      "exec_acc": exec_acc,
-                     "safety_consistent": safety_consistent,
-                     "latency_ms": latency,
                      "trace": stages,
                      "error": None,
                  }
              )
-             print(f"✅ OK | EM={em} | SM={sm} | Exec={exec_acc} | {latency} ms")
-
-         except Exception as e:
-             latency = _int_ms(t0)
-             results.append(
                  {
-                     "id": qid,
-                     "question": q,
                      "sql_pred": None,
-                     "sql_gold": gold_sql,
                      "em": False,
                      "sm": False,
                      "exec_acc": False,
-                     "safety_consistent": None,
-                     "latency_ms": latency,
                      "trace": [],
-                     "error": str(e),
                  }
              )
-             print(f"❌ Fail ({latency} ms): {e}")
-         time.sleep(args.sleep)
-
-     # Summary
-     total = len(results)
-     avg_latency = round(sum(r["latency_ms"] for r in results) / max(total, 1), 1)
-     em_rate = (sum(1 for r in results if r["em"]) / max(total, 1)) if total else 0.0
-     sm_rate = (sum(1 for r in results if r["sm"]) / max(total, 1)) if total else 0.0
-     exec_acc_rate = (
-         (sum(1 for r in results if r["exec_acc"]) / max(total, 1)) if total else 0.0
      )

-     summary: Dict[str, Any] = {
          "total": total,
          "avg_latency_ms": avg_latency,
-         "EM": em_rate,
-         "SM": sm_rate,
-         "ExecAcc": exec_acc_rate,
          "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-         "db": args.db,
-         "using_default_pipeline": using_default,
      }

-     # Persist outputs (timestamped dir)
-     out_dir = Path("benchmarks") / "results_pro" / time.strftime("%Y%m%d-%H%M%S")
-     out_dir.mkdir(parents=True, exist_ok=True)

-     jsonl_path = out_dir / "spider_eval_pro.jsonl"
-     with jsonl_path.open("w", encoding="utf-8") as f:
-         for r in results:
-             json.dump(r, f, ensure_ascii=False)
-             f.write("\n")

-     json_path = out_dir / "summary.json"
-     with json_path.open("w", encoding="utf-8") as f:
-         json.dump(summary, f, indent=2)

-     csv_path = out_dir / "summary.csv"
-     with csv_path.open("w", newline="", encoding="utf-8") as f:
-         writer = csv.DictWriter(
-             f,
-             fieldnames=["id", "question", "em", "sm", "exec_acc", "latency_ms"],
-         )
-         writer.writeheader()
-         for r in results:
-             writer.writerow(
-                 {
-                     "id": r["id"],
-                     "question": r["question"],
-                     "em": "✅" if r["em"] else "❌",
-                     "sm": "✅" if r["sm"] else "❌",
-                     "exec_acc": "✅" if r["exec_acc"] else "❌",
-                     "latency_ms": r["latency_ms"],
-                 }
-             )

-     print("\n📊 Summary:", json.dumps(summary, indent=2))
-     print(f"💾 Saved to:\n- {jsonl_path}\n- {json_path}\n- {csv_path}")


  if __name__ == "__main__":
 
  """
+ Pro evaluation runner: an extension of `evaluate_spider.py` with additional metrics
+ (EM, SM, ExecAcc) and richer logging for research-style benchmarking. Two modes:

+ 1) Single-DB demo mode (default)
+    - Runs a list of questions against one SQLite DB
+    - Reports latency/ok (no EM/SM/ExecAcc because there's no gold SQL)

+ 2) Spider mode (--spider)
+    - Loads a subset of the Spider dataset via SPIDER_ROOT
+    - For each item, builds a per-DB pipeline and computes:
+        * EM (exact SQL string match, case-insensitive)
+        * SM (structural match via sqlglot AST)
+        * ExecAcc (result equivalence by executing gold vs. predicted SQL)
+    - Also logs latency, (optional) traces, and aggregates a summary
+
+ Works with:
+   - a real LLM (OPENAI_API_KEY set)
+   - stub mode (PYTEST_CURRENT_TEST=1) for zero-cost offline runs

+ Outputs:
+   benchmarks/results_pro/<timestamp>/
+     - eval.jsonl    # per-sample rows
+     - summary.json  # aggregate metrics
+     - results.csv   # human-friendly table
+
+ Examples:
+   # Demo (single DB), stub mode
+   PYTHONPATH=$PWD PYTEST_CURRENT_TEST=1 \
+   python benchmarks/evaluate_spider_pro.py --db-path demo.db
+
+   # Spider subset (20 items), stub mode
+   export SPIDER_ROOT=$PWD/data/spider
+   PYTHONPATH=$PWD PYTEST_CURRENT_TEST=1 \
+   python benchmarks/evaluate_spider_pro.py --spider --split dev --limit 20
  """

  from __future__ import annotations

  import argparse
  import csv
  import json
+ import os
  import time
  from pathlib import Path
+ from typing import Any, Dict, List, Optional

  import sqlglot
  from sqlglot.errors import ParseError

+ from nl2sql.pipeline_factory import pipeline_from_config_with_adapter
+ from adapters.db.sqlite_adapter import SQLiteAdapter

+ # Only needed for Spider mode
+ try:
+     from benchmarks.spider_loader import load_spider_sqlite, open_readonly_connection
+ except Exception:
+     load_spider_sqlite = None  # type: ignore[assignment]
+     open_readonly_connection = None  # type: ignore[assignment]

+ # Resolve repo root and default config path relative to this file (not CWD)
+ THIS_DIR = Path(__file__).resolve().parent  # .../benchmarks
+ REPO_ROOT = THIS_DIR.parent  # repo root
+ CONFIG_PATH = str(REPO_ROOT / "configs" / "sqlite_pipeline.yaml")


+ # Default demo questions for single-DB mode
+ DEFAULT_DATASET: List[str] = [
+     "list all customers",
+     "show total invoices per country",
+     "top 3 albums by total sales",
+     "artists with more than 3 albums",
+     "number of employees per city",
+ ]

+ RESULT_ROOT = Path("benchmarks") / "results_pro"
+ TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
+ RESULT_DIR = RESULT_ROOT / TIMESTAMP


+ # -------------------- Utilities --------------------


+ def _int_ms(start: float) -> int:
+     """Convert elapsed seconds to integer milliseconds."""
+     return int((time.perf_counter() - start) * 1000)


  def _derive_schema_preview_safe(pipeline_obj: Any) -> Optional[str]:
+     """Safely call derive_schema_preview() if available on the adapter/executor."""
+     try:
+         for c in (
+             getattr(pipeline_obj, "executor", None),
+             getattr(pipeline_obj, "adapter", None),
+         ):
+             if c and hasattr(c, "derive_schema_preview"):
+                 return c.derive_schema_preview()  # type: ignore[no-any-return]
+     except Exception:
+         pass
      return None


  def _to_stage_list(trace_obj: Any) -> List[Dict[str, Any]]:
+     """Normalize pipeline trace into a list of dicts for logging/export."""
+     out: List[Dict[str, Any]] = []
      if not isinstance(trace_obj, list):
+         return out
      for t in trace_obj:
          if isinstance(t, dict):
              stage = t.get("stage", "?")
…
              stage = getattr(t, "stage", "?")
              ms = getattr(t, "duration_ms", 0)
          try:
+             out.append({"stage": str(stage), "ms": int(ms)})
          except Exception:
+             out.append({"stage": str(stage), "ms": 0})
+     return out


+ def _parse_sql(sql: str):
+     try:
+         return sqlglot.parse_one(sql, read="sqlite")
+     except ParseError:
+         return None


+ def _structural_match(pred: str, gold: str) -> bool:
+     """AST-level equality via sqlglot; returns False if either side can't be parsed."""
+     a, b = _parse_sql(pred), _parse_sql(gold)
+     return (a == b) if (a is not None and b is not None) else False
+
+
+ def _load_dataset_from_file(path: Optional[str]) -> List[str]:
+     """Load questions from a JSON file: list[str] or list[{question: str}]."""
+     if not path:
+         return DEFAULT_DATASET
+     p = Path(path)
+     if not p.exists():
+         raise FileNotFoundError(f"dataset file not found: {p}")
+     data = json.loads(p.read_text(encoding="utf-8"))
+     if isinstance(data, list):
+         if all(isinstance(x, str) for x in data):
+             return list(data)
+         if all(isinstance(x, dict) and "question" in x for x in data):
+             return [str(x["question"]) for x in data]
+     raise ValueError(
+         "Dataset file must be a JSON array of strings or objects with a 'question' field."
      )


+ def _extract_sql(result: Any) -> str:
+     """
+     Extract SQL from the pipeline result in a mypy-friendly way.
+     Supports both result.sql and result.data.sql shapes.
+     """
+     sql_pred: Optional[str] = getattr(result, "sql", None)
+     if not sql_pred:
+         data = getattr(result, "data", None)
+         if data is not None:
+             sql_pred = getattr(data, "sql", None)
+     return (sql_pred or "").strip()


+ def _save_outputs(rows: List[Dict[str, Any]], summary: Dict[str, Any]) -> None:
+     """Persist JSONL + JSON summary + CSV for the pro runner."""
+     RESULT_DIR.mkdir(parents=True, exist_ok=True)
+
+     jsonl_path = RESULT_DIR / "eval.jsonl"
+     with jsonl_path.open("w", encoding="utf-8") as f:
+         for r in rows:
+             f.write(json.dumps(r, ensure_ascii=False) + "\n")
+
+     with (RESULT_DIR / "summary.json").open("w", encoding="utf-8") as f:
+         json.dump(summary, f, indent=2)
+
+     csv_path = RESULT_DIR / "results.csv"
+     # For pro, include the pro columns when present (Spider mode)
+     fieldnames = [
+         "source",
+         "db_id",
+         "query",
+         "em",
+         "sm",
+         "exec_acc",
+         "ok",
+         "latency_ms",
+     ]
+     with csv_path.open("w", newline="", encoding="utf-8") as f:
+         wr = csv.DictWriter(f, fieldnames=fieldnames)
+         wr.writeheader()
+         for r in rows:
+             wr.writerow(
+                 {
+                     "source": r.get("source", "demo"),
+                     "db_id": r.get("db_id", ""),
+                     "query": r.get("query", ""),
+                     "em": ("✅" if r.get("em") else "❌") if "em" in r else "",
+                     "sm": ("✅" if r.get("sm") else "❌") if "sm" in r else "",
+                     "exec_acc": ("✅" if r.get("exec_acc") else "❌")
+                     if "exec_acc" in r
+                     else "",
+                     "ok": "✅" if r.get("ok") else "❌",
+                     "latency_ms": int(r.get("latency_ms", 0)),
+                 }
+             )
+
+     print(
+         "\n💾 Saved outputs:\n"
+         f"- {jsonl_path}\n- {RESULT_DIR / 'summary.json'}\n- {csv_path}\n"
+         f"📊 Avg latency: {summary.get('avg_latency_ms', 0.0)} ms "
+         f"| EM: {summary.get('EM', 0.0):.3f} "
+         f"| SM: {summary.get('SM', 0.0):.3f} "
+         f"| ExecAcc: {summary.get('ExecAcc', 0.0):.3f} "
+         f"| Success: {summary.get('success_rate', 0.0):.0%}\n"
      )


+ # -------------------- Runners --------------------


+ def _run_single_db_mode(db_path: Path, questions: List[str], config_path: str) -> None:
+     """
+     Single-DB demo mode.
+     Only latency/ok is reported (no EM/SM/ExecAcc, because we don't have gold SQL).
+     """
+     adapter = SQLiteAdapter(str(db_path))
+     pipeline = pipeline_from_config_with_adapter(config_path, adapter=adapter)
+
+     schema_preview = _derive_schema_preview_safe(pipeline)
+     if schema_preview:
+         print("📄 Derived schema preview ✓")
+     else:
+         print("ℹ️ No schema preview (adapter does not expose it or not needed)")
+
+     rows: List[Dict[str, Any]] = []
+     for q in questions:
+         print(f"\n🧠 Query: {q}")
+         t0 = time.perf_counter()
+         try:
+             result = pipeline.run(user_query=q, schema_preview=schema_preview or "")
+             latency_ms = _int_ms(t0) or 1  # clamp to 1 ms for nicer CSV in stub mode
+             stages = _to_stage_list(
+                 getattr(result, "traces", getattr(result, "trace", []))
+             )
+             rows.append(
+                 {
+                     "source": "demo",
+                     "db_id": Path(db_path).stem,
+                     "query": q,
+                     "ok": bool(getattr(result, "ok", True)),
+                     "latency_ms": latency_ms,
+                     "trace": stages,
+                     "error": None,
+                 }
+             )
+             print(f"✅ Success ({latency_ms} ms)")
+         except Exception as exc:
+             latency_ms = _int_ms(t0) or 1
+             rows.append(
+                 {
+                     "source": "demo",
+                     "db_id": Path(db_path).stem,
+                     "query": q,
+                     "ok": False,
+                     "latency_ms": latency_ms,
+                     "trace": [],
+                     "error": str(exc),
+                 }
+             )
+             print(f"❌ Failed: {exc!s} ({latency_ms} ms)")
+
+     success_rate = (
+         (sum(1 for r in rows if r.get("ok")) / max(len(rows), 1)) if rows else 0.0
      )
+     avg_latency = (
+         round(sum(int(r.get("latency_ms", 0)) for r in rows) / max(len(rows), 1), 1)
+         if rows
+         else 0.0
      )
+     summary = {
+         "mode": "single-db",
+         "db_path": str(db_path),
+         "config": config_path,
+         "provider_hint": ("STUBS" if os.getenv("PYTEST_CURRENT_TEST") else "REAL"),
+         "total": len(rows),
+         "EM": 0.0,
+         "SM": 0.0,
+         "ExecAcc": 0.0,  # not applicable in demo
+         "success_rate": success_rate,
+         "avg_latency_ms": avg_latency,
+         "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+     }
+     _save_outputs(rows, summary)


+ def _run_spider_mode(split: str, limit: int, config_path: str) -> None:
+     """
+     Spider mode: compute EM/SM/ExecAcc with per-DB pipelines.
+     Requires SPIDER_ROOT pointing to a folder that contains dev.json/train_spider.json and database/.
+     """
+     if load_spider_sqlite is None or open_readonly_connection is None:
+         raise RuntimeError(
+             "Spider utilities are not available. Ensure benchmarks/spider_loader.py exists."
+         )

+     items = load_spider_sqlite(split=split, limit=limit)
+     print(f"🗂 Loaded {len(items)} Spider items (split={split}).")
+
+     rows: List[Dict[str, Any]] = []

+     for i, ex in enumerate(items, 1):
+         print(f"\n[{i}] {ex.db_id} :: {ex.question}")
+         adapter = SQLiteAdapter(ex.db_path)
+         pipeline = pipeline_from_config_with_adapter(config_path, adapter=adapter)

+         # Optional schema preview per DB
+         schema_preview = _derive_schema_preview_safe(pipeline)
+
+         # Open a read-only connection for ExecAcc computation
+         conn = open_readonly_connection(ex.db_path)

          t0 = time.perf_counter()
          try:
+             result = pipeline.run(
+                 user_query=ex.question, schema_preview=schema_preview or ""
+             )
+             latency_ms = _int_ms(t0) or 1
+             stages = _to_stage_list(
+                 getattr(result, "traces", getattr(result, "trace", []))
+             )
+
+             # Extract predicted SQL from the result (supports both .sql and .data.sql)
+             sql_pred = _extract_sql(result)
+
+             # Pro metrics
+             gold_sql = ex.gold_sql.strip()
+             em = (sql_pred.lower() == gold_sql.lower()) if sql_pred else False
+             sm = _structural_match(sql_pred, gold_sql) if sql_pred else False

+             try:
+                 gold_exec = conn.execute(gold_sql).fetchall()
+             except Exception:
+                 gold_exec = []
+             try:
+                 pred_exec = conn.execute(sql_pred).fetchall() if sql_pred else []
+             except Exception:
+                 pred_exec = []
+             exec_acc = gold_exec == pred_exec

+             rows.append(
                  {
+                     "source": "spider",
+                     "db_id": ex.db_id,
+                     "query": ex.question,
                      "sql_pred": sql_pred,
                      "sql_gold": gold_sql,
                      "em": em,
                      "sm": sm,
                      "exec_acc": exec_acc,
+                     "ok": bool(getattr(result, "ok", True)),
+                     "latency_ms": latency_ms,
                      "trace": stages,
                      "error": None,
                  }
              )
+             print(f"✅ OK | EM={em} | SM={sm} | Exec={exec_acc} | {latency_ms} ms")
+         except Exception as exc:
+             latency_ms = _int_ms(t0) or 1
+             rows.append(
                  {
+                     "source": "spider",
+                     "db_id": ex.db_id,
+                     "query": ex.question,
                      "sql_pred": None,
+                     "sql_gold": ex.gold_sql,
                      "em": False,
                      "sm": False,
                      "exec_acc": False,
+                     "ok": False,
+                     "latency_ms": latency_ms,
                      "trace": [],
+                     "error": str(exc),
                  }
              )
+             print(f"❌ Fail: {exc!s} ({latency_ms} ms)")
+         finally:
+             try:
+                 conn.close()
+             except Exception:
+                 pass
+
+     # Aggregate pro metrics
+     total = len(rows)
+     em_rate = (sum(1 for r in rows if r.get("em")) / max(total, 1)) if rows else 0.0
+     sm_rate = (sum(1 for r in rows if r.get("sm")) / max(total, 1)) if rows else 0.0
+     exec_rate = (
+         (sum(1 for r in rows if r.get("exec_acc")) / max(total, 1)) if rows else 0.0
+     )
+     success_rate = (
+         (sum(1 for r in rows if r.get("ok")) / max(total, 1)) if rows else 0.0
+     )
+     avg_latency = (
+         round(sum(int(r.get("latency_ms", 0)) for r in rows) / max(total, 1), 1)
+         if rows
+         else 0.0
      )

+     summary = {
+         "mode": "spider",
+         "split": split,
+         "limit": limit,
+         "config": config_path,
+         "provider_hint": ("STUBS" if os.getenv("PYTEST_CURRENT_TEST") else "REAL"),
+         "spider_root": os.getenv("SPIDER_ROOT", ""),
          "total": total,
+         "EM": round(em_rate, 3),
+         "SM": round(sm_rate, 3),
+         "ExecAcc": round(exec_rate, 3),
+         "success_rate": success_rate,
          "avg_latency_ms": avg_latency,
          "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
      }
+     _save_outputs(rows, summary)


+ # -------------------- CLI --------------------


+ def main() -> None:
+     ap = argparse.ArgumentParser()
+     ap.add_argument(
+         "--spider",
+         action="store_true",
+         help="Enable Spider mode (reads from SPIDER_ROOT; ignores --db-path).",
+     )
+     ap.add_argument(
+         "--split",
+         type=str,
+         default="dev",
+         choices=["dev", "train"],
+         help="Spider split to use (default: dev).",
+     )
+     ap.add_argument(
+         "--limit",
+         type=int,
+         default=20,
+         help="Number of Spider items to evaluate (default: 20).",
+     )

+     ap.add_argument(
+         "--db-path",
+         type=str,
+         default="demo.db",
+         help="Path to the SQLite database file (single-DB mode).",
+     )
+     ap.add_argument(
+         "--dataset-file",
+         type=str,
+         default=None,
+         help="Optional JSON file with questions (single-DB mode).",
+     )
+     ap.add_argument(
+         "--config",
+         type=str,
+         default=CONFIG_PATH,
+         help=f"Pipeline YAML config (default: {CONFIG_PATH})",
+     )
+     args = ap.parse_args()
+
+     if args.spider:
+         if not os.getenv("SPIDER_ROOT"):
+             raise RuntimeError(
+                 "SPIDER_ROOT is not set. It must point to the folder that directly contains "
+                 "dev.json/train_spider.json and the database/ directory."
+             )
+         _run_spider_mode(args.split, args.limit, args.config)
+     else:
+         db_path = Path(args.db_path).resolve()
+         if not db_path.exists():
+             raise FileNotFoundError(f"SQLite DB not found: {db_path}")
+         questions = _load_dataset_from_file(args.dataset_file)
+         _run_single_db_mode(db_path, questions, args.config)


  if __name__ == "__main__":
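
To make the EM/SM distinction above concrete: EM compares the raw SQL strings (lower-cased), while `_structural_match` compares sqlglot ASTs, so it tolerates whitespace and keyword-case differences but not different identifiers or shapes. A standalone sketch (the sample queries are invented for illustration):

    import sqlglot

    gold = "SELECT count(*) FROM singer"
    pred = "select   COUNT( * )   from singer"

    # EM: lower-cased string equality, as in _run_spider_mode; extra whitespace breaks it
    em = pred.strip().lower() == gold.strip().lower()

    # SM: AST equality, as in _structural_match; whitespace and keyword case don't matter
    sm = sqlglot.parse_one(pred, read="sqlite") == sqlglot.parse_one(gold, read="sqlite")

    print(em, sm)  # False True

ExecAcc is looser still: two syntactically different queries count as equivalent whenever they return identical result sets on the target database.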
benchmarks/results/20251108-110451/eval.jsonl ADDED
@@ -0,0 +1,20 @@
+ {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age for all French singers?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "Show the name and the release year of the song by the youngest singer.", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the names and release years for all the songs of the youngest singer?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What are all distinct countries where singers above age 20 are from?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the different countries with singers above age 20?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "Show all countries and the number of singers in each country.", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "How many singers are from each country?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "List all song names by singers above the average age.", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What are all the song names by singers who are older than average?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "Show location and name for all stadiums with a capacity between 5000 and 10000.", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the locations and names of all stations with capacity between 5000 and 10000?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the maximum capacity and the average of all stadiums ?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the average and maximum capacities for all stadiums ?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the name and capacity for the stadium with highest average attendance?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the name and capacity for the stadium with the highest average attendance?", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
benchmarks/results/20251108-110451/results.csv ADDED
@@ -0,0 +1,21 @@
+ source,db_id,query,ok,latency_ms
+ spider,concert_singer,How many singers do we have?,✅,1
+ spider,concert_singer,What is the total number of singers?,✅,1
+ spider,concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,1
+ spider,concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,1
+ spider,concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,1
+ spider,concert_singer,"What is the average, minimum, and maximum age for all French singers?",✅,1
+ spider,concert_singer,Show the name and the release year of the song by the youngest singer.,✅,1
+ spider,concert_singer,What are the names and release years for all the songs of the youngest singer?,✅,1
+ spider,concert_singer,What are all distinct countries where singers above age 20 are from?,✅,1
+ spider,concert_singer,What are the different countries with singers above age 20?,✅,1
+ spider,concert_singer,Show all countries and the number of singers in each country.,✅,1
+ spider,concert_singer,How many singers are from each country?,✅,1
+ spider,concert_singer,List all song names by singers above the average age.,✅,1
+ spider,concert_singer,What are all the song names by singers who are older than average?,✅,1
+ spider,concert_singer,Show location and name for all stadiums with a capacity between 5000 and 10000.,✅,1
+ spider,concert_singer,What are the locations and names of all stations with capacity between 5000 and 10000?,✅,1
+ spider,concert_singer,What is the maximum capacity and the average of all stadiums ?,✅,1
+ spider,concert_singer,What is the average and maximum capacities for all stadiums ?,✅,1
+ spider,concert_singer,What is the name and capacity for the stadium with highest average attendance?,✅,1
+ spider,concert_singer,What is the name and capacity for the stadium with the highest average attendance?,✅,1
benchmarks/results/20251108-110451/summary.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total": 20,
3
+ "success_rate": 1.0,
4
+ "avg_latency_ms": 1.0,
5
+ "mode": "spider",
6
+ "split": "dev",
7
+ "limit": 20,
8
+ "config": "configs/sqlite_pipeline.yaml",
9
+ "provider_hint": "STUBS",
10
+ "spider_root": "/Users/melikakheirieh/Desktop/my/career-developement/LLM/nl2sql-copilot/data/spider",
11
+ "timestamp": "2025-11-08 11:04:51"
12
+ }
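Note: the fields in this summary can be re-derived from the per-query records in results.jsonl above. A minimal sketch, assuming the record layout shown in that file (the helper below is illustrative, not part of this repo):

import json
from pathlib import Path


def summarize(jsonl_path: str) -> dict:
    """Recompute total / success_rate / avg_latency_ms from a results JSONL."""
    lines = Path(jsonl_path).read_text(encoding="utf-8").splitlines()
    rows = [json.loads(ln) for ln in lines if ln.strip()]
    total = len(rows)
    ok = sum(1 for r in rows if r.get("ok"))
    avg_ms = sum(r.get("latency_ms", 0) for r in rows) / max(total, 1)
    return {
        "total": total,
        "success_rate": ok / max(total, 1),
        "avg_latency_ms": round(avg_ms, 1),
    }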
benchmarks/results_demo/20251108-111403/demo.jsonl ADDED
@@ -0,0 +1,5 @@
1
+ {"query": "list all customers", "ok": true, "latency_ms": 12, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
2
+ {"query": "show total invoices per country", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
3
+ {"query": "top 3 albums by total sales", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
4
+ {"query": "artists with more than 3 albums", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
5
+ {"query": "number of employees per city", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 0}, {"stage": "generator", "ms": 0}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
benchmarks/results_demo/20251108-111403/results.csv ADDED
@@ -0,0 +1,6 @@
1
+ query,ok,latency_ms
2
+ list all customers,✅,12
3
+ show total invoices per country,✅,1
4
+ top 3 albums by total sales,✅,1
5
+ artists with more than 3 albums,✅,1
6
+ number of employees per city,✅,1
benchmarks/results_demo/20251108-111403/summary.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "avg_latency_ms": 3.2,
3
+ "success_rate": 1.0,
4
+ "db_path": "/Users/melikakheirieh/Desktop/my/career-developement/LLM/nl2sql-copilot/demo.db",
5
+ "config": "configs/sqlite_pipeline.yaml",
6
+ "provider_hint": "STUBS",
7
+ "timestamp": "2025-11-08 11:14:03"
8
+ }
benchmarks/results_pro/20251108-105442/spider_eval_pro.jsonl ADDED
@@ -0,0 +1,20 @@
1
+ {"id": 1, "db_id": "concert_singer", "question": "How many singers do we have?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT count(*) FROM singer", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
2
+ {"id": 2, "db_id": "concert_singer", "question": "What is the total number of singers?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT count(*) FROM singer", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
3
+ {"id": 3, "db_id": "concert_singer", "question": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "sql_pred": "SELECT 1;", "sql_gold": "SELECT name , country , age FROM singer ORDER BY age DESC", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
4
+ {"id": 4, "db_id": "concert_singer", "question": "What are the names, countries, and ages for every singer in descending order of age?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT name , country , age FROM singer ORDER BY age DESC", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
5
+ {"id": 5, "db_id": "concert_singer", "question": "What is the average, minimum, and maximum age of all singers from France?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
6
+ {"id": 6, "db_id": "concert_singer", "question": "What is the average, minimum, and maximum age for all French singers?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
7
+ {"id": 7, "db_id": "concert_singer", "question": "Show the name and the release year of the song by the youngest singer.", "sql_pred": "SELECT 1;", "sql_gold": "SELECT song_name , song_release_year FROM singer ORDER BY age LIMIT 1", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
8
+ {"id": 8, "db_id": "concert_singer", "question": "What are the names and release years for all the songs of the youngest singer?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT song_name , song_release_year FROM singer ORDER BY age LIMIT 1", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
9
+ {"id": 9, "db_id": "concert_singer", "question": "What are all distinct countries where singers above age 20 are from?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT DISTINCT country FROM singer WHERE age > 20", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
10
+ {"id": 10, "db_id": "concert_singer", "question": "What are the different countries with singers above age 20?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT DISTINCT country FROM singer WHERE age > 20", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
11
+ {"id": 11, "db_id": "concert_singer", "question": "Show all countries and the number of singers in each country.", "sql_pred": "SELECT 1;", "sql_gold": "SELECT country , count(*) FROM singer GROUP BY country", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
12
+ {"id": 12, "db_id": "concert_singer", "question": "How many singers are from each country?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT country , count(*) FROM singer GROUP BY country", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
13
+ {"id": 13, "db_id": "concert_singer", "question": "List all song names by singers above the average age.", "sql_pred": "SELECT 1;", "sql_gold": "SELECT song_name FROM singer WHERE age > (SELECT avg(age) FROM singer)", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
14
+ {"id": 14, "db_id": "concert_singer", "question": "What are all the song names by singers who are older than average?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT song_name FROM singer WHERE age > (SELECT avg(age) FROM singer)", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
15
+ {"id": 15, "db_id": "concert_singer", "question": "Show location and name for all stadiums with a capacity between 5000 and 10000.", "sql_pred": "SELECT 1;", "sql_gold": "SELECT LOCATION , name FROM stadium WHERE capacity BETWEEN 5000 AND 10000", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
16
+ {"id": 16, "db_id": "concert_singer", "question": "What are the locations and names of all stations with capacity between 5000 and 10000?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT LOCATION , name FROM stadium WHERE capacity BETWEEN 5000 AND 10000", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
17
+ {"id": 17, "db_id": "concert_singer", "question": "What is the maximum capacity and the average of all stadiums ?", "sql_pred": "SELECT 1;", "sql_gold": "select max(capacity), average from stadium", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
18
+ {"id": 18, "db_id": "concert_singer", "question": "What is the average and maximum capacities for all stadiums ?", "sql_pred": "SELECT 1;", "sql_gold": "select avg(capacity) , max(capacity) from stadium", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
19
+ {"id": 19, "db_id": "concert_singer", "question": "What is the name and capacity for the stadium with highest average attendance?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT name , capacity FROM stadium ORDER BY average DESC LIMIT 1", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
20
+ {"id": 20, "db_id": "concert_singer", "question": "What is the name and capacity for the stadium with the highest average attendance?", "sql_pred": "SELECT 1;", "sql_gold": "SELECT name , capacity FROM stadium ORDER BY average DESC LIMIT 1", "em": false, "sm": false, "exec_acc": false, "latency_ms": 0, "error": null}
benchmarks/results_pro/20251108-105442/summary.csv ADDED
@@ -0,0 +1,21 @@
1
+ id,db_id,question,em,sm,exec_acc,latency_ms
2
+ 1,concert_singer,How many singers do we have?,❌,❌,❌,0
3
+ 2,concert_singer,What is the total number of singers?,❌,❌,❌,0
4
+ 3,concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",❌,❌,❌,0
5
+ 4,concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",❌,❌,❌,0
6
+ 5,concert_singer,"What is the average, minimum, and maximum age of all singers from France?",❌,❌,❌,0
7
+ 6,concert_singer,"What is the average, minimum, and maximum age for all French singers?",❌,❌,❌,0
8
+ 7,concert_singer,Show the name and the release year of the song by the youngest singer.,❌,❌,❌,0
9
+ 8,concert_singer,What are the names and release years for all the songs of the youngest singer?,❌,❌,❌,0
10
+ 9,concert_singer,What are all distinct countries where singers above age 20 are from?,❌,❌,❌,0
11
+ 10,concert_singer,What are the different countries with singers above age 20?,❌,❌,❌,0
12
+ 11,concert_singer,Show all countries and the number of singers in each country.,❌,❌,❌,0
13
+ 12,concert_singer,How many singers are from each country?,❌,❌,❌,0
14
+ 13,concert_singer,List all song names by singers above the average age.,❌,❌,❌,0
15
+ 14,concert_singer,What are all the song names by singers who are older than average?,❌,❌,❌,0
16
+ 15,concert_singer,Show location and name for all stadiums with a capacity between 5000 and 10000.,❌,❌,❌,0
17
+ 16,concert_singer,What are the locations and names of all stations with capacity between 5000 and 10000?,❌,❌,❌,0
18
+ 17,concert_singer,What is the maximum capacity and the average of all stadiums ?,❌,❌,❌,0
19
+ 18,concert_singer,What is the average and maximum capacities for all stadiums ?,❌,❌,❌,0
20
+ 19,concert_singer,What is the name and capacity for the stadium with highest average attendance?,❌,❌,❌,0
21
+ 20,concert_singer,What is the name and capacity for the stadium with the highest average attendance?,❌,❌,❌,0
benchmarks/results_pro/20251108-105442/summary.json ADDED
@@ -0,0 +1,8 @@
1
+ {
2
+ "total": 20,
3
+ "EM": 0.0,
4
+ "SM": 0.0,
5
+ "ExecAcc": 0.0,
6
+ "avg_latency_ms": 0.0,
7
+ "timestamp": "2025-11-08 10:54:42"
8
+ }
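Note: em/sm/exec_acc are presumably the usual Spider-style metrics (exact match, structure match, execution accuracy); with the stub prediction "SELECT 1;" all three are 0.0 by construction. A minimal sketch of an execution-accuracy check, assuming read-only SQLite access (illustrative; the evaluator in this repo may normalize results differently):

import sqlite3
from collections import Counter


def exec_match(db_path: str, sql_pred: str, sql_gold: str) -> bool:
    """Do predicted and gold SQL return the same multiset of rows?"""
    con = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
    try:
        pred = con.execute(sql_pred).fetchall()
        gold = con.execute(sql_gold).fetchall()
    except sqlite3.Error:
        # a prediction that fails to execute cannot be execution-accurate
        return False
    finally:
        con.close()
    # order-insensitive comparison of result rows
    return Counter(pred) == Counter(gold)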
benchmarks/run.py DELETED
@@ -1,214 +0,0 @@
1
- from __future__ import annotations
2
-
3
- import argparse
4
- import os
5
- import json
6
- import time
7
- from pathlib import Path
8
- from typing import Iterable, List, Dict, Any, Protocol, Tuple, Optional
9
-
10
- # ---- app imports
11
- from nl2sql.pipeline import Pipeline, FinalResult
12
- from nl2sql.ambiguity_detector import AmbiguityDetector
13
- from nl2sql.planner import Planner
14
- from nl2sql.generator import Generator
15
- from nl2sql.safety import Safety
16
- from nl2sql.executor import Executor
17
- from nl2sql.verifier import Verifier
18
- from nl2sql.repair import Repair
19
-
20
- # ---- adapters
21
- from adapters.db.sqlite_adapter import SQLiteAdapter
22
- from adapters.llm.openai_provider import OpenAIProvider
23
-
24
-
25
- # ---- LLM protocol (unifies OpenAIProvider and DummyLLM for mypy)
26
- class LLMProvider(Protocol):
27
- """Minimal interface required by Planner/Generator/Repair stages."""
28
-
29
- provider_id: str
30
-
31
- def plan(
32
- self, *, user_query: str, schema_preview: str
33
- ) -> Tuple[str, int, int, float]: ...
34
-
35
- def generate_sql(
36
- self,
37
- *,
38
- user_query: str,
39
- schema_preview: str,
40
- plan_text: str,
41
- clarify_answers: Optional[Any] = None,
42
- ) -> Tuple[str, str, int, int, float]: ...
43
-
44
- def repair(
45
- self, *, sql: str, error_msg: str, schema_preview: str
46
- ) -> Tuple[str, int, int, float]: ...
47
-
48
-
49
- # ---- fallback: Dummy LLM (so it runs without API keys)
50
- class DummyLLM:
51
- provider_id = "dummy-llm"
52
-
53
- def plan(
54
- self, *, user_query: str, schema_preview: str
55
- ) -> Tuple[str, int, int, float]:
56
- text = (
57
- f"- understand question: {user_query}\n"
58
- "- identify tables\n- join if needed\n- filter\n- order/limit"
59
- )
60
- return text, 0, 0, 0.0
61
-
62
- def generate_sql(
63
- self,
64
- *,
65
- user_query: str,
66
- schema_preview: str,
67
- plan_text: str,
68
- clarify_answers: Optional[Any] = None,
69
- ) -> Tuple[str, str, int, int, float]:
70
- # naive demo SQL (so pipeline flows end-to-end)
71
- sql = "SELECT 1 AS one;"
72
- rationale = "Demo SQL from DummyLLM"
73
- return sql, rationale, 0, 0, 0.0
74
-
75
- def repair(
76
- self, *, sql: str, error_msg: str, schema_preview: str
77
- ) -> Tuple[str, int, int, float]:
78
- return sql, 0, 0, 0.0
79
-
80
-
81
- def ensure_demo_db(path: Path) -> None:
82
- """Create a tiny SQLite db if missing, so executor has something to run."""
83
- if path.exists():
84
- return
85
- import sqlite3
86
-
87
- path.parent.mkdir(parents=True, exist_ok=True)
88
- con = sqlite3.connect(path)
89
- cur = con.cursor()
90
- cur.execute("CREATE TABLE users(id INTEGER PRIMARY KEY, name TEXT, spend REAL);")
91
- cur.executemany(
92
- "INSERT INTO users(id,name,spend) VALUES(?,?,?)",
93
- [(1, "Alice", 120.5), (2, "Bob", 80.0), (3, "Carol", 155.0)],
94
- )
95
- con.commit()
96
- con.close()
97
-
98
-
99
- def build_pipeline(db_path: Path, use_openai: bool) -> Pipeline:
100
- # DB adapter
101
- db = SQLiteAdapter(str(db_path))
102
- executor = Executor(db)
103
-
104
- # LLM provider (typed to the Protocol so mypy accepts either provider)
105
- llm: LLMProvider
106
- if use_openai and os.getenv("OPENAI_API_KEY"):
107
- llm = OpenAIProvider() # conforms to LLMProvider
108
- else:
109
- llm = DummyLLM() # conforms to LLMProvider
110
-
111
- # stages
112
- detector = AmbiguityDetector()
113
- planner = Planner(llm)
114
- generator = Generator(llm)
115
- safety = Safety()
116
- verifier = Verifier()
117
- repair = Repair(llm)
118
-
119
- # pipeline
120
- return Pipeline(
121
- detector=detector,
122
- planner=planner,
123
- generator=generator,
124
- safety=safety,
125
- executor=executor,
126
- verifier=verifier,
127
- repair=repair,
128
- )
129
-
130
-
131
- def _sum_cost(traces: Iterable[Dict[str, Any]]) -> float:
132
- total = 0.0
133
- for tr in traces:
134
- try:
135
- total += float(tr.get("cost_usd", 0.0))
136
- except Exception:
137
- # ignore bad values
138
- pass
139
- return total
140
-
141
-
142
- def _is_safe_fail(ok: bool, details: List[str] | None) -> float:
143
- """Return 1.0 when pipeline failed due to unsafe SQL (heuristic)."""
144
- if ok:
145
- return 0.0
146
- txt = " ".join(details or []).lower()
147
- return 1.0 if "unsafe" in txt else 0.0
148
-
149
-
150
- def run_benchmark(
151
- queries: List[str], schema_preview: str, pipeline: Pipeline, outfile: Path
152
- ) -> None:
153
- results: List[Dict[str, Any]] = []
154
- for q in queries:
155
- t0 = time.perf_counter()
156
- res: FinalResult = pipeline.run(user_query=q, schema_preview=schema_preview)
157
- latency_ms = (time.perf_counter() - t0) * 1000.0
158
-
159
- ok = (not res.ambiguous) and (not res.error) and bool(res.ok)
160
- traces = res.traces or []
161
- cost_sum = _sum_cost(traces)
162
-
163
- results.append(
164
- {
165
- "query": q,
166
- "exec_acc": 1.0 if ok else 0.0,
167
- "safe_fail": _is_safe_fail(ok, res.details),
168
- "latency_ms": latency_ms,
169
- "cost_usd": cost_sum,
170
- "repair_attempts": sum(1 for t in traces if t.get("stage") == "repair"),
171
- "provider": getattr(
172
- getattr(pipeline.generator, "llm", None), "provider_id", "unknown"
173
- ),
174
- }
175
- )
176
-
177
- outfile.parent.mkdir(parents=True, exist_ok=True)
178
- with open(outfile, "w") as f:
179
- for row in results:
180
- f.write(json.dumps(row) + "\n")
181
- print(f"[OK] wrote {len(results)} rows → {outfile}")
182
-
183
-
184
- def main() -> None:
185
- parser = argparse.ArgumentParser()
186
- parser.add_argument("--outfile", default="benchmarks/results/demo.jsonl")
187
- parser.add_argument("--db", default="data/bench_demo.db")
188
- parser.add_argument(
189
- "--use-openai",
190
- action="store_true",
191
- help="Use OpenAI provider if API key present",
192
- )
193
- args = parser.parse_args()
194
-
195
- root = Path(__file__).resolve().parents[1] # project root
196
- outfile = (root / args.outfile).resolve()
197
- db_path = (root / args.db).resolve()
198
-
199
- ensure_demo_db(db_path)
200
- pipe = build_pipeline(db_path, use_openai=args.use_openai)
201
-
202
- # a small demo set; replace with Spider when ready
203
- queries = [
204
- "show all users",
205
- "top spenders",
206
- "sum of spend",
207
- ]
208
- schema_preview = "CREATE TABLE users(id INT, name TEXT, spend REAL);"
209
-
210
- run_benchmark(queries, schema_preview, pipe, outfile)
211
-
212
-
213
- if __name__ == "__main__":
214
- main()
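Note: the removed runner typed its provider slot with a structural Protocol, so OpenAIProvider and DummyLLM were interchangeable without a shared base class. A minimal sketch of that pattern (names simplified; illustrative only):

from typing import Protocol, Tuple


class PlanProvider(Protocol):
    provider_id: str

    def plan(self, *, user_query: str, schema_preview: str) -> Tuple[str, int, int, float]: ...


class StubLLM:
    provider_id = "stub"

    def plan(self, *, user_query: str, schema_preview: str) -> Tuple[str, int, int, float]:
        return f"plan for: {user_query}", 0, 0, 0.0


llm: PlanProvider = StubLLM()  # accepted structurally by mypy; no inheritance needed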
benchmarks/spider_loader.py CHANGED
@@ -1,12 +1,11 @@
1
  from __future__ import annotations
 
2
  import json
3
- import pathlib
4
  import sqlite3
5
  from dataclasses import dataclass
 
6
  from typing import List, Optional
7
- import os
8
-
9
- SPIDER_ROOT = pathlib.Path(os.getenv("SPIDER_ROOT", "data/spider"))
10
 
11
 
12
  @dataclass
@@ -14,40 +13,150 @@ class SpiderItem:
14
  db_id: str
15
  question: str
16
  gold_sql: str
17
- db_path: pathlib.Path
18
 
19
 
20
  def load_spider_sqlite(
21
- split: str = "dev", limit: Optional[int] = None
22
  ) -> List[SpiderItem]:
23
- fn = {"dev": "dev.json", "train": "train_spider.json"}[split]
24
- json_path = SPIDER_ROOT / fn
25
  try:
26
  items = json.loads(json_path.read_text(encoding="utf-8"))
27
  except Exception as e:
28
  raise RuntimeError(f"Failed to read Spider split file: {json_path} ({e})")
29
 
30
- out: list[SpiderItem] = []
31
- for ex in items[: (limit or len(items))]:
32
- db_id = ex["db_id"]
33
- db_path = SPIDER_ROOT / "database" / db_id / f"{db_id}.sqlite"
34
- if not db_path.exists():
35
- raise FileNotFoundError(f"Missing SQLite DB for {db_id}: {db_path}")
36
  out.append(
37
- SpiderItem(
38
- db_id=db_id,
39
- question=ex["question"],
40
- gold_sql=ex["query"],
41
- db_path=db_path,
42
- )
43
  )
44
  return out
45
 
46
 
47
- def open_readonly_connection(
48
- db_path: pathlib.Path, timeout: float = 5.0
49
- ) -> sqlite3.Connection:
50
- uri = f"file:{db_path}?mode=ro&uri=true"
51
- conn = sqlite3.connect(uri, uri=True, timeout=timeout)
52
- conn.row_factory = sqlite3.Row
53
- return conn
 
1
  from __future__ import annotations
2
+
3
  import json
4
+ import os
5
  import sqlite3
6
  from dataclasses import dataclass
7
+ from pathlib import Path
8
  from typing import List, Optional
9
 
10
 
11
  @dataclass
 
13
  db_id: str
14
  question: str
15
  gold_sql: str
16
+ db_path: str # absolute path to the sqlite file
17
+
18
+
19
+ # ---------- helpers ----------
20
+
21
+
22
+ def _candidate_roots(env_root: Optional[str]) -> List[Path]:
23
+ """
24
+ Build a small list of candidate Spider roots to tolerate common layouts:
25
+ - $SPIDER_ROOT
26
+ - data/spider
27
+ - data/spider/spider (when the repo was cloned into data/spider/spider)
28
+ - <env>/spider (when SPIDER_ROOT points to the parent directory)
29
+ """
30
+ cands: List[Path] = []
31
+ if env_root:
32
+ p = Path(env_root).expanduser().resolve()
33
+ cands.append(p)
34
+ cands.append((p / "spider").resolve())
35
+ # project-local defaults
36
+ here = Path.cwd().resolve()
37
+ cands.append((here / "data" / "spider").resolve())
38
+ cands.append((here / "data" / "spider" / "spider").resolve())
39
+ # de-dup
40
+ seen, uniq = set(), []
41
+ for x in cands:
42
+ if str(x) not in seen:
43
+ uniq.append(x)
44
+ seen.add(str(x))
45
+ return uniq
46
+
47
+
48
+ def _resolve_split_json(root: Path, split: str) -> Path:
49
+ """
50
+ Map split name to file name and return full path under `root`.
51
+ Spider uses:
52
+ - dev.json
53
+ - train_spider.json
54
+ """
55
+ fname = "dev.json" if split == "dev" else "train_spider.json"
56
+ return (root / fname).resolve()
57
+
58
+
59
+ def _resolve_database_dir(root: Path) -> Path:
60
+ return (root / "database").resolve()
61
+
62
+
63
+ def _ensure_exists(path: Path, kind: str) -> None:
64
+ if not path.exists():
65
+ raise FileNotFoundError(f"{kind} not found: {path}")
66
+
67
+
68
+ # ---------- public API ----------
69
 
70
 
71
  def load_spider_sqlite(
72
+ *, split: str = "dev", limit: Optional[int] = None
73
  ) -> List[SpiderItem]:
74
+ """
75
+ Load a subset of Spider (dev/train) and attach absolute sqlite db paths.
76
+ Looks under:
77
+ - $SPIDER_ROOT (if set)
78
+ - ./data/spider
79
+ - ./data/spider/spider
80
+ - $SPIDER_ROOT/spider
81
+ """
82
+ env_root = os.getenv("SPIDER_ROOT")
83
+ roots = _candidate_roots(env_root)
84
+
85
+ # find a root that actually contains the split file & database/
86
+ json_path: Optional[Path] = None
87
+ database_dir: Optional[Path] = None
88
+ chosen_root: Optional[Path] = None
89
+
90
+ for r in roots:
91
+ jp = _resolve_split_json(r, split)
92
+ dbd = _resolve_database_dir(r)
93
+ if jp.exists() and dbd.exists():
94
+ json_path, database_dir, chosen_root = jp, dbd, r
95
+ break
96
+
97
+ if json_path is None or database_dir is None:
98
+ debug = "\n".join(
99
+ f"- {str(_resolve_split_json(r, split))} | {str(_resolve_database_dir(r))}"
100
+ for r in roots
101
+ )
102
+ raise RuntimeError(
103
+ "Failed to locate Spider dataset.\n"
104
+ f"Checked candidates for split='{split}':\n{debug}\n"
105
+ "Tip: export SPIDER_ROOT=/absolute/path/to/spider "
106
+ "(the folder that directly contains dev.json/train_spider.json and database/)"
107
+ )
108
+
109
+ # read split
110
  try:
111
  items = json.loads(json_path.read_text(encoding="utf-8"))
112
  except Exception as e:
113
  raise RuntimeError(f"Failed to read Spider split file: {json_path} ({e})")
114
 
115
+ # build rows with absolute sqlite path
116
+ out: List[SpiderItem] = []
117
+ for obj in items:
118
+ db_id: str = obj.get("db_id", "")
119
+ q: str = obj.get("question", "").strip()
120
+ gold: str = obj.get("query", obj.get("sql", "")).strip() # Spider uses 'query'
121
+ if not (db_id and q and gold):
122
+ continue
123
+
124
+ # <root>/database/<db_id>/<db_id>.sqlite
125
+ db_file = (database_dir / db_id / f"{db_id}.sqlite").resolve()
126
+ if not db_file.exists():
127
+ # some mirrors use .db; try a fallback
128
+ alt = (database_dir / db_id / f"{db_id}.db").resolve()
129
+ if alt.exists():
130
+ db_file = alt
131
+ else:
132
+ # skip if DB file missing
133
+ # (you could also raise here if you prefer strict behavior)
134
+ continue
135
+
136
  out.append(
137
+ SpiderItem(db_id=db_id, question=q, gold_sql=gold, db_path=str(db_file))
138
  )
139
+
140
+ if limit is not None and len(out) >= limit:
141
+ break
142
+
143
+ if not out:
144
+ raise RuntimeError(
145
+ f"No usable items from {json_path} (limit={limit}). "
146
+ "Check db files under database/<db_id>/<db_id>.sqlite"
147
+ )
148
+
149
+ # small info for sanity
150
+ print(
151
+ f"βœ” Spider root: {chosen_root}\n"
152
+ f"βœ” Split file: {json_path.name} ({len(out)} items)"
153
+ )
154
  return out
155
 
156
 
157
+ def open_readonly_connection(db_path: str) -> sqlite3.Connection:
158
+ """
159
+ Open SQLite in read-only mode (URI).
160
+ """
161
+ uri = f"file:{Path(db_path).resolve()}?mode=ro"
162
+ return sqlite3.connect(uri, uri=True, check_same_thread=False)
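Note: a quick usage sketch for the two public entry points above (the limit value and invocation are illustrative; SPIDER_ROOT must point at a local Spider checkout as the docstring describes, and the script should run with PYTHONPATH=$PWD):

from benchmarks.spider_loader import load_spider_sqlite, open_readonly_connection

items = load_spider_sqlite(split="dev", limit=5)
for item in items:
    conn = open_readonly_connection(item.db_path)
    try:
        # gold SQL is executable as-is against the bundled Spider databases
        rows = conn.execute(item.gold_sql).fetchall()
        print(item.db_id, "->", len(rows), "rows")
    finally:
        conn.close()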
 
scripts/smoke_run.py ADDED
@@ -0,0 +1,335 @@
1
+ """
2
+ Minimal smoke/demo runner for the NL2SQL pipeline.
3
+
4
+ - Builds the pipeline via the official factory (no app/router imports).
5
+ - Runs a small set of demo questions against a SQLite DB.
6
+ - Works in two modes:
7
+ * Stub mode (set PYTEST_CURRENT_TEST=1) → no API key needed.
8
+ * Real mode (set OPENAI_API_KEY=...) → uses actual LLM provider.
9
+
10
+ Outputs:
11
+ benchmarks/results_demo/<timestamp>/
12
+ - demo.jsonl # one JSON record per query
13
+ - summary.json # latency & success overview
14
+ - results.csv # compact table for quick inspection
15
+
16
+ Usage examples:
17
+ PYTHONPATH=$PWD PYTEST_CURRENT_TEST=1 \
18
+ python scripts/smoke_run.py --db-path demo.db
19
+
20
+ # With a custom dataset file (JSON: list[str] or list[{question: "..."}])
21
+ PYTHONPATH=$PWD PYTEST_CURRENT_TEST=1 \
22
+ python scripts/smoke_run.py --db-path demo.db --dataset-file benchmarks/demo.json
23
+ """
24
+
25
+ from __future__ import annotations
26
+
27
+ import argparse
28
+ import csv
29
+ import json
30
+ import os
31
+ import time
32
+ from pathlib import Path
33
+ from typing import Any, Dict, List, Optional
34
+ import sqlite3
35
+
36
+ from nl2sql.pipeline_factory import pipeline_from_config_with_adapter
37
+ from adapters.db.sqlite_adapter import SQLiteAdapter
38
+
39
+ CONFIG_PATH = "configs/sqlite_pipeline.yaml"
40
+ DEFAULT_QUESTIONS: List[str] = [
41
+ "list all customers",
42
+ "show total invoices per country",
43
+ "top 3 albums by total sales",
44
+ "artists with more than 3 albums",
45
+ "number of employees per city",
46
+ ]
47
+
48
+ RESULT_ROOT = Path("benchmarks") / "results_demo"
49
+ TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
50
+ RESULT_DIR = RESULT_ROOT / TIMESTAMP
51
+
52
+
53
+ def ensure_demo_db(db_path: Path) -> None:
54
+ """Create a tiny demo SQLite DB if it doesn't exist."""
55
+ if db_path.exists():
56
+ return
57
+ db_path.parent.mkdir(parents=True, exist_ok=True)
58
+ conn = sqlite3.connect(str(db_path))
59
+ cur = conn.cursor()
60
+
61
+ # Minimal schema that matches our default demo questions
62
+ cur.executescript("""
63
+ DROP TABLE IF EXISTS customers;
64
+ DROP TABLE IF EXISTS invoices;
65
+ DROP TABLE IF EXISTS employees;
66
+ DROP TABLE IF EXISTS artists;
67
+ DROP TABLE IF EXISTS albums;
68
+
69
+ CREATE TABLE customers (
70
+ id INTEGER PRIMARY KEY,
71
+ name TEXT,
72
+ country TEXT
73
+ );
74
+
75
+ CREATE TABLE invoices (
76
+ id INTEGER PRIMARY KEY,
77
+ customer_id INTEGER,
78
+ total REAL,
79
+ country TEXT,
80
+ FOREIGN KEY (customer_id) REFERENCES customers(id)
81
+ );
82
+
83
+ CREATE TABLE employees (
84
+ id INTEGER PRIMARY KEY,
85
+ name TEXT,
86
+ city TEXT
87
+ );
88
+
89
+ CREATE TABLE artists (
90
+ id INTEGER PRIMARY KEY,
91
+ name TEXT
92
+ );
93
+
94
+ CREATE TABLE albums (
95
+ id INTEGER PRIMARY KEY,
96
+ artist_id INTEGER,
97
+ title TEXT,
98
+ sales REAL DEFAULT 0,
99
+ FOREIGN KEY (artist_id) REFERENCES artists(id)
100
+ );
101
+ """)
102
+
103
+ # Seed a bit of data
104
+ cur.executemany(
105
+ "INSERT INTO customers (id, name, country) VALUES (?, ?, ?)",
106
+ [
107
+ (1, "Alice", "USA"),
108
+ (2, "Bob", "Germany"),
109
+ (3, "Carlos", "Brazil"),
110
+ (4, "Darya", "Iran"),
111
+ ],
112
+ )
113
+ cur.executemany(
114
+ "INSERT INTO invoices (id, customer_id, total, country) VALUES (?, ?, ?, ?)",
115
+ [
116
+ (1, 1, 120.5, "USA"),
117
+ (2, 2, 75.0, "Germany"),
118
+ (3, 1, 33.2, "USA"),
119
+ (4, 3, 48.0, "Brazil"),
120
+ (5, 4, 90.0, "Iran"),
121
+ ],
122
+ )
123
+ cur.executemany(
124
+ "INSERT INTO employees (id, name, city) VALUES (?, ?, ?)",
125
+ [
126
+ (1, "Eve", "New York"),
127
+ (2, "Frank", "Berlin"),
128
+ (3, "Gita", "Tehran"),
129
+ ],
130
+ )
131
+ cur.executemany(
132
+ "INSERT INTO artists (id, name) VALUES (?, ?)",
133
+ [
134
+ (1, "ABand"),
135
+ (2, "BGroup"),
136
+ (3, "CEnsemble"),
137
+ ],
138
+ )
139
+ cur.executemany(
140
+ "INSERT INTO albums (id, artist_id, title, sales) VALUES (?, ?, ?, ?)",
141
+ [
142
+ (1, 1, "First Light", 500.0),
143
+ (2, 1, "Second Wind", 300.0),
144
+ (3, 2, "Blue Lines", 900.0),
145
+ (4, 3, "Echoes", 150.0),
146
+ ],
147
+ )
148
+
149
+ conn.commit()
150
+ conn.close()
151
+
152
+
153
+ def _ms(start_s: float) -> int:
154
+ """Convert elapsed seconds to integer milliseconds."""
155
+ return int((time.perf_counter() - start_s) * 1000)
156
+
157
+
158
+ def _derive_schema_preview(pipeline_obj: Any) -> Optional[str]:
159
+ """Try to derive schema preview from adapter/executor if available."""
160
+ for attr in ("executor", "adapter"):
161
+ obj = getattr(pipeline_obj, attr, None)
162
+ if obj and hasattr(obj, "derive_schema_preview"):
163
+ try:
164
+ return obj.derive_schema_preview() # type: ignore[no-any-return]
165
+ except Exception:
166
+ pass
167
+ return None
168
+
169
+
170
+ def _normalize_trace(trace_obj: Any) -> List[Dict[str, Any]]:
171
+ """Convert trace to a list of {stage, ms} dicts for logging/export."""
172
+ out: List[Dict[str, Any]] = []
173
+ if not isinstance(trace_obj, list):
174
+ return out
175
+ for t in trace_obj:
176
+ if isinstance(t, dict):
177
+ stage = t.get("stage", "?")
178
+ ms = t.get("duration_ms", 0)
179
+ else:
180
+ stage = getattr(t, "stage", "?")
181
+ ms = getattr(t, "duration_ms", 0)
182
+ try:
183
+ out.append({"stage": str(stage), "ms": int(ms)})
184
+ except Exception:
185
+ out.append({"stage": str(stage), "ms": 0})
186
+ return out
187
+
188
+
189
+ def _load_questions(path: Optional[str]) -> List[str]:
190
+ """Load questions from a JSON file or return defaults."""
191
+ if not path:
192
+ return DEFAULT_QUESTIONS
193
+ p = Path(path)
194
+ if not p.exists():
195
+ raise FileNotFoundError(f"dataset file not found: {p}")
196
+ data = json.loads(p.read_text(encoding="utf-8"))
197
+ if isinstance(data, list):
198
+ if all(isinstance(x, str) for x in data):
199
+ return list(data)
200
+ if all(isinstance(x, dict) and "question" in x for x in data):
201
+ return [str(x["question"]) for x in data]
202
+ raise ValueError(
203
+ "Dataset must be a JSON array of strings or objects with a 'question' field."
204
+ )
205
+
206
+
207
+ def main() -> None:
208
+ ap = argparse.ArgumentParser()
209
+ ap.add_argument(
210
+ "--db-path",
211
+ type=str,
212
+ default="demo.db",
213
+ help="Path to SQLite DB (default: demo.db)",
214
+ )
215
+ ap.add_argument(
216
+ "--dataset-file",
217
+ type=str,
218
+ default=None,
219
+ help="Optional JSON file: list[str] or list[{question: str}]",
220
+ )
221
+ ap.add_argument(
222
+ "--config",
223
+ type=str,
224
+ default=CONFIG_PATH,
225
+ help=f"Pipeline YAML (default: {CONFIG_PATH})",
226
+ )
227
+ args = ap.parse_args()
228
+
229
+ RESULT_DIR.mkdir(parents=True, exist_ok=True)
230
+
231
+ # Resolve DB path and ensure demo DB exists for quick smoke runs
232
+ db_path = Path(args.db_path).resolve()
233
+ ensure_demo_db(db_path)
234
+
235
+ # Build pipeline via the official factory (factory decides real vs stub by env)
236
+ adapter = SQLiteAdapter(str(db_path))
237
+ pipeline = pipeline_from_config_with_adapter(args.config, adapter=adapter)
238
+
239
+ schema_preview = _derive_schema_preview(pipeline)
240
+ print(f"✅ Pipeline ready (db={db_path.name}, config={args.config})")
241
+ print(
242
+ "📄 Schema preview:",
243
+ "yes" if schema_preview else "no",
244
+ "| provider:",
245
+ "STUBS" if os.getenv("PYTEST_CURRENT_TEST") else "REAL",
246
+ )
247
+
248
+ questions = _load_questions(args.dataset_file)
249
+ print(f"🗂 Loaded {len(questions)} questions.")
250
+
251
+ rows: List[Dict[str, Any]] = []
252
+ for q in questions:
253
+ print(f"\n🧠 Query: {q}")
254
+ t0 = time.perf_counter()
255
+ try:
256
+ result = pipeline.run(user_query=q, schema_preview=schema_preview or "")
257
+ latency_ms = _ms(t0) or 1 # clamp to 1ms when stubs are instant
258
+ stages = _normalize_trace(
259
+ getattr(result, "traces", getattr(result, "trace", []))
260
+ )
261
+ rows.append(
262
+ {
263
+ "query": q,
264
+ "ok": bool(getattr(result, "ok", True)),
265
+ "latency_ms": latency_ms,
266
+ "trace": stages,
267
+ "error": None,
268
+ }
269
+ )
270
+ print(f"✅ Success ({latency_ms} ms)")
271
+ except Exception as exc:
272
+ latency_ms = _ms(t0) or 1
273
+ rows.append(
274
+ {
275
+ "query": q,
276
+ "ok": False,
277
+ "latency_ms": latency_ms,
278
+ "trace": [],
279
+ "error": str(exc),
280
+ }
281
+ )
282
+ print(f"❌ Failed: {exc!s} ({latency_ms} ms)")
283
+
284
+ # Aggregate and persist
285
+ avg_latency = (
286
+ round(sum(r["latency_ms"] for r in rows) / max(len(rows), 1), 1)
287
+ if rows
288
+ else 0.0
289
+ )
290
+ success_rate = (
291
+ (sum(1 for r in rows if r["ok"]) / max(len(rows), 1)) if rows else 0.0
292
+ )
293
+ meta = {
294
+ "db_path": str(db_path),
295
+ "config": args.config,
296
+ "provider_hint": "STUBS" if os.getenv("PYTEST_CURRENT_TEST") else "REAL",
297
+ "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
298
+ }
299
+
300
+ jsonl_path = RESULT_DIR / "demo.jsonl"
301
+ with jsonl_path.open("w", encoding="utf-8") as f:
302
+ for r in rows:
303
+ json.dump(r, f, ensure_ascii=False)
304
+ f.write("\n")
305
+
306
+ summary_path = RESULT_DIR / "summary.json"
307
+ with summary_path.open("w", encoding="utf-8") as f:
308
+ json.dump(
309
+ {"avg_latency_ms": avg_latency, "success_rate": success_rate, **meta},
310
+ f,
311
+ indent=2,
312
+ )
313
+
314
+ csv_path = RESULT_DIR / "results.csv"
315
+ with csv_path.open("w", newline="", encoding="utf-8") as f:
316
+ wr = csv.DictWriter(f, fieldnames=["query", "ok", "latency_ms"])
317
+ wr.writeheader()
318
+ for r in rows:
319
+ wr.writerow(
320
+ {
321
+ "query": r["query"],
322
+ "ok": "✅" if r["ok"] else "❌",
323
+ "latency_ms": int(r["latency_ms"]),
324
+ }
325
+ )
326
+
327
+ print(
328
+ "\n💾 Saved outputs:\n"
329
+ f"- {jsonl_path}\n- {summary_path}\n- {csv_path}\n"
330
+ f"📊 Avg latency: {avg_latency} ms | Success rate: {success_rate:.0%}\n"
331
+ )
332
+
333
+
334
+ if __name__ == "__main__":
335
+ main()
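Note: per the module docstring, --dataset-file accepts a JSON array of strings or of objects with a "question" field. A minimal way to produce one (the file name is just an example):

import json
from pathlib import Path

# Either form below is accepted by _load_questions():
questions = ["list all customers", "top 3 albums by total sales"]
# questions = [{"question": "list all customers"}]
Path("benchmarks/demo.json").write_text(json.dumps(questions), encoding="utf-8")

Then run, as in the docstring: PYTHONPATH=$PWD PYTEST_CURRENT_TEST=1 python scripts/smoke_run.py --db-path demo.db --dataset-file benchmarks/demo.json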