Spaces:

melikakheirieh
/

nl2sql-copilot

Sleeping

App Files Files Community

Melika Kheirieh committed on Nov 3, 2025

Commit

ed681b1

1 Parent(s): 598536c

feat(benchmarks): align Spider eval with config-driven Pipeline and native Safety; log per-stage trace; add CSV summary

Browse files

Files changed (1) hide show

benchmarks/evaluate_spider.py +180 -103

benchmarks/evaluate_spider.py CHANGED Viewed

@@ -1,17 +1,22 @@
-"""
-Evaluate NL2SQL pipeline performance on Spider-like queries.
-Uses config-driven Pipeline, native Safety checks, and per-stage latency tracing.
-Outputs: JSONL (detailed logs), JSON (metrics summary), and CSV (for README).
-"""
-import json
 import csv
 import time
 from pathlib import Path
-from nl2sql.pipeline import Pipeline
-# ---------- Config ----------
-DATASET = [
     "list all customers",
     "show total invoices per country",
     "top 3 albums by total sales",
@@ -19,107 +24,179 @@ DATASET = [
     "number of employees per city",
 ]
-CONFIG_PATH = "configs/pipeline.yaml"
-RESULT_DIR = Path("benchmarks/results")
-RESULT_DIR.mkdir(parents=True, exist_ok=True)
-# ---------- Initialize pipeline ----------
-pipeline = Pipeline.from_config(CONFIG_PATH)
-print(f"✅ Loaded pipeline from {CONFIG_PATH}")
-# Optional: schema preview if adapter supports it
-schema_preview = None
-try:
-    adapter = getattr(pipeline, "executor", None)
-    if adapter and hasattr(adapter, "derive_schema_preview"):
-        schema_preview = adapter.derive_schema_preview()
-        print("📄 Derived schema preview successfully.")
-except Exception as e:
-    print(f"⚠️ Could not derive schema preview: {e}")
-# ---------- Evaluation ----------
-records = []
-for q in DATASET:
-    print(f"\n🧠 Query: {q}")
-    start = time.perf_counter()
     try:
-        result = pipeline.run(user_query=q, schema_preview=schema_preview)
-        latency = int((time.perf_counter() - start) * 1000)
-        trace = getattr(result, "trace", None)
-        stages = []
-        if trace:
-            # trace might be list of StageTrace or dicts
-            try:
-                for t in trace:
-                    stages.append(
-                        {"stage": t.get("stage", "?"), "ms": t.get("duration_ms", 0)}
-                        if isinstance(t, dict)
-                        else {
-                            "stage": getattr(t, "stage", "?"),
-                            "ms": getattr(t, "duration_ms", 0),
-                        }
-                    )
-            except Exception:
-                pass
-        records.append(
-            {
                 "query": q,
-                "ok": True,
-                "latency_ms": latency,
                 "trace": stages,
                 "error": None,
             }
-        )
-        print(f"✅ Success ({latency} ms)")
-    except Exception as e:
-        latency = int((time.perf_counter() - start) * 1000)
-        records.append(
-            {
                 "query": q,
                 "ok": False,
-                "latency_ms": latency,
                 "trace": [],
-                "error": str(e),
             }
-        )
-        print(f"❌ Failed: {e} ({latency} ms)")
-# ---------- Aggregate metrics ----------
-avg_latency = round(sum(r["latency_ms"] for r in records) / len(records), 1)
-success_rate = sum(1 for r in records if r["ok"]) / len(records)
-print(f"\n📊 Average latency: {avg_latency} ms | Success rate: {success_rate:.0%}")
-summary = {
-    "queries_total": len(records),
-    "success_rate": success_rate,
-    "avg_latency_ms": avg_latency,
-    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-}
-# ---------- Save outputs ----------
-jsonl_path = RESULT_DIR / "spider_eval.jsonl"
-with open(jsonl_path, "w", encoding="utf-8") as f:
-    for r in records:
-        json.dump(r, f, ensure_ascii=False)
-        f.write("\n")
-summary_path = RESULT_DIR / "metrics_summary.json"
-with open(summary_path, "w", encoding="utf-8") as f:
-    json.dump(summary, f, indent=2)
-csv_path = RESULT_DIR / "results.csv"
-with open(csv_path, "w", newline="", encoding="utf-8") as f:
-    writer = csv.DictWriter(f, fieldnames=["query", "ok", "latency_ms"])
-    writer.writeheader()
-    for r in records:
-        writer.writerow(
-            {
-                "query": r["query"],
-                "ok": "✅" if r["ok"] else "❌",
-                "latency_ms": r["latency_ms"],
-            }
-        )
-print(f"\n💾 Saved logs to:\n- {jsonl_path}\n- {summary_path}\n- {csv_path}")

+from __future__ import annotations
 import csv
+import json
+import os
 import time
 from pathlib import Path
+from typing import Any, Dict, List, Optional
+# Reuse existing factories from your FastAPI router (no new DI needed)
+from app.routers.nl2sql import (  # type: ignore
+    _pipeline as DEFAULT_PIPELINE,
+    _build_pipeline,
+    _select_adapter,
+)
+# -------------------- Config --------------------
+# Natural-language prompts that main() sends through the pipeline, in order.
+DATASET: List[str] = [
     "list all customers",
     "show total invoices per country",
     "top 3 albums by total sales",
     "number of employees per city",
 ]
+# Value handed to _select_adapter() in main(); read from the DB_MODE
+# environment variable and defaulting to "sqlite" when it is unset.
+DB_ID: str = os.environ.get("DB_MODE", "sqlite")
+# One sub-folder per run, named after the start time, so that earlier
+# results under benchmarks/results are never overwritten.
+TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
+RESULT_ROOT = Path("benchmarks", "results")
+RESULT_DIR = RESULT_ROOT.joinpath(TIMESTAMP)
+# -------------------- Helpers --------------------
+def _int_ms(start: float) -> int:
+    """
+    Return the whole milliseconds elapsed since *start*.
+
+    *start* must be a value previously obtained from time.perf_counter();
+    the fractional millisecond is truncated, not rounded.
+    """
+    elapsed_seconds = time.perf_counter() - start
+    return int(elapsed_seconds * 1000)
+def _derive_schema_preview_safe(pipeline_obj: Any) -> Optional[str]:
+    """
+    Best-effort lookup of a schema preview on the pipeline's adapter.
+
+    Looks at ``pipeline_obj.executor`` and then ``pipeline_obj.adapter``. The
+    first of them that exposes ``derive_schema_preview()`` and returns without
+    raising supplies the result. Kept intentionally permissive to avoid tight
+    coupling to any particular pipeline class.
+
+    Args:
+        pipeline_obj: Any object; missing attributes are tolerated.
+
+    Returns:
+        Whatever ``derive_schema_preview()`` returned, or ``None`` when no
+        candidate exposes the method or every attempt raised.
+    """
+    for attr_name in ("executor", "adapter"):
+        # Each candidate gets its own try so that a broken executor does not
+        # prevent the adapter from being tried afterwards.
+        try:
+            candidate = getattr(pipeline_obj, attr_name, None)
+            # Identity check rather than truthiness: an adapter object may
+            # legitimately be falsy (e.g. it defines __len__ or __bool__).
+            if candidate is None or not hasattr(candidate, "derive_schema_preview"):
+                continue
+            return candidate.derive_schema_preview()  # type: ignore[no-any-return, call-arg]
+        except Exception as exc:
+            # The preview is optional, so report the problem and keep going
+            # instead of aborting the benchmark.
+            print(f"⚠️ Could not derive schema preview from {attr_name}: {exc}")
+    return None
+def _to_stage_list(trace_obj: Any) -> List[Dict[str, Any]]:
+    """
+    Normalize a pipeline trace to a list of dicts:
+    [{ "stage": str, "ms": int }, ...]
+
+    Args:
+        trace_obj: A list or tuple whose entries are either dicts or objects
+            carrying ``stage`` / ``duration_ms``. Anything else (including
+            ``None``) yields an empty list.
+
+    Returns:
+        One dict per trace entry, in the original order. A missing stage name
+        becomes "?", and a missing or non-numeric duration becomes 0.
+    """
+    # Tuples are accepted as well as lists so that a trace stored as an
+    # immutable sequence is not silently discarded.
+    if not isinstance(trace_obj, (list, tuple)):
+        return []
+    stages: List[Dict[str, Any]] = []
+    for entry in trace_obj:
+        if isinstance(entry, dict):
+            stage = entry.get("stage", "?")
+            raw_ms = entry.get("duration_ms", 0)
+        else:
+            stage = getattr(entry, "stage", "?")
+            raw_ms = getattr(entry, "duration_ms", 0)
+        try:
+            ms = int(raw_ms)
+        except (TypeError, ValueError, OverflowError):
+            # None, non-numeric text, NaN or infinity: keep the stage, zero the time.
+            ms = 0
+        stages.append({"stage": str(stage), "ms": ms})
+    return stages
+# -------------------- Main --------------------
+def main() -> None:
+    """
+    Run every prompt in DATASET through the pipeline and write the results.
+
+    Steps: build a pipeline for DB_ID (falling back to the router's default
+    pipeline if that fails), derive an optional schema preview, time each
+    pipeline.run() call, then write three files into RESULT_DIR:
+    spider_eval.jsonl (one record per query), metrics_summary.json
+    (aggregates) and results.csv (query / ok / latency_ms).
+
+    A query counts as successful only when pipeline.run() returns without
+    raising and the result's ``ok`` attribute (default True) is truthy.
+    """
+    RESULT_DIR.mkdir(parents=True, exist_ok=True)
+    # Build pipeline from router factories (no new DI required)
+    try:
+        adapter = _select_adapter(DB_ID)  # e.g., "sqlite" / "postgres"
+        pipeline = _build_pipeline(adapter)
+        using_default = False
+    except Exception as exc:
+        # Keep the benchmark runnable, but say why the fallback happened so a
+        # misconfigured DB_MODE is not mistaken for a real adapter run.
+        print(f"⚠️ Could not build pipeline for db_id={DB_ID}: {exc!s}")
+        pipeline = DEFAULT_PIPELINE
+        using_default = True
+    print(
+        f"✅ Pipeline ready "
+        f"(db_id={DB_ID}, source={'default' if using_default else 'custom adapter'})"
+    )
+    # Optional schema preview
+    schema_preview = _derive_schema_preview_safe(pipeline)
+    if schema_preview:
+        print("📄 Derived schema preview ✓")
+    else:
+        print("ℹ️ No schema preview (adapter does not expose it or not needed)")
+    # Evaluate
+    records: List[Dict[str, Any]] = []
+    for q in DATASET:
+        print(f"\n🧠 Query: {q}")
+        t0 = time.perf_counter()
+        try:
+            result = pipeline.run(
+                user_query=q,
+                schema_preview=schema_preview or "",  # <- force str
+            )
+            latency_ms = _int_ms(t0)
+            # ok flag -> coerce to bool for mypy and consistency
+            ok_flag = bool(getattr(result, "ok", True))
+            stages = _to_stage_list(getattr(result, "trace", None))
+            error_detail: Optional[str] = None
+            if not ok_flag:
+                # NOTE(review): the result type is not visible here; "error" and
+                # "message" are guesses - confirm against the pipeline's result class.
+                raw_error = getattr(result, "error", None) or getattr(result, "message", None)
+                error_detail = str(raw_error) if raw_error else "pipeline returned ok=False"
+            rec: Dict[str, Any] = {
                 "query": q,
+                "ok": ok_flag,
+                "latency_ms": latency_ms,
                 "trace": stages,
+                "error": error_detail,
             }
+            records.append(rec)
+            # Report what the pipeline said, not merely that run() returned.
+            if ok_flag:
+                print(f"✅ Success ({latency_ms} ms)")
+            else:
+                print(f"❌ Failed: {error_detail} ({latency_ms} ms)")
+        except Exception as exc:
+            latency_ms = _int_ms(t0)
+            rec = {
                 "query": q,
                 "ok": False,
+                "latency_ms": latency_ms,
                 "trace": [],
+                "error": str(exc),
             }
+            records.append(rec)
+            print(f"❌ Failed: {exc!s} ({latency_ms} ms)")
+    # Aggregate metrics (both are 0.0 when DATASET is empty)
+    total = len(records)
+    avg_latency = (
+        round(sum(r["latency_ms"] for r in records) / total, 1) if total else 0.0
+    )
+    success_rate = (sum(1 for r in records if r["ok"]) / total) if total else 0.0
+    summary: Dict[str, Any] = {
+        "queries_total": total,
+        "success_rate": success_rate,
+        "avg_latency_ms": avg_latency,
+        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+        "db_id": DB_ID,
+        "pipeline_source": "default" if using_default else "adapter",
+    }
+    # Persist outputs
+    jsonl_path = RESULT_DIR / "spider_eval.jsonl"
+    with jsonl_path.open("w", encoding="utf-8") as f:
+        for r in records:
+            json.dump(r, f, ensure_ascii=False)
+            f.write("\n")
+    summary_path = RESULT_DIR / "metrics_summary.json"
+    with summary_path.open("w", encoding="utf-8") as f:
+        json.dump(summary, f, indent=2)
+    csv_path = RESULT_DIR / "results.csv"
+    with csv_path.open("w", newline="", encoding="utf-8") as f:
+        writer = csv.DictWriter(f, fieldnames=["query", "ok", "latency_ms"])
+        writer.writeheader()
+        for r in records:
+            writer.writerow(
+                {
+                    "query": r["query"],
+                    "ok": "✅" if r["ok"] else "❌",
+                    "latency_ms": r["latency_ms"],
+                }
+            )
+    print(
+        "\n💾 Saved outputs:\n"
+        f"- {jsonl_path}\n- {summary_path}\n- {csv_path}\n"
+        f"📊 Avg latency: {avg_latency} ms | Success rate: {success_rate:.0%}"
+    )
+# Run the benchmark only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()