Spaces:

melikakheirieh
/

nl2sql-copilot

Sleeping

App Files Files Community

Melika Kheirieh committed on Nov 3

Commit

598536c

1 Parent(s): b0bec17

feat(benchmarks): align Spider eval with config-driven Pipeline and native Safety; log per-stage trace; add CSV summary

Browse files

Files changed (2) hide show

.coverage +0 -0
benchmarks/evaluate_spider.py +115 -442

.coverage CHANGED Viewed

Binary files a/.coverage and b/.coverage differ

benchmarks/evaluate_spider.py CHANGED Viewed

@@ -1,452 +1,125 @@
-from __future__ import annotations
 import json
-import subprocess
 import time
 from pathlib import Path
-from typing import Any, Iterable, Optional, Tuple, cast
-from tqdm import tqdm
-from langchain_community.utilities import SQLDatabase
-from sqlglot import parse_one, exp
-from sqlglot.errors import ParseError
-from sqlalchemy import create_engine, inspect
-from spider_loader import load_spider_sqlite
-def _try_import_pipeline():
-    """
-    Try multiple plausible entrypoints from nl2sql.
-    Returns a tuple of callables or None:
-      (make_pipeline | None, run_function | None, PipelineClass | None)
-    """
-    make_pipeline = None
-    run_fn = None
-    PipelineCls = None
-    try:
-        from nl2sql.pipeline import make_pipeline as _mk  # type: ignore
-        make_pipeline = _mk
-    except Exception:
-        pass
-    try:
-        from nl2sql.pipeline import run_nl2sql as _run  # type: ignore
-        run_fn = _run
-    except Exception:
-        pass
-    try:
-        from nl2sql.pipeline import Pipeline as _P  # type: ignore
-        PipelineCls = _P
-    except Exception:
-        pass
-    return make_pipeline, run_fn, PipelineCls
-LOG_DIR = Path("logs/spider_eval")
-LOG_DIR.mkdir(parents=True, exist_ok=True)
-FORBIDDEN_NODES: Tuple[type, ...] = (
-    exp.Insert,
-    exp.Delete,
-    exp.Update,
-    exp.Drop,
-    exp.Alter,
-    exp.Attach,
-    exp.Pragma,
-    exp.Create,
-)
-def normalize_sql(sql: str) -> str:
-    return " ".join(sql.lower().strip().split())
-def compare_results(
-    pred_rows: Optional[Iterable[Any]], gold_rows: Optional[Iterable[Any]]
-) -> bool:
-    if pred_rows is None or gold_rows is None:
-        return False
-    return set(pred_rows) == set(gold_rows)
-def try_execute_sql(
-    sql_db: SQLDatabase,
-    sql: str,
-    timeout: Optional[float] = None,  # kept for API compatibility
-) -> tuple[Optional[list[tuple[Any, ...]]], float, Optional[str]]:
-    start = time.time()
     try:
-        raw_rows = sql_db.run(sql)
-        # Normalize result shape for MyPy and downstream code
-        if isinstance(raw_rows, list):
-            rows = [tuple(r) for r in raw_rows]
-        elif isinstance(raw_rows, tuple):
-            rows = [tuple(raw_rows)]
-        else:
-            # Fallback cast — if library returns ResultSet or something similar
-            rows = cast(list[tuple[Any, ...]], raw_rows)
-        return rows, time.time() - start, None
-    except Exception as e:
-        return None, time.time() - start, str(e)
-def exact_match_structural(sql_pred: str, sql_gold: str) -> bool:
-    try:
-        ast_pred = parse_one(sql_pred)
-        ast_gold = parse_one(sql_gold)
-    except Exception:
-        return False
-    def normalize_ast(node: exp.Expression) -> exp.Expression:
-        for name, arg in node.args.items():
-            if isinstance(arg, list):
-                arg.sort(key=lambda x: str(x))
-                for child in arg:
-                    normalize_ast(child)
-            elif isinstance(arg, exp.Expression):
-                normalize_ast(arg)
-        if isinstance(node, exp.Alias):
-            return normalize_ast(node.this)
-        return node
-    norm_prd = normalize_ast(ast_pred)
-    norm_gold = normalize_ast(ast_gold)
-    return norm_prd == norm_gold
-def get_git_commit_hash() -> str:
-    try:
-        out = (
-            subprocess.check_output(["git", "rev-parse", "HEAD"])
-            .strip()
-            .decode("ascii")
-        )
-        return out
-    except Exception:
-        return "UNKNOWN"
-def is_safe_sql(sql: str, dialect: Optional[str] = None) -> bool:
-    try:
-        ast = parse_one(sql, read=dialect)
-    except ParseError:
-        return False
-    if not isinstance(ast, exp.Select):
-        return False
-    for node in ast.walk():
-        if isinstance(node, FORBIDDEN_NODES):
-            return False
-    return True
-# --- Replacement for get_schema_preview from app.routers ---
-def get_schema_preview_sqlalchemy(db_path: str, max_cols: int = 0) -> str:
-    """
-    Lightweight schema preview using SQLAlchemy inspector.
-    max_cols=0 => unlimited
-    """
-    engine = create_engine(f"sqlite:///{db_path}")
-    insp = inspect(engine)
-    lines: list[str] = []
-    for tbl in sorted(insp.get_table_names()):
-        cols = insp.get_columns(tbl)
-        if max_cols > 0:
-            cols = cols[:max_cols]
-        col_str = ", ".join(f"{c['name']}:{c.get('type')}" for c in cols)
-        pks = insp.get_pk_constraint(tbl).get("constrained_columns") or []
-        pk_str = f" | PK: {', '.join(pks)}" if pks else ""
-        fks = insp.get_foreign_keys(tbl)
-        fk_str = ""
-        if fks:
-            fks_desc = []
-            for fk in fks:
-                ref = fk.get("referred_table")
-                cols_fk = ", ".join(fk.get("constrained_columns") or [])
-                ref_cols = ", ".join(fk.get("referred_columns") or [])
-                fks_desc.append(f"{cols_fk} -> {ref}({ref_cols})")
-            fk_str = " | FK: " + " ; ".join(fks_desc)
-        lines.append(f"{tbl}({col_str}){pk_str}{fk_str}")
-    engine.dispose()
-    return "\n".join(lines)
-def _generate_sql(
-    question: str, sql_db: SQLDatabase, schema_text: str, max_output_tokens: int = 1000
-) -> tuple[str, str, dict[str, Any]]:
-    """
-    Returns: (status_msg, sql_text, extra_output)
-    Strategy:
-      1) If nl2sql.pipeline.run_nl2sql exists: call it.
-      2) Else if nl2sql.pipeline.make_pipeline exists: build and run.
-      3) Else if nl2sql.pipeline.Pipeline exists: instantiate minimal pipeline and run.
-      4) Else: raise NotImplementedError.
-    """
-    make_pipeline, run_fn, PipelineCls = _try_import_pipeline()
-    # Case 1: direct run function
-    if run_fn is not None:
-        res = run_fn(
-            question=question,
-            schema_text=schema_text,
-            sql_db=sql_db,
-            max_output_tokens=max_output_tokens,
         )
-        # Expecting a dict-like or object with attributes; normalize:
-        if isinstance(res, dict):
-            msg = res.get("status", "ok")
-            sql = res.get("sql", "")
-            return msg, sql, res
-        # fallback generic
-        msg = getattr(res, "status", "ok")
-        sql = getattr(res, "sql", "")
-        return msg, sql, {"result": res}
-    # Case 2: factory + run
-    if make_pipeline is not None:
-        pipe = make_pipeline(sql_db=sql_db, schema_text=schema_text)  # type: ignore[arg-type]
-        # Common conventions:
-        if hasattr(pipe, "run"):
-            out = pipe.run(question)  # type: ignore[call-arg]
-        elif hasattr(pipe, "execute"):
-            out = pipe.execute(question)  # type: ignore[call-arg]
-        else:
-            raise RuntimeError("Pipeline object has no run/execute()")
-        msg = getattr(out, "status", "ok")
-        sql = getattr(out, "sql", "")
-        return msg, sql, {"result": out}
-    # Case 3: class-based pipeline
-    if PipelineCls is not None:
-        # Try minimal constructor names; adjust to your class signature if needed
-        # We pass what we have; extra kwargs should be ignored or have defaults.
-        pipe = PipelineCls(sql_db=sql_db, schema_text=schema_text)
-        if hasattr(pipe, "run"):
-            out = pipe.run(question)  # type: ignore[call-arg]
-        else:
-            raise RuntimeError("Pipeline class has no run()")
-        msg = getattr(out, "status", "ok")
-        sql = getattr(out, "sql", "")
-        return msg, sql, {"result": out}
-    raise NotImplementedError(
-        "Cannot locate a public NL2SQL entrypoint in nl2sql.pipeline. "
-        "Expose one of: run_nl2sql(), make_pipeline(), or Pipeline.run()."
-    )
-def run_eval(
-    split: str = "dev", limit: int = 100, resume: bool = True, sleep_time: float = 0.01
-) -> None:
-    data = load_spider_sqlite(split)
-    if len(data) < limit:
-        limit = len(data)
-    data = data[:limit]
-    print(f"Running eval on {len(data)} examples in split={split}...")
-    commit_hash = get_git_commit_hash()
-    start_ts = int(time.time())
-    pred_txt = LOG_DIR / f"{split}_pred_{start_ts}.txt"
-    gold_txt = LOG_DIR / f"{split}_gold_{start_ts}.txt"
-    results_fn = LOG_DIR / f"{split}_results_{start_ts}.jsonl"
-    metrics_fn = LOG_DIR / f"{split}_metrics_{start_ts}.json"
-    done: set[tuple[str, str]] = set()
-    if resume and results_fn.exists():
-        with results_fn.open("r", encoding="utf-8") as f:
-            for line in f:
-                if line.startswith("#"):
-                    continue
-                try:
-                    r = json.loads(line)
-                    done.add((r.get("db_id"), r.get("question")))
-                except Exception:
-                    pass
-    write_header = not results_fn.exists()
-    agg: list[dict[str, Any]] = []
-    with (
-        results_fn.open("a", encoding="utf-8") as fout,
-        pred_txt.open("a", encoding="utf-8") as fpred,
-        gold_txt.open("a", encoding="utf-8") as fgold,
-    ):
-        if write_header:
-            header = {
-                "commit_hash": commit_hash,
-                "split": split,
-                "limit": limit,
-                "start_time": start_ts,
             }
-            fout.write("# " + json.dumps(header, ensure_ascii=False) + "\n")
-            fout.flush()
-        for ex in tqdm(data):
-            key = (ex.db_id, ex.question)
-            if resume and key in done:
-                continue
-            db_path = str(ex.db_path)
-            schema = get_schema_preview_sqlalchemy(db_path, max_cols=0)
-            sql_db = SQLDatabase.from_uri(f"sqlite:///{db_path}")
-            t0 = time.time()
-            try:
-                msg, sql, output = _generate_sql(
-                    ex.question, sql_db, schema, max_output_tokens=1000
-                )
-            except NotImplementedError as e:
-                rec = {
-                    "db_id": ex.db_id,
-                    "question": ex.question,
-                    "gold_sql": ex.gold_sql,
-                    "pred_sql": "",
-                    "status": "no_entrypoint",
-                    "output": {"error": str(e)},
-                    "gen_time": time.time() - t0,
-                    "exec_time": None,
-                    "error": "no_entrypoint",
-                    "gold_error": None,
-                    "pred_rows": None,
-                    "gold_rows": None,
-                    "exact_match": False,
-                    "exact_match_structural": False,
-                    "execution_accuracy": False,
-                    "safe_check_failed": True,
-                }
-                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
-                fout.flush()
-                fgold.write(f"{ex.gold_sql}\t{ex.db_id}\n")
-                fgold.flush()
-                agg.append(rec)
-                if sleep_time > 0:
-                    time.sleep(sleep_time)
-                continue
-            gen_time = time.time() - t0
-            safe_flag = is_safe_sql(sql)
-            if not safe_flag:
-                rec = {
-                    "db_id": ex.db_id,
-                    "question": ex.question,
-                    "gold_sql": ex.gold_sql,
-                    "pred_sql": sql,
-                    "status": "rejected_safe_check",
-                    "output": output,
-                    "gen_time": gen_time,
-                    "exec_time": None,
-                    "error": "unsafe_sql",
-                    "gold_error": None,
-                    "pred_rows": None,
-                    "gold_rows": None,
-                    "exact_match": False,
-                    "exact_match_structural": False,
-                    "execution_accuracy": False,
-                    "safe_check_failed": True,
-                }
-                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
-                fout.flush()
-                fpred.write(f"{sql}\t{ex.db_id}\n")
-                fgold.write(f"{ex.gold_sql}\t{ex.db_id}\n")
-                fpred.flush()
-                fgold.flush()
-                agg.append(rec)
-                if sleep_time > 0:
-                    time.sleep(sleep_time)
-                continue
-            pred_rows, exec_time, error = try_execute_sql(sql_db, sql)
-            gold_rows, gold_time, gold_error = try_execute_sql(sql_db, ex.gold_sql)
-            skip = gold_error is not None
-            em = normalize_sql(sql) == normalize_sql(ex.gold_sql) if not skip else False
-            em_struct = exact_match_structural(sql, ex.gold_sql) if not skip else False
-            exec_acc = compare_results(pred_rows, gold_rows) if not skip else False
-            rec = {
-                "db_id": ex.db_id,
-                "question": ex.question,
-                "gold_sql": ex.gold_sql,
-                "pred_sql": sql,
-                "status": msg,
-                "output": output,
-                "gen_time": gen_time,
-                "exec_time": exec_time,
-                "error": error,
-                "gold_error": gold_error,
-                "pred_rows": pred_rows,
-                "gold_rows": gold_rows,
-                "exact_match": em,
-                "exact_match_structural": em_struct,
-                "execution_accuracy": exec_acc,
-                "safe_check_failed": False,
             }
-            fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
-            fout.flush()
-            fpred.write(f"{sql}\t{ex.db_id}\n")
-            fgold.write(f"{ex.gold_sql}\t{ex.db_id}\n")
-            fpred.flush()
-            fgold.flush()
-            agg.append(rec)
-            if sleep_time > 0:
-                time.sleep(sleep_time)
-    valid = [
-        r
-        for r in agg
-        if (not r.get("safe_check_failed", False)) and (r.get("gold_error") is None)
-    ]
-    total_valid = len(valid)
-    total_all = len(agg)
-    if total_valid == 0:
-        print("No valid examples to compute metrics")
-        return
-    em_count = sum(1 for r in valid if r["exact_match"])
-    em_struct_count = sum(1 for r in valid if r["exact_match_structural"])
-    exec_acc_count = sum(1 for r in valid if r["execution_accuracy"])
-    error_count = sum(
-        1
-        for r in agg
-        if (r.get("error") is not None) and (not r.get("safe_check_failed", False))
-    )
-    safe_fail_count = sum(1 for r in agg if r.get("safe_check_failed", False))
-    avg_gen_time = sum(float(r["gen_time"]) for r in valid) / total_valid
-    avg_exec_time = sum(float(r["exec_time"]) for r in valid) / total_valid
-    metrics = {
-        "commit_hash": commit_hash,
-        "split": split,
-        "limit": limit,
-        "total_examples": total_all,
-        "valid_examples": total_valid,
-        "exact_match_rate": em_count / total_valid,
-        "exact_match_structural_rate": em_struct_count / total_valid,
-        "execution_accuracy_rate": exec_acc_count / total_valid,
-        "error_rate": error_count / total_valid,
-        "safe_check_fail_rate": safe_fail_count / total_all,
-        "avg_gen_time": avg_gen_time,
-        "avg_exec_time": avg_exec_time,
-        "run_id": start_ts,
-    }
-    metrics_fn = LOG_DIR / f"{split}_metrics_{start_ts}.json"
-    with metrics_fn.open("w", encoding="utf-8") as fm:
-        json.dump(metrics, fm, ensure_ascii=False, indent=2)
-    print("Metrics:", metrics)
-    print(f"Wrote results → {results_fn}")
-    print(f"Wrote pred file → {pred_txt}")
-    print(f"Wrote gold file → {gold_txt}")
-    print(f"Wrote metrics → {metrics_fn}")
-if __name__ == "__main__":
-    run_eval("dev", limit=10, resume=True, sleep_time=0.05)

+"""
+Evaluate NL2SQL pipeline performance on Spider-like queries.
+Uses config-driven Pipeline, native Safety checks, and per-stage latency tracing.
+Outputs: JSONL (detailed logs), JSON (metrics summary), and CSV (for README).
+"""
 import json
+import csv
 import time
 from pathlib import Path
+from nl2sql.pipeline import Pipeline
+# ---------- Config ----------
+DATASET = [
+    "list all customers",
+    "show total invoices per country",
+    "top 3 albums by total sales",
+    "artists with more than 3 albums",
+    "number of employees per city",
+]
+CONFIG_PATH = "configs/pipeline.yaml"
+RESULT_DIR = Path("benchmarks/results")
+RESULT_DIR.mkdir(parents=True, exist_ok=True)
+# ---------- Initialize pipeline ----------
+pipeline = Pipeline.from_config(CONFIG_PATH)
+print(f"✅ Loaded pipeline from {CONFIG_PATH}")
+# Optional: schema preview if adapter supports it
+schema_preview = None
+try:
+    adapter = getattr(pipeline, "executor", None)
+    if adapter and hasattr(adapter, "derive_schema_preview"):
+        schema_preview = adapter.derive_schema_preview()
+        print("📄 Derived schema preview successfully.")
+except Exception as e:
+    print(f"⚠️ Could not derive schema preview: {e}")
+# ---------- Evaluation ----------
+records = []
+for q in DATASET:
+    print(f"\n🧠 Query: {q}")
+    start = time.perf_counter()
     try:
+        result = pipeline.run(user_query=q, schema_preview=schema_preview)
+        latency = int((time.perf_counter() - start) * 1000)
+        trace = getattr(result, "trace", None)
+        stages = []
+        if trace:
+            # trace might be a list of StageTrace objects or dicts
+            try:
+                for t in trace:
+                    stages.append(
+                        {"stage": t.get("stage", "?"), "ms": t.get("duration_ms", 0)}
+                        if isinstance(t, dict)
+                        else {
+                            "stage": getattr(t, "stage", "?"),
+                            "ms": getattr(t, "duration_ms", 0),
+                        }
+                    )
+            except Exception:
+                pass
+        records.append(
+            {
+                "query": q,
+                "ok": True,
+                "latency_ms": latency,
+                "trace": stages,
+                "error": None,
+            }
         )
+        print(f"✅ Success ({latency} ms)")
+    except Exception as e:
+        latency = int((time.perf_counter() - start) * 1000)
+        records.append(
+            {
+                "query": q,
+                "ok": False,
+                "latency_ms": latency,
+                "trace": [],
+                "error": str(e),
             }
+        )
+        print(f"❌ Failed: {e} ({latency} ms)")
+# ---------- Aggregate metrics ----------
+avg_latency = round(sum(r["latency_ms"] for r in records) / len(records), 1)
+success_rate = sum(1 for r in records if r["ok"]) / len(records)
+print(f"\n📊 Average latency: {avg_latency} ms | Success rate: {success_rate:.0%}")
+summary = {
+    "queries_total": len(records),
+    "success_rate": success_rate,
+    "avg_latency_ms": avg_latency,
+    "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
+}
+# ---------- Save outputs ----------
+jsonl_path = RESULT_DIR / "spider_eval.jsonl"
+with open(jsonl_path, "w", encoding="utf-8") as f:
+    for r in records:
+        json.dump(r, f, ensure_ascii=False)
+        f.write("\n")
+summary_path = RESULT_DIR / "metrics_summary.json"
+with open(summary_path, "w", encoding="utf-8") as f:
+    json.dump(summary, f, indent=2)
+csv_path = RESULT_DIR / "results.csv"
+with open(csv_path, "w", newline="", encoding="utf-8") as f:
+    writer = csv.DictWriter(f, fieldnames=["query", "ok", "latency_ms"])
+    writer.writeheader()
+    for r in records:
+        writer.writerow(
+            {
+                "query": r["query"],
+                "ok": "✅" if r["ok"] else "❌",
+                "latency_ms": r["latency_ms"],
             }
+        )
+print(f"\n💾 Saved logs to:\n- {jsonl_path}\n- {summary_path}\n- {csv_path}")