Melika Kheirieh committed on
Commit
d347376
·
1 Parent(s): ebc7457

test(benchmarks): add black-box tests for evaluate_spider outputs and trace normalization

Browse files
Files changed (1) hide show
  1. tests/test_evaluate_spider.py +135 -0
tests/test_evaluate_spider.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ import types
4
+ import sys
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+
10
@pytest.fixture()
def temp_cwd(tmp_path, monkeypatch):
    """Run each test from a throwaway directory so generated artifacts stay sandboxed.

    Returns the temporary directory as a ``Path`` for convenience.
    """
    monkeypatch.chdir(tmp_path)  # undone automatically when the test ends
    return tmp_path
15
+
16
+
17
def _install_fake_router_module(monkeypatch):
    """
    Register a fake ``app.routers.nl2sql`` module (plus its parent packages)
    in ``sys.modules`` BEFORE importing ``evaluate_spider``, so its top-level
    imports resolve without the real application installed.

    Fixes over the previous version:
    - the ``monkeypatch`` argument was accepted but never used; the function
      wrote straight into ``sys.modules`` and leaked the fake modules into
      every subsequent test. ``monkeypatch.setitem`` makes pytest restore
      ``sys.modules`` automatically.
    - parent packages are now attribute-linked to their children, so
      ``from app.routers import nl2sql`` works as well as
      ``import app.routers.nl2sql``.
    """
    # Create package hierarchy: app, app.routers, app.routers.nl2sql
    app_mod = types.ModuleType("app")
    routers_mod = types.ModuleType("app.routers")
    nl2sql_mod = types.ModuleType("app.routers.nl2sql")

    class _FakeExec:
        def derive_schema_preview(self):
            return "TABLE users(id INT);"

    class _FakeResult:
        def __init__(self, ok=True):
            self.ok = ok
            # mix dicts/objects to exercise _to_stage_list normalization
            self.trace = [
                {"stage": "planner", "duration_ms": 11},
                types.SimpleNamespace(stage="generator", duration_ms=23),
                {"stage": "safety", "duration_ms": 5},
            ]

    class _FakePipeline:
        def __init__(self):
            self.executor = _FakeExec()

        def run(self, *, user_query: str, schema_preview: str = ""):
            return _FakeResult(ok=True)

    # exported symbols used by evaluate_spider
    nl2sql_mod._pipeline = _FakePipeline()
    nl2sql_mod._build_pipeline = lambda adapter: _FakePipeline()
    nl2sql_mod._select_adapter = lambda dbid: object()

    # Link parents to children so attribute access (and
    # ``from app.routers import nl2sql``) works, not just sys.modules lookup.
    app_mod.routers = routers_mod
    routers_mod.nl2sql = nl2sql_mod

    # register in sys.modules (package chain) — via monkeypatch so the
    # entries are removed again when the requesting test finishes
    monkeypatch.setitem(sys.modules, "app", app_mod)
    monkeypatch.setitem(sys.modules, "app.routers", routers_mod)
    monkeypatch.setitem(sys.modules, "app.routers.nl2sql", nl2sql_mod)
57
+
58
+
59
def test_evaluate_spider_writes_outputs(temp_cwd, monkeypatch):
    """End-to-end: main() must emit the jsonl, summary, and csv artifacts."""
    # The fake router has to be in sys.modules before the module under test
    # is imported, or its top-level imports fail.
    _install_fake_router_module(monkeypatch)

    import benchmarks.evaluate_spider as mod

    # Keep the run tiny for speed, and point every artifact at the sandbox.
    monkeypatch.setattr(mod, "DATASET", ["q1", "q2"], raising=True)
    results_root = Path("benchmarks") / "results"
    monkeypatch.setattr(mod, "RESULT_ROOT", results_root, raising=True)
    # Pin RESULT_DIR under the new root (keeps the module's naming scheme).
    results_dir = results_root / "test-run"
    monkeypatch.setattr(mod, "RESULT_DIR", results_dir, raising=True)

    mod.main()

    jsonl_file = results_dir / "spider_eval.jsonl"
    summary_file = results_dir / "metrics_summary.json"
    csv_file = results_dir / "results.csv"

    assert jsonl_file.exists(), "jsonl not written"
    assert summary_file.exists(), "summary not written"
    assert csv_file.exists(), "csv not written"

    # JSONL: one record per query, required keys, normalized trace entries.
    payload = jsonl_file.read_text(encoding="utf-8").strip().splitlines()
    assert len(payload) == 2
    first = json.loads(payload[0])
    assert {"query", "ok", "latency_ms", "trace", "error"} <= set(first.keys())
    assert isinstance(first["ok"], bool)
    assert isinstance(first["latency_ms"], int)
    assert isinstance(first["trace"], list)
    for entry in first["trace"]:
        assert "stage" in entry and "ms" in entry

    # Summary: totals, a sane rate, numeric latency, known pipeline source.
    summary = json.loads(summary_file.read_text(encoding="utf-8"))
    assert summary["queries_total"] == 2
    assert 0.0 <= summary["success_rate"] <= 1.0
    assert isinstance(summary["avg_latency_ms"], (int, float))
    assert summary["pipeline_source"] in {"default", "adapter"}  # per code path

    # CSV: same row count, exact header, emoji status, non-negative latency.
    with csv_file.open(newline="", encoding="utf-8") as handle:
        rows = list(csv.DictReader(handle))
    assert len(rows) == 2
    assert set(rows[0].keys()) == {"query", "ok", "latency_ms"}
    assert rows[0]["ok"] in {"✅", "❌"}
    assert int(rows[0]["latency_ms"]) >= 0
110
+
111
+
112
def test_to_stage_list_normalizes_mixed_items(temp_cwd, monkeypatch):
    """_to_stage_list must flatten dicts and objects into {stage, ms} dicts."""
    _install_fake_router_module(monkeypatch)
    import benchmarks.evaluate_spider as mod

    # Mix of dicts, an attribute-style object, and a stringified duration.
    heterogeneous = [
        {"stage": "planner", "duration_ms": 10},
        types.SimpleNamespace(stage="generator", duration_ms=20),
        {"stage": "safety", "duration_ms": "7"},
    ]
    normalized = mod._to_stage_list(heterogeneous)
    expected = [
        {"stage": "planner", "ms": 10},
        {"stage": "generator", "ms": 20},
        {"stage": "safety", "ms": 7},
    ]
    assert normalized == expected
127
+
128
+
129
def test_int_ms_returns_int(temp_cwd, monkeypatch):
    """_int_ms must yield an int regardless of the start timestamp's value."""
    _install_fake_router_module(monkeypatch)
    import benchmarks.evaluate_spider as mod

    # A synthetic start time suffices: we assert the type, not the magnitude.
    start = 0.0
    assert isinstance(mod._int_ms(start), int)