# Listing header captured together with the file (not part of the module):
# Spaces status "Sleeping"; file size: 4,731 bytes; id d347376.
# The run of numbers 1-136 that followed was the listing's line-number gutter.
import csv
import json
import types
import sys
from pathlib import Path
import pytest
@pytest.fixture()
def temp_cwd(tmp_path, monkeypatch):
    """Switch the working directory to the per-test temp folder.

    Files written through relative paths therefore land inside ``tmp_path``
    instead of leaking into the checkout. The temp folder is returned.
    """
    sandbox = tmp_path
    monkeypatch.chdir(sandbox)
    return sandbox
def _install_fake_router_module(monkeypatch):
    """
    Install a fake 'app.routers.nl2sql' module chain into sys.modules.

    Call this BEFORE importing evaluate_spider, so its top-level imports
    resolve against the fakes built here.

    The entries are registered with ``monkeypatch.setitem`` rather than by
    assigning to ``sys.modules`` directly, so the previous ``sys.modules``
    state is restored when the patch is undone and the fakes do not leak
    into unrelated tests.

    Args:
        monkeypatch: pytest's ``monkeypatch`` fixture (any object exposing
            ``setitem(mapping, key, value)`` works).
    """

    class _FakeExec:
        """Executor stand-in that exposes a fixed schema preview."""

        def derive_schema_preview(self):
            return "TABLE users(id INT);"

    class _FakeResult:
        """Pipeline result stand-in carrying ``ok`` and a three-stage trace."""

        def __init__(self, ok=True):
            self.ok = ok
            # mix dicts/objects to exercise _to_stage_list normalization
            self.trace = [
                {"stage": "planner", "duration_ms": 11},
                types.SimpleNamespace(stage="generator", duration_ms=23),
                {"stage": "safety", "duration_ms": 5},
            ]

    class _FakePipeline:
        """Pipeline stand-in whose ``run`` always reports success."""

        def __init__(self):
            self.executor = _FakeExec()

        def run(self, *, user_query: str, schema_preview: str = ""):
            return _FakeResult(ok=True)

    # Create package hierarchy: app, app.routers, app.routers.nl2sql
    app_mod = types.ModuleType("app")
    routers_mod = types.ModuleType("app.routers")
    nl2sql_mod = types.ModuleType("app.routers.nl2sql")

    # exported symbols used by evaluate_spider
    nl2sql_mod._pipeline = _FakePipeline()
    nl2sql_mod._build_pipeline = lambda adapter: _FakePipeline()
    nl2sql_mod._select_adapter = lambda dbid: object()

    # Link children onto their parents so attribute-style access
    # (``app.routers.nl2sql``) works as well as ``from ... import``.
    app_mod.routers = routers_mod
    routers_mod.nl2sql = nl2sql_mod

    # register in sys.modules (package chain), parents first; going through
    # monkeypatch makes the registration reversible.
    for fake_module in (app_mod, routers_mod, nl2sql_mod):
        monkeypatch.setitem(sys.modules, fake_module.__name__, fake_module)
def test_evaluate_spider_writes_outputs(temp_cwd, monkeypatch):
    """Run ``main()`` on a two-query dataset and check the three output files."""
    # The fake router module must be registered before the import below,
    # otherwise the module under test cannot resolve its top-level imports.
    _install_fake_router_module(monkeypatch)
    import benchmarks.evaluate_spider as mod

    # Shrink the dataset for speed and point every output location at a
    # relative path, which resolves under the temp working directory.
    results_root = Path("benchmarks") / "results"
    target_dir = results_root / "test-run"
    for attr_name, replacement in (
        ("DATASET", ["q1", "q2"]),
        ("RESULT_ROOT", results_root),
        ("RESULT_DIR", target_dir),
    ):
        monkeypatch.setattr(mod, attr_name, replacement, raising=True)

    mod.main()

    # All three artefacts have to exist.
    jsonl_file = target_dir / "spider_eval.jsonl"
    summary_file = target_dir / "metrics_summary.json"
    csv_file = target_dir / "results.csv"
    for artefact, complaint in (
        (jsonl_file, "jsonl not written"),
        (summary_file, "summary not written"),
        (csv_file, "csv not written"),
    ):
        assert artefact.exists(), complaint

    # JSONL: one record per query; the first record has the expected keys,
    # value types, and a trace of {"stage", "ms"} entries.
    records = jsonl_file.read_text(encoding="utf-8").strip().splitlines()
    assert len(records) == 2
    first = json.loads(records[0])
    assert {"query", "ok", "latency_ms", "trace", "error"} <= set(first.keys())
    assert isinstance(first["ok"], bool)
    assert isinstance(first["latency_ms"], int)
    stages = first["trace"]
    assert isinstance(stages, list)
    assert not any("stage" not in entry or "ms" not in entry for entry in stages)

    # Summary JSON: totals, bounded success rate, numeric latency, known source.
    report = json.loads(summary_file.read_text(encoding="utf-8"))
    assert report["queries_total"] == 2
    success_rate = report["success_rate"]
    assert 0.0 <= success_rate <= 1.0
    assert isinstance(report["avg_latency_ms"], (int, float))
    assert report["pipeline_source"] in {"default", "adapter"}  # per code path

    # CSV: two data rows with exactly the three expected columns.
    with csv_file.open(newline="", encoding="utf-8") as handle:
        table = list(csv.DictReader(handle))
    assert len(table) == 2
    head = table[0]
    assert set(head.keys()) == {"query", "ok", "latency_ms"}
    assert head["ok"] in {"✅", "❌"}
    assert int(head["latency_ms"]) >= 0
def test_to_stage_list_normalizes_mixed_items(temp_cwd, monkeypatch):
    """``_to_stage_list`` turns dict and attribute-style items into stage/ms dicts."""
    _install_fake_router_module(monkeypatch)
    import benchmarks.evaluate_spider as mod

    planner = {"stage": "planner", "duration_ms": 10}
    generator = types.SimpleNamespace(stage="generator", duration_ms=20)
    # String duration: the expectation below is the integer 7.
    safety = {"stage": "safety", "duration_ms": "7"}

    expected = [
        {"stage": stage_name, "ms": millis}
        for stage_name, millis in (("planner", 10), ("generator", 20), ("safety", 7))
    ]
    normalized = mod._to_stage_list([planner, generator, safety])
    assert normalized == expected
def test_int_ms_returns_int(temp_cwd, monkeypatch):
    """``_int_ms`` must hand back an ``int``; only the type is asserted."""
    _install_fake_router_module(monkeypatch)
    import benchmarks.evaluate_spider as mod

    # A fixed synthetic start value is enough: the magnitude is not checked.
    elapsed = mod._int_ms(0.0)
    assert isinstance(elapsed, int)
|