Melika Kheirieh committed · Commit 296a94d
Parent(s): b794494
feat(bench): gold-aware EM/SM/ExecAcc + p50/p95; write per-stage means; richer plots
Files changed:
- benchmarks/evaluate_spider_pro.py +276 -279
- benchmarks/plot_results.py +129 -89
- benchmarks/results_pro/20251109-095552/eval.jsonl +5 -0
- benchmarks/results_pro/20251109-095552/summary.json +12 -0
- benchmarks/results_pro/20251109-100021/eval.jsonl +5 -0
- benchmarks/results_pro/20251109-100021/results.csv +6 -0
- benchmarks/results_pro/20251109-100021/summary.json +21 -0
benchmarks/evaluate_spider_pro.py
CHANGED

@@ -1,7 +1,9 @@

"""
Spider benchmark evaluator (pro):
- Computes EM / SM / ExecAcc vs. gold SQL
- Records per-sample latency and (if present) per-stage timings from pipeline traces
- Persists eval.jsonl (per-sample), summary.json (aggregates incl. p50/p95, per-stage means), results.csv
- No external deps; percentile and normalization are implemented locally.
"""

from __future__ import annotations

@@ -20,423 +22,418 @@ from nl2sql.pipeline_factory import pipeline_from_config_with_adapter

from adapters.db.sqlite_adapter import SQLiteAdapter
from benchmarks.spider_loader import load_spider_sqlite

# -------------------------- Config --------------------------

RESULT_ROOT = Path("benchmarks/results_pro")
TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
RESULT_DIR = RESULT_ROOT / TIMESTAMP
STAGES = [
    "detector",
    "planner",
    "generator",
    "safety",
    "executor",
    "verifier",
    "repair",
]

# -------------------------- SQL utils -----------------------


def extract_clean_sql(text: str | None) -> str:
    """Extract a clean SQL string from LLM-ish output (may include fences/JSON)."""
    sql = (text or "").strip()

    # strip ```sql fences
    sql = re.sub(r"```(?:sql)?\s*", "", sql, flags=re.I)
    sql = sql.replace("```", "")

    # JSON-like {"sql": "..."}
    m = re.search(r'"sql"\s*:\s*"([^"]+)"', sql)
    if m:
        sql = m.group(1)

    # unescape
    sql = sql.replace('\\"', '"').replace("\\n", " ").replace("\\t", " ")

    # find first SQL-ish keyword
    m2 = re.search(r"\b(select|with|insert|update|delete)\b[\s\S]+", sql, re.I)
    if m2:
        sql = m2.group(0)

    sql = re.sub(r"\s+", " ", sql).strip().rstrip(";")
    return sql
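
# Example (illustrative, not part of the commit): a fenced, JSON-wrapped model
# reply reduces to bare SQL under the rules above:
#   extract_clean_sql('```sql\n{"sql": "SELECT * FROM singer;"}\n```')
#   -> 'SELECT * FROM singer'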


def normalize_sql(sql: str) -> str:
    """Light normalization to make EM stricter-but-fair."""
    if not sql:
        return ""
    s = sql.strip()
    # unify whitespace but keep literals recognizable
    s = re.sub(r"\s+", " ", s).strip()
    s = s.rstrip(";")

    # drop table prefixes a.b -> b
    s = re.sub(r"\b\w+\.(\w+)\b", r"\1", s)

    # collapse quotes around identifiers
    s = re.sub(r"`([A-Za-z_]\w*)`", r"\1", s)
    s = re.sub(r'"([A-Za-z_]\w*)"', r"\1", s)

    # COUNT(foo) -> COUNT(*), DISTINCT inside COUNT -> COUNT(*)
    s = re.sub(r"(?i)COUNT\s*\(\s*DISTINCT\s+[^)]+\)", "COUNT(*)", s)
    s = re.sub(r"(?i)COUNT\s*\(\s*[A-Za-z_]\w*\s*\)", "COUNT(*)", s)

    # strip trailing LIMIT n
    s = re.sub(r"(?i)\s+LIMIT\s+\d+\s*$", "", s)

    # canonical whitespace + upper keywords for stability
    s = re.sub(r"\s+", " ", s).strip()
    # keyword upper (a bit heuristic)
    for kw in [
        "select",
        "from",
        "where",
        "group by",
        "order by",
        "having",
        "limit",
        "join",
        "on",
        "and",
        "or",
        "asc",
        "desc",
    ]:
        s = re.sub(rf"(?i)\b{kw}\b", kw.upper(), s)
    return s
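
# Example (illustrative): alias prefixes, trailing LIMIT, COUNT arguments, and
# keyword case are all canonicalized, so superficially different strings can
# still match for EM:
#   normalize_sql("select T1.name from singer as T1 limit 5;")
#   -> 'SELECT name FROM singer as T1'
#   normalize_sql("SELECT count(name) FROM singer")
#   -> 'SELECT COUNT(*) FROM singer'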


# ---------------------- Schema extraction -------------------


def get_database_schema(db_path: Path) -> Dict[str, Any]:
    """Extract schema from SQLite database (tables, columns, FKs)."""
    schema: Dict[str, Any] = {"tables": {}}
    if not db_path.exists():
        return schema

    conn = sqlite3.connect(str(db_path))
    cur = conn.cursor()
    try:
        cur.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
        )
        for (table,) in cur.fetchall():
            cur.execute(f"PRAGMA table_info('{table}')")
            cols = [
                {"name": c[1], "type": c[2], "primary_key": bool(c[5])}
                for c in cur.fetchall()
            ]
            cur.execute(f"PRAGMA foreign_key_list('{table}')")
            fks = [
                {"column": fk[3], "referenced_table": fk[2], "referenced_column": fk[4]}
                for fk in cur.fetchall()
            ]
            schema["tables"][table] = {"columns": cols, "foreign_keys": fks}
    finally:
        conn.close()
    return schema


def format_schema_for_prompt(schema: Dict[str, Any]) -> str:
    """Plain-text schema for prompt (minimal but helpful)."""
    if not schema.get("tables"):
        return ""
    lines: List[str] = []
    for t, info in schema["tables"].items():
        cols = [
            f"{c['name']} {c['type']}{' PK' if c.get('primary_key') else ''}"
            for c in info.get("columns", [])
        ]
        lines.append(f"Table: {t}")
        lines.append(f"Columns: {', '.join(cols)}")
        fks = info.get("foreign_keys") or []
        if fks:
            lines.append(
                "FKs: "
                + ", ".join(
                    f"{fk['column']} -> {fk['referenced_table']}.{fk['referenced_column']}"
                    for fk in fks
                )
            )
        lines.append("")  # empty line between tables
    return "\n".join(lines).strip()


# ---------------------- Exec/eval metrics -------------------


def _exec_sql(db: Path, sql: str) -> Tuple[bool, List[Tuple]]:
    """Execute SQL and return success flag and results."""
    if not sql:
        return False, []
    try:
        conn = sqlite3.connect(str(db))
        cur = conn.cursor()
        cur.execute(sql)
        rows = cur.fetchall()
        conn.close()
        return True, rows
    except Exception:
        return False, []


def _same_rows(a: List[Tuple], b: List[Tuple]) -> bool:
    return set(a) == set(b) and len(a) == len(b)


def evaluate_sql(pred: str, gold: str, db: Path) -> Dict[str, float]:
    """Return {'em', 'sm', 'exec'}; em/sm in {0.0, 1.0}, exec in {0.0, 0.5, 1.0} (sm ~ set-match)."""
    em = 1.0 if normalize_sql(pred) == normalize_sql(gold) else 0.0

    gold_ok, gold_rows = _exec_sql(db, gold)
    pred_ok, pred_rows = _exec_sql(db, pred)

    sm = 0.0
    exec_acc = 0.0
    if gold_ok and pred_ok:
        if _same_rows(gold_rows, pred_rows):
            sm = 1.0
            exec_acc = 1.0
        else:
            exec_acc = 0.5  # partial credit for executing but mismatched rows
    return {"em": em, "sm": sm, "exec": exec_acc}


# ---------------------- Dataclass + runner ------------------


@dataclass
class SpiderSample:
    question: str
    db_id: str
    db_path: Path
    gold_sql: str


def _percentile(values: List[float], p: float) -> float:
    """Compute p-th percentile (0..100) without numpy."""
    if not values:
        return 0.0
    vals = sorted(values)
    k = (len(vals) - 1) * (p / 100.0)
    f = int(k)
    c = min(f + 1, len(vals) - 1)
    if f == c:
        return float(vals[int(k)])
    return float(vals[f] * (c - k) + vals[c] * (k - f))
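
# Example (illustrative): ranks interpolate linearly, so
#   _percentile([10, 20, 30, 40], 50.0) -> 25.0   (k = 1.5, between 20 and 30)
#   _percentile([10, 20, 30, 40], 95.0) -> 38.5   (k = 2.85, between 30 and 40)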


def _stage_ms_from_trace(trace_item: Dict[str, Any]) -> float:
    """Accepts {'stage': ..., 'ms': ...} OR {'stage': ..., 'duration_ms': ...}."""
    if not trace_item:
        return 0.0
    if "ms" in trace_item:
        try:
            return float(trace_item["ms"])
        except Exception:
            return 0.0
    if "duration_ms" in trace_item:
        try:
            return float(trace_item["duration_ms"])
        except Exception:
            return 0.0
    return 0.0


def _collect_stage_means(eval_rows: List[Dict[str, Any]]) -> Dict[str, float]:
    """Average per-stage ms across all records (0 if absent)."""
    totals = {s: 0.0 for s in STAGES}
    counts = {s: 0 for s in STAGES}
    for r in eval_rows:
        trace_list = r.get("trace") or r.get("traces") or []
        for t in trace_list:
            s = t.get("stage")
            if s in totals:
                ms = _stage_ms_from_trace(t)
                totals[s] += ms
                counts[s] += 1
    return {s: round(totals[s] / counts[s], 2) if counts[s] else 0.0 for s in STAGES}
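
# Example (illustrative): two trace items {"stage": "planner", "duration_ms": 8000}
# and {"stage": "planner", "duration_ms": 6000} average to {"planner": 7000.0, ...};
# stages never seen in any trace report 0.0.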


def run_pipeline_on_sample(
    pipeline: Any,
    sample: SpiderSample,
    schema_cache: Dict[str, str],
    debug: bool = False,
) -> Dict[str, Any]:
    """Run pipeline on one sample and extract normalized prediction + traces."""
    # cache schema
    if sample.db_id not in schema_cache:
        schema_dict = get_database_schema(sample.db_path)
        schema_cache[sample.db_id] = format_schema_for_prompt(schema_dict)
        if debug:
            print(
                f" [schema] Loaded {len(schema_cache[sample.db_id])} chars for {sample.db_id}"
            )

    schema = schema_cache[sample.db_id]

    try:
        res = pipeline.run(user_query=sample.question, schema_preview=schema)
        # extract SQL
        pred_sql = ""
        if hasattr(res, "sql") and res.sql:
            pred_sql = extract_clean_sql(res.sql)
        else:
            for attr in ("final_sql", "generated_sql", "answer"):
                if getattr(res, attr, None):
                    pred_sql = extract_clean_sql(str(getattr(res, attr)))
                    if pred_sql:
                        break
        return {
            "ok": bool(getattr(res, "ok", True)),
            "sql": pred_sql,
            "trace": getattr(res, "traces", []) or getattr(res, "trace", []),
            "error": None,
        }
    except Exception as e:
        if debug:
            import traceback

            traceback.print_exc()
        return {"ok": False, "sql": "", "trace": [], "error": str(e)}


# --------------------------- Main --------------------------


def main() -> None:
    ap = argparse.ArgumentParser(description="Evaluate NL2SQL on Spider (pro)")
    ap.add_argument("--spider", action="store_true", help="Use Spider dataset loader")
    ap.add_argument("--split", default="dev", choices=["dev", "train"])
    ap.add_argument("--limit", type=int, default=20)
    ap.add_argument("--debug", action="store_true")
    ap.add_argument("--config", default="configs/sqlite_pipeline.yaml")
    args = ap.parse_args()

    if not args.spider:
        print("Use --spider to run Spider evaluation.")
        return

    # load items
    print(f"Loading Spider {args.split} split...")
    items = load_spider_sqlite(split=args.split, limit=args.limit)
    if not items:
        print("❌ No samples loaded. Check SPIDER_ROOT.")
        return
    print(f"✅ Loaded {len(items)} samples")

    RESULT_DIR.mkdir(parents=True, exist_ok=True)
    schema_cache: Dict[str, str] = {}
    eval_rows: List[Dict[str, Any]] = []

    for i, it in enumerate(items, 1):
        sample = SpiderSample(
            question=it.question,
            db_id=it.db_id,
            db_path=Path(it.db_path),
            gold_sql=it.gold_sql,
        )
        print(f"\n🔧 [{i}/{len(items)}] [{sample.db_id}] {sample.question}")

        adapter = SQLiteAdapter(str(sample.db_path))
        pipeline = pipeline_from_config_with_adapter(args.config, adapter=adapter)

        t0 = time.perf_counter()
        out = run_pipeline_on_sample(pipeline, sample, schema_cache, args.debug)
        latency_ms = int((time.perf_counter() - t0) * 1000)

        metrics = evaluate_sql(out["sql"], sample.gold_sql, sample.db_path)
        row = {
            "source": "spider",
            "db_id": sample.db_id,
            "query": sample.question,
            "gold_sql": sample.gold_sql,
            "pred_sql": out["sql"],
            "ok": out["ok"],
            "latency_ms": latency_ms,
            "em": metrics["em"],
            "sm": metrics["sm"],
            "exec_acc": metrics["exec"],
            "error": out.get("error"),
            "trace": out.get("trace", []),
        }
        eval_rows.append(row)

        if args.debug:
            status = "✅" if row["ok"] and row["em"] == 1.0 else "⚠️"
            print(
                f"{status} ({latency_ms} ms) | EM={row['em']} SM={row['sm']} ExecAcc={row['exec_acc']}"
            )
            if row["em"] < 1.0:
                print(f" gold: {sample.gold_sql}")
                print(f" pred: {out['sql'] or 'EMPTY'}")

    # persist eval.jsonl
    RESULT_ROOT.mkdir(parents=True, exist_ok=True)
    RESULT_DIR.mkdir(parents=True, exist_ok=True)
    with (RESULT_DIR / "eval.jsonl").open("w", encoding="utf-8") as f:
        for r in eval_rows:
            json.dump(r, f, ensure_ascii=False)
            f.write("\n")

    # aggregates
    total = len(eval_rows)
    success = sum(1 for r in eval_rows if r["ok"])
    avg_em = sum(r["em"] for r in eval_rows) / total if total else 0.0
    avg_sm = sum(r["sm"] for r in eval_rows) / total if total else 0.0
    avg_exec = sum(r["exec_acc"] for r in eval_rows) / total if total else 0.0
    avg_lat = sum(r["latency_ms"] for r in eval_rows) / total if total else 0.0
    p50 = _percentile([r["latency_ms"] for r in eval_rows], 50.0)
    p95 = _percentile([r["latency_ms"] for r in eval_rows], 95.0)

    stage_means = _collect_stage_means(eval_rows)

    summary = {
        "timestamp": datetime.now().isoformat(timespec="seconds"),
        "split": args.split,
        "config": args.config,
        "total": total,
        "success": success,
        "success_rate": round(success / total, 3) if total else 0.0,
        "avg_latency_ms": round(avg_lat, 1),
        "p50_latency_ms": round(p50, 1),
        "p95_latency_ms": round(p95, 1),
        "EM": round(avg_em, 3),
        "SM": round(avg_sm, 3),
        "ExecAcc": round(avg_exec, 3),
        **{f"{s}_avg_ms": stage_means[s] for s in STAGES},
    }

    (RESULT_DIR / "summary.json").write_text(
        json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8"
    )

    # CSV
    with (RESULT_DIR / "results.csv").open("w", encoding="utf-8") as f:
        f.write("db_id,query,ok,em,sm,exec_acc,latency_ms\n")
        for r in eval_rows:
            f.write(
                f"{r['db_id']},{json.dumps(r['query'])},{'✅' if r['ok'] else '❌'},"
                f"{r['em']},{r['sm']},{r['exec_acc']},{r['latency_ms']}\n"
            )

    print("\n================== Evaluation Summary ==================")
    print(f"Total samples: {total}")
    print(f"Successful runs: {success} ({summary['success_rate'] * 100:.1f}%)")
    print(f"Avg EM: {summary['EM']}")
    print(f"Avg SM: {summary['SM']}")
    print(f"Avg ExecAcc: {summary['ExecAcc']}")
    print(
        f"Avg Latency: {summary['avg_latency_ms']} ms | p50={summary['p50_latency_ms']} ms | p95={summary['p95_latency_ms']} ms"
    )
    print(f"Results saved to {RESULT_DIR}")
    print("========================================================")
benchmarks/plot_results.py
CHANGED

@@ -1,101 +1,141 @@

"""
Plot latest Spider benchmark results.

Outputs in the latest folder under benchmarks/results_pro/:
- metrics_overview.png: EM/SM/ExecAcc + latency (avg, p50, p95)
- latency_per_stage.png: bar of average per-stage latency
- latency_histogram.png: latency distribution across samples

If summary.json lacks per-stage averages, they are derived from eval.jsonl traces.
"""

from __future__ import annotations

import json
from pathlib import Path

import matplotlib.pyplot as plt

ROOT = Path("benchmarks/results_pro")


def _latest_run_dir() -> Path:
    summaries = sorted(
        ROOT.glob("*/summary.json"), key=lambda p: p.stat().st_mtime, reverse=True
    )
    if not summaries:
        raise SystemExit("❌ No benchmark results found under benchmarks/results_pro/")
    return summaries[0].parent


def _load_summary(run: Path) -> dict:
    return json.loads((run / "summary.json").read_text(encoding="utf-8"))


def _load_eval_rows(run: Path) -> list[dict]:
    lines = (run / "eval.jsonl").read_text(encoding="utf-8").splitlines()
    return [json.loads(x) for x in lines]


def plot_metrics_overview(run: Path, summary: dict) -> None:
    # EM/SM/ExecAcc on [0,1]; latency in ms (shown as seconds for scale)
    labels = ["EM", "SM", "ExecAcc", "avg(s)", "p50(s)", "p95(s)"]
    values = [
        summary.get("EM", 0.0),
        summary.get("SM", 0.0),
        summary.get("ExecAcc", 0.0),
        summary.get("avg_latency_ms", 0.0) / 1000.0,
        summary.get("p50_latency_ms", 0.0) / 1000.0,
        summary.get("p95_latency_ms", 0.0) / 1000.0,
    ]

    plt.figure(figsize=(9, 5))
    bars = plt.bar(labels, values)
    for b, v in zip(bars, values):
        plt.text(b.get_x() + b.get_width() / 2, v, f"{v:.2f}", ha="center", va="bottom")
    plt.title("Metrics Overview (Spider)")
    plt.ylim(0, max(1.0, max(values) * 1.15 if values else 1.0))
    plt.tight_layout()
    plt.savefig(run / "metrics_overview.png")
    plt.close()
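
# Note: accuracy (0-1) and latency bars share one axis above only because latency
# is rescaled to seconds; for much slower runs, a secondary axis (plt.twinx())
# would keep the small accuracy bars readable.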


def plot_latency_hist(run: Path, rows: list[dict]) -> None:
    latencies = [
        r.get("latency_ms", 0)
        for r in rows
        if isinstance(r.get("latency_ms"), (int, float))
    ]
    if not latencies:
        return
    plt.figure(figsize=(9, 4))
    plt.hist(latencies, bins=min(20, max(5, int(len(latencies) ** 0.5))))
    plt.title("Latency Distribution (ms)")
    plt.xlabel("Latency (ms)")
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(run / "latency_histogram.png")
    plt.close()


def plot_latency_per_stage(run: Path, summary: dict, rows: list[dict]) -> None:
    stages = [
        "detector",
        "planner",
        "generator",
        "safety",
        "executor",
        "verifier",
        "repair",
    ]
    # prefer summary keys if available; else derive from traces
    raw_values = [summary.get(f"{s}_avg_ms") for s in stages]
    # convert Nones to 0.0
    values: list[float] = [float(v or 0.0) for v in raw_values]

    if not any(values):
        totals = {s: 0.0 for s in stages}
        counts = {s: 0 for s in stages}
        for r in rows:
            trace = r.get("trace") or r.get("traces") or []
            for t in trace:
                s = t.get("stage")
                if s in totals:
                    ms = t.get("ms", t.get("duration_ms", 0.0))
                    try:
                        totals[s] += float(ms)
                        counts[s] += 1
                    except Exception:
                        pass
        values = [round(totals[s] / counts[s], 2) if counts[s] else 0.0 for s in stages]

    plt.figure(figsize=(10, 5))
    bars = plt.bar(stages, values)
    for b, v in zip(bars, values):
        plt.text(
            b.get_x() + b.get_width() / 2,
            float(v),
            f"{v:.1f}",
            ha="center",
            va="bottom",
        )
    plt.title("Average Latency per Stage (ms)")
    plt.xlabel("Stage")
    plt.ylabel("Latency (ms)")
    plt.tight_layout()
    plt.savefig(run / "latency_per_stage.png")
    plt.close()


def main() -> None:
    run = _latest_run_dir()
    print(f"📂 Using latest run: {run.name}")
    summary = _load_summary(run)
    rows = _load_eval_rows(run)
    plot_metrics_overview(run, summary)
    plot_latency_hist(run, rows)
    plot_latency_per_stage(run, summary, rows)
    print(
        "✅ Saved: metrics_overview.png, latency_histogram.png, latency_per_stage.png"
    )


if __name__ == "__main__":
    main()
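
# Typical invocation (illustrative): after an evaluation run,
#   python benchmarks/plot_results.py
# picks the newest benchmarks/results_pro/<timestamp>/ and writes the three PNGs there.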
benchmarks/results_pro/20251109-095552/eval.jsonl
ADDED
@@ -0,0 +1,5 @@
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11661, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8989, "summary": "ok", "notes": {"len_plan": 1451}, "token_in": 270, "token_out": 347, "cost_usd": 0.0002487}, {"stage": "generator", "duration_ms": 977, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 834, "token_out": 19, "cost_usd": 0.00013649999999999998}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 745, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 937, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 9786, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6574, "summary": "ok", "notes": {"len_plan": 1479}, "token_in": 271, "token_out": 343, "cost_usd": 0.00024645}, {"stage": "generator", "duration_ms": 955, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 831, "token_out": 19, "cost_usd": 0.00013605}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 986, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1262, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 8674, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5293, "summary": "ok", "notes": {"len_plan": 1333}, "token_in": 281, "token_out": 305, "cost_usd": 0.00022514999999999997}, {"stage": "generator", "duration_ms": 1510, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 803, "token_out": 37, "cost_usd": 0.00014265}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 857, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 325, "token_out": 21, "cost_usd": 6.135e-05}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1004, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 328, "token_out": 21, "cost_usd": 6.18e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 11247, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7296, "summary": "ok", "notes": {"len_plan": 1578}, "token_in": 279, "token_out": 425, "cost_usd": 0.00029685}, {"stage": "generator", "duration_ms": 1552, "summary": "ok", "notes": {"rationale_len": 67}, "token_in": 921, "token_out": 42, "cost_usd": 0.00016334999999999999}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1222, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 333, "token_out": 24, "cost_usd": 6.435e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1163, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 337, "token_out": 28, "cost_usd": 6.735e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
benchmarks/results_pro/20251109-095552/summary.json
ADDED
@@ -0,0 +1,12 @@
{
  "timestamp": "2025-11-09T09:56:33",
  "total": 5,
  "success": 5,
  "success_rate": 1.0,
  "avg_latency_ms": 8273.6,
  "EM": 0.4,
  "SM": 0.8,
  "ExecAcc": 0.8,
  "split": "dev",
  "config": "configs/sqlite_pipeline.yaml"
}
benchmarks/results_pro/20251109-100021/eval.jsonl
ADDED
@@ -0,0 +1,5 @@
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 9656, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7138, "summary": "ok", "notes": {"len_plan": 1287}, "token_in": 265, "token_out": 303, "cost_usd": 0.00022154999999999996}, {"stage": "generator", "duration_ms": 875, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 785, "token_out": 19, "cost_usd": 0.00012915}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 803, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 829, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11252, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8353, "summary": "ok", "notes": {"len_plan": 1399}, "token_in": 266, "token_out": 330, "cost_usd": 0.00023789999999999998}, {"stage": "generator", "duration_ms": 1048, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 813, "token_out": 19, "cost_usd": 0.00013335}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 794, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1052, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 8517, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5263, "summary": "ok", "notes": {"len_plan": 1304}, "token_in": 276, "token_out": 300, "cost_usd": 0.0002214}, {"stage": "generator", "duration_ms": 1022, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 793, "token_out": 37, "cost_usd": 0.00014115}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 977, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 320, "token_out": 21, "cost_usd": 6.0599999999999996e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1249, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 323, "token_out": 21, "cost_usd": 6.104999999999999e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 15468, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 11390, "summary": "ok", "notes": {"len_plan": 1400}, "token_in": 274, "token_out": 348, "cost_usd": 0.0002499}, {"stage": "generator", "duration_ms": 1252, "summary": "ok", "notes": {"rationale_len": 95}, "token_in": 839, "token_out": 45, "cost_usd": 0.00015285}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1384, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1437, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 332, "token_out": 21, "cost_usd": 6.24e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
benchmarks/results_pro/20251109-100021/results.csv
ADDED
@@ -0,0 +1,6 @@
db_id,query,ok,em,sm,exec_acc,latency_ms
concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,9656
concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,11252
concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,8517
concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,15468
benchmarks/results_pro/20251109-100021/summary.json
ADDED
@@ -0,0 +1,21 @@
{
  "timestamp": "2025-11-09T10:01:06",
  "split": "dev",
  "config": "configs/sqlite_pipeline.yaml",
  "total": 5,
  "success": 5,
  "success_rate": 1.0,
  "avg_latency_ms": 8978.6,
  "p50_latency_ms": 9656.0,
  "p95_latency_ms": 14624.8,
  "EM": 0.4,
  "SM": 0.8,
  "ExecAcc": 0.8,
  "detector_avg_ms": 0.0,
  "planner_avg_ms": 8036.0,
  "generator_avg_ms": 1049.25,
  "safety_avg_ms": 0.33,
  "executor_avg_ms": 0.75,
  "verifier_avg_ms": 0.0,
  "repair_avg_ms": 1065.62
}