Melika Kheirieh committed on
Commit
db1d448
·
1 Parent(s): 7ece28d

feat(bench): auto-detect latest run and plot per-stage latency + metrics summary

Browse files
benchmarks/plot_results.py CHANGED
@@ -1,145 +1,101 @@
1
  """
2
- Plot and summarize results from benchmarks/results_pro/<run>/summary.json
3
 
4
- - Auto-detects the latest run directory (unless --run-dir is provided).
5
- - Prints a compact textual report (EM/SM/ExecAcc, latency, success rate).
6
- - Saves two charts next to summary.json:
7
- - latency_per_stage.png
8
- - metrics_overview.png (EM/SM/ExecAcc as a bar chart)
9
 
10
- Usage:
11
- PYTHONPATH=$PWD python benchmarks/plot_results.py
12
- PYTHONPATH=$PWD python benchmarks/plot_results.py --run-dir benchmarks/results_pro/20251108-105442
13
  """
14
 
15
- from __future__ import annotations
16
-
17
- import argparse
18
  import json
 
19
  from pathlib import Path
20
- from typing import Dict, Any, List
21
-
22
  import matplotlib.pyplot as plt
23
 
24
-
25
# Pipeline stages in execution order; summary.json reports each one as a
# "<stage>_avg_ms" key, and this list fixes the x-axis order of the charts.
STAGES: List[str] = [
    "detector",
    "planner",
    "generator",
    "safety",
    "executor",
    "verifier",
]
33
-
34
-
35
- def _find_latest_run(results_root: Path) -> Path:
36
- runs = sorted([p for p in results_root.iterdir() if p.is_dir()])
37
- if not runs:
38
- raise FileNotFoundError(f"No runs found under {results_root}")
39
- return runs[-1]
40
-
41
-
42
- def _load_summary(run_dir: Path) -> Dict[str, Any]:
43
- summary_path = run_dir / "summary.json"
44
- if not summary_path.exists():
45
- # Back-compat (legacy name used by tests)
46
- summary_path = run_dir / "metrics_summary.json"
47
- if not summary_path.exists():
48
- raise FileNotFoundError(f"Missing summary JSON in {run_dir}")
49
- return json.loads(summary_path.read_text(encoding="utf-8"))
50
-
51
-
52
def _print_report(summary: Dict[str, Any]) -> None:
    """Print a compact textual report of one benchmark run to stdout.

    All metrics are read defensively with defaults so demo runs that lack
    EM/SM/ExecAcc still produce a report; p95 latency is printed only when
    present in the summary.
    """
    # Gracefully read metrics (demo runs may not have EM/SM/ExecAcc)
    em = summary.get("EM", 0.0)
    sm = summary.get("SM", 0.0)
    exec_acc = summary.get("ExecAcc", 0.0)
    success_rate = summary.get("success_rate", 0.0)
    avg_ms = summary.get("avg_latency_ms", 0.0)
    p95_ms = summary.get("p95_latency_ms", None)

    # "queries_total" is the current key; "total" is the legacy fallback.
    total = summary.get("queries_total", summary.get("total", 0))
    src = summary.get("pipeline_source", "adapter")
    ts = summary.get("timestamp", "-")

    # NOTE(review): label padding in these f-strings may have been collapsed
    # by the diff rendering — confirm column alignment against the repo file.
    print("\n================ Benchmark Summary ================")
    print(f"Timestamp : {ts}")
    print(f"Pipeline source : {src}")
    print(f"Queries total : {total}")
    print(f"Success rate : {success_rate:.0%}")
    print(f"EM / SM / ExecAcc: {em:.2f} / {sm:.2f} / {exec_acc:.2f}")
    print(f"Avg latency (ms) : {avg_ms:.1f}")
    if p95_ms is not None:
        print(f"p95 latency (ms) : {p95_ms:.1f}")
    print("===================================================\n")
75
-
76
-
77
def _plot_latency_per_stage(run_dir: Path, summary: Dict[str, Any]) -> Path:
    """Render the per-stage average-latency bar chart and return its path.

    Stages missing from *summary* are charted as 0.0 ms so the x-axis is
    always complete.
    """
    out_path = run_dir / "latency_per_stage.png"
    stage_ms = []
    for stage in STAGES:
        stage_ms.append(summary.get(f"{stage}_avg_ms", 0.0))

    # Single-plot bar chart; default colors keep styling consistent.
    plt.figure()
    plt.bar(STAGES, stage_ms)
    plt.title("Average Latency per Stage (ms)")
    plt.xlabel("Stage")
    plt.ylabel("Latency (ms)")
    plt.tight_layout()
    plt.savefig(out_path, dpi=160)
    plt.close()
    return out_path
92
-
93
-
94
def _plot_metrics_overview(run_dir: Path, summary: Dict[str, Any]) -> Path:
    """Render the EM/SM/ExecAcc bar chart and return its path.

    Zeros (demo mode) still yield a chart, which is useful in the README.
    """
    out_path = run_dir / "metrics_overview.png"
    labels = ["EM", "SM", "ExecAcc"]
    values = [summary.get(name, 0.0) for name in labels]

    plt.figure()
    plt.bar(labels, values)
    plt.title("EM / SM / ExecAcc")
    plt.xlabel("Metric")
    plt.ylabel("Score")
    plt.ylim(0, 1)  # scores are normalized to [0, 1]
    plt.tight_layout()
    plt.savefig(out_path, dpi=160)
    plt.close()
    return out_path
115
-
116
-
117
def main() -> None:
    """CLI entry point: pick a run directory, print its report, save charts.

    Uses ``--run-dir`` when given, otherwise the latest run found under
    ``benchmarks/results_pro/``. Unknown CLI flags are tolerated so the
    script can be invoked from wrapper tooling.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--run-dir",
        type=str,
        default=None,
        help="Path to a specific run directory under benchmarks/results_pro/ "
        "(defaults to latest).",
    )
    args, _ = parser.parse_known_args()

    results_root = Path("benchmarks") / "results_pro"
    if args.run_dir:
        run_dir = Path(args.run_dir).resolve()
    else:
        run_dir = _find_latest_run(results_root)

    summary = _load_summary(run_dir)
    _print_report(summary)

    latency_png = _plot_latency_per_stage(run_dir, summary)
    metrics_png = _plot_metrics_overview(run_dir, summary)

    print("✅ Saved plots:")
    print(f"- {latency_png}")
    print(f"- {metrics_png}")


if __name__ == "__main__":
    main()
 
1
"""
Plot evaluation summaries for NL2SQL Copilot benchmark runs.

Automatically detects the latest results folder under benchmarks/results_pro/,
reads summary.json + eval.jsonl, and plots:
1. Average latency per pipeline stage (ms)
2. EM / SM / ExecAcc overview

If summary.json lacks per-stage averages, they are derived from eval.jsonl traces.
"""

import json
from pathlib import Path

import matplotlib.pyplot as plt

# -------------------------------------------------------------------
# Locate latest results directory (newest summary.json by mtime)
# -------------------------------------------------------------------
ROOT = Path("benchmarks/results_pro")
summary_paths = sorted(
    ROOT.glob("*/summary.json"), key=lambda p: p.stat().st_mtime, reverse=True
)
if not summary_paths:
    raise SystemExit("❌ No benchmark results found under benchmarks/results_pro/")
summary_path = summary_paths[0]
run_dir = summary_path.parent
print(f"📂 Using latest run: {run_dir.name}")

# -------------------------------------------------------------------
# Load summary
# -------------------------------------------------------------------
with summary_path.open(encoding="utf-8") as f:
    summary = json.load(f)

# -------------------------------------------------------------------
# Derive per-stage averages from eval.jsonl for any stage the summary
# lacks. (Fix: previously one missing "<stage>_avg_ms" key caused ALL
# summary-provided averages to be discarded and recomputed; now only
# the missing stages are filled from traces.)
# -------------------------------------------------------------------
STAGES = ["detector", "planner", "generator", "safety", "executor", "verifier"]
stage_means = {s: summary.get(f"{s}_avg_ms") for s in STAGES}

if any(v is None for v in stage_means.values()):
    eval_path = run_dir / "eval.jsonl"
    totals = {s: 0.0 for s in STAGES}
    counts = {s: 0 for s in STAGES}
    if eval_path.exists():
        with eval_path.open(encoding="utf-8") as f:
            for line in f:
                rec = json.loads(line)
                for t in rec.get("trace", []) or []:
                    s = t.get("stage")
                    # Traces may use either "ms" or the legacy "duration_ms".
                    ms = t.get("ms", t.get("duration_ms", 0.0))
                    if s in totals:
                        totals[s] += float(ms)
                        counts[s] += 1
    for s in STAGES:
        if stage_means[s] is None:
            # A stage never seen in any trace averages to 0.0.
            stage_means[s] = round(totals[s] / counts[s], 2) if counts[s] else 0.0

latencies = [stage_means[s] for s in STAGES]

# -------------------------------------------------------------------
# Plot average latency per stage
# -------------------------------------------------------------------
plt.figure(figsize=(7, 5))
plt.bar(STAGES, latencies, color="#6fa8dc")
plt.title("Average Latency per Stage (ms)")
plt.xlabel("Stage")
plt.ylabel("Latency (ms)")
plt.tight_layout()
plt.savefig(run_dir / "latency_per_stage.png")
plt.close()  # release the figure; matplotlib warns once ~20 stay open
print(f"📊 Saved latency chart → {run_dir / 'latency_per_stage.png'}")

# -------------------------------------------------------------------
# Plot EM / SM / ExecAcc metrics
# -------------------------------------------------------------------
metrics = ["EM", "SM", "ExecAcc"]
scores = [summary.get(k, 0.0) for k in metrics]

plt.figure(figsize=(7, 5))
plt.bar(metrics, scores, color="#93c47d")
plt.title("EM / SM / ExecAcc")
plt.xlabel("Metric")
plt.ylabel("Score")
plt.ylim(0, 1)  # scores are normalized to [0, 1]
plt.tight_layout()
plt.savefig(run_dir / "metrics_overview.png")
plt.close()
print(f"📊 Saved metrics chart → {run_dir / 'metrics_overview.png'}")

# -------------------------------------------------------------------
# Quick textual summary
# -------------------------------------------------------------------
print(
    f"\n✅ Summary for {run_dir.name}\n"
    f"Avg latency: {summary.get('avg_latency_ms', 'n/a')} ms\n"
    f"Success rate: {summary.get('success_rate', 0.0):.0%}\n"
    f"EM: {summary.get('EM', 0.0):.3f} | SM: {summary.get('SM', 0.0):.3f} | ExecAcc: {summary.get('ExecAcc', 0.0):.3f}\n"
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarks/results_pro/20251108-125829/eval.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"source": "demo", "db_id": "demo", "query": "list all customers", "ok": false, "latency_ms": 6652, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 2554}, {"stage": "generator", "ms": 1370}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 1}, {"stage": "repair", "ms": 1295}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "repair", "ms": 1426}, {"stage": "safety", "ms": 0}, {"stage": "executor", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
2
+ {"source": "demo", "db_id": "demo", "query": "show total invoices per country", "ok": true, "latency_ms": 7375, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 3866}, {"stage": "generator", "ms": 1265}, {"stage": "safety", "ms": 4}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 0}, {"stage": "repair", "ms": 1126}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 0}, {"stage": "repair", "ms": 1106}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
3
+ {"source": "demo", "db_id": "demo", "query": "top 3 albums by total sales", "ok": true, "latency_ms": 1, "trace": [{"stage": "detector", "ms": 0}], "error": null}
4
+ {"source": "demo", "db_id": "demo", "query": "artists with more than 3 albums", "ok": false, "latency_ms": 8629, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 4110}, {"stage": "generator", "ms": 1969}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 0}, {"stage": "repair", "ms": 1296}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "repair", "ms": 1244}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
5
+ {"source": "demo", "db_id": "demo", "query": "number of employees per city", "ok": true, "latency_ms": 5630, "trace": [{"stage": "detector", "ms": 0}, {"stage": "planner", "ms": 2602}, {"stage": "generator", "ms": 1097}, {"stage": "safety", "ms": 1}, {"stage": "executor", "ms": 0}, {"stage": "verifier", "ms": 0}, {"stage": "repair", "ms": 1018}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 0}, {"stage": "repair", "ms": 906}, {"stage": "safety", "ms": 2}, {"stage": "executor", "ms": 1}, {"stage": "verifier", "ms": 0}, {"stage": "pipeline", "ms": 0}, {"stage": "pipeline", "ms": 0}], "error": null}
benchmarks/results_pro/20251108-125829/latency_per_stage.png ADDED
benchmarks/results_pro/20251108-125829/metrics_overview.png ADDED
benchmarks/results_pro/20251108-125829/results.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ source,db_id,query,em,sm,exec_acc,ok,latency_ms
2
+ demo,demo,list all customers,,,,❌,6652
3
+ demo,demo,show total invoices per country,,,,✅,7375
4
+ demo,demo,top 3 albums by total sales,,,,✅,1
5
+ demo,demo,artists with more than 3 albums,,,,❌,8629
6
+ demo,demo,number of employees per city,,,,✅,5630
benchmarks/results_pro/20251108-125829/summary.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mode": "single-db",
3
+ "db_path": "/Users/melikakheirieh/Desktop/my/career-developement/LLM/nl2sql-copilot/demo.db",
4
+ "config": "/Users/melikakheirieh/Desktop/my/career-developement/LLM/nl2sql-copilot/configs/sqlite_pipeline.yaml",
5
+ "provider_hint": "REAL",
6
+ "total": 5,
7
+ "EM": 0.0,
8
+ "SM": 0.0,
9
+ "ExecAcc": 0.0,
10
+ "success_rate": 0.6,
11
+ "avg_latency_ms": 5657.4,
12
+ "timestamp": "2025-11-08 12:58:58"
13
+ }