Melika Kheirieh committed on
Commit
296a94d
·
1 Parent(s): b794494

feat(bench): gold-aware EM/SM/ExecAcc + p50/p95; write per-stage means; richer plots

benchmarks/evaluate_spider_pro.py CHANGED
@@ -1,7 +1,9 @@
1
- #!/usr/bin/env python3
2
  """
3
- Enhanced Spider benchmark evaluator for NL2SQL pipeline.
4
- No external dependencies - uses internal evaluation logic.
 
 
 
5
  """
6
 
7
  from __future__ import annotations
@@ -20,423 +22,418 @@ from nl2sql.pipeline_factory import pipeline_from_config_with_adapter
20
  from adapters.db.sqlite_adapter import SQLiteAdapter
21
  from benchmarks.spider_loader import load_spider_sqlite
22
 
23
- # ==================== Configuration ====================
24
 
25
  RESULT_ROOT = Path("benchmarks/results_pro")
26
  TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
27
  RESULT_DIR = RESULT_ROOT / TIMESTAMP
28
 
29
-
30
- # ==================== SQL Processing ====================
31
 
32
 
33
  def extract_clean_sql(text: str | None) -> str:
34
- """Safely extract a clean SQL string from input text possibly containing markdown fences or JSON."""
35
- # Always initialize variable to empty string
36
- sql = text or ""
37
 
38
- # Remove markdown code fences
39
- sql = re.sub(r"```(?:sql)?\s*\n?", "", sql, flags=re.IGNORECASE)
40
- sql = re.sub(r"```\s*$", "", sql)
41
 
42
- # Try JSON pattern like {"sql": "..."}
43
- m_json = re.search(r'"sql"\s*:\s*"([^"]+)"', sql)
44
- if m_json:
45
- sql = m_json.group(1)
46
 
47
- # Clean escaped characters
48
  sql = sql.replace('\\"', '"').replace("\\n", " ").replace("\\t", " ")
49
 
50
- # Try to locate SQL statement keywords
51
- m_sql = re.search(
52
- r"\b(select|with|insert|update|delete)\b[\s\S]+", sql, re.IGNORECASE
53
- )
54
- if m_sql:
55
- sql = m_sql.group(0)
56
  sql = re.sub(r"\s+", " ", sql).strip().rstrip(";")
57
  return sql
58
 
59
 
60
  def normalize_sql(sql: str) -> str:
61
- """Enhanced SQL normalization for better matching."""
62
  if not sql:
63
  return ""
64
-
65
- sql = sql.strip().upper()
66
- # Remove all whitespace variations
67
- sql = re.sub(r"\s+", " ", sql)
68
- # Remove trailing semicolon
69
- sql = sql.rstrip(";")
70
-
71
- # Remove table prefixes (e.g., singer.name -> name)
72
- sql = re.sub(r"\b\w+\.(\w+)\b", r"\1", sql)
73
-
74
- # Remove AS aliases
75
- sql = re.sub(r"\s+AS\s+\w+", "", sql, flags=re.IGNORECASE)
76
-
77
- # Remove DISTINCT if used with COUNT(*)
78
- sql = re.sub(r"COUNT\s*\(\s*DISTINCT\s+", "COUNT(", sql)
79
-
80
- # Normalize COUNT variations
81
- sql = re.sub(r"COUNT\s*\(\s*\w+\s*\)", "COUNT(*)", sql)
82
-
83
- # Remove LIMIT at end
84
- sql = re.sub(r"\s+LIMIT\s+\d+$", "", sql)
85
-
86
- # Normalize quotes
87
- sql = re.sub(r'"(\w+)"', r"\1", sql)
88
- sql = re.sub(r"`(\w+)`", r"\1", sql)
89
-
90
- return sql
91
-
92
-
93
- # ==================== Schema Extraction ====================
94
 
95
 
96
  def get_database_schema(db_path: Path) -> Dict[str, Any]:
97
- """Extract complete schema from SQLite database."""
 
98
  if not db_path.exists():
99
- return {}
100
 
101
  conn = sqlite3.connect(str(db_path))
102
- cursor = conn.cursor()
103
-
104
- schema: dict[str, Any] = {"tables": {}}
105
-
106
  try:
107
- # Get all tables
108
- cursor.execute(
109
  "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
110
  )
111
- tables = cursor.fetchall()
112
-
113
- for (table_name,) in tables:
114
- # Get columns
115
- cursor.execute(f"PRAGMA table_info('{table_name}')")
116
- columns = cursor.fetchall()
117
-
118
- col_info = []
119
- for col in columns:
120
- col_name = col[1]
121
- col_type = col[2]
122
- is_pk = col[5]
123
-
124
- col_dict = {
125
- "name": col_name,
126
- "type": col_type,
127
- "primary_key": bool(is_pk),
128
- }
129
- col_info.append(col_dict)
130
-
131
- # Get foreign keys
132
- cursor.execute(f"PRAGMA foreign_key_list('{table_name}')")
133
- fks = cursor.fetchall()
134
-
135
- fk_info = []
136
- for fk in fks:
137
- fk_info.append(
138
- {
139
- "column": fk[3],
140
- "referenced_table": fk[2],
141
- "referenced_column": fk[4],
142
- }
143
- )
144
-
145
- schema["tables"][table_name] = {
146
- "columns": col_info,
147
- "foreign_keys": fk_info,
148
- }
149
-
150
  finally:
151
  conn.close()
152
-
153
  return schema
154
 
155
 
156
  def format_schema_for_prompt(schema: Dict[str, Any]) -> str:
157
- """Format schema for LLM prompt."""
158
- if not schema or not schema.get("tables"):
159
  return ""
160
-
161
- lines = []
162
- for table_name, table_info in schema["tables"].items():
163
- cols = []
164
- for col in table_info["columns"]:
165
- col_str = f"{col['name']} {col['type']}"
166
- if col.get("primary_key"):
167
- col_str += " PRIMARY KEY"
168
- cols.append(col_str)
169
-
170
- lines.append(f"Table: {table_name}")
171
  lines.append(f"Columns: {', '.join(cols)}")
172
-
173
- if table_info.get("foreign_keys"):
174
- fks = []
175
- for fk in table_info["foreign_keys"]:
176
- fks.append(
177
  f"{fk['column']} -> {fk['referenced_table']}.{fk['referenced_column']}"
 
178
  )
179
- lines.append(f"Foreign Keys: {', '.join(fks)}")
180
-
181
- lines.append("") # Empty line between tables
182
-
183
  return "\n".join(lines).strip()
184
 
185
 
186
- # ==================== SQL Evaluation ====================
187
 
188
 
189
- def execute_sql(db_path: Path, sql: str) -> Tuple[bool, List[Tuple]]:
190
- """Execute SQL and return success flag and results."""
191
  if not sql:
192
  return False, []
193
-
194
  try:
195
- conn = sqlite3.connect(str(db_path))
196
- cursor = conn.cursor()
197
- cursor.execute(sql)
198
- results = cursor.fetchall()
199
  conn.close()
200
- return True, results
201
  except Exception:
202
  return False, []
203
 
204
 
205
- def compare_sql_results(gold_results: List[Tuple], pred_results: List[Tuple]) -> bool:
206
- """Compare SQL execution results."""
207
- if len(gold_results) != len(pred_results):
208
- return False
209
-
210
- # Convert to sets for comparison (order independent)
211
- gold_set = set(gold_results)
212
- pred_set = set(pred_results)
213
-
214
- return gold_set == pred_set
215
-
216
 
217
- def evaluate_sql_match(pred_sql: str, gold_sql: str, db_path: Path) -> Dict[str, float]:
218
- """Evaluate predicted SQL against gold SQL."""
219
- metrics = {"exact_match": 0.0, "set_match": 0.0, "exec_accuracy": 0.0}
220
 
221
- if not pred_sql:
222
- return metrics
 
223
 
224
- # Exact match
225
- if normalize_sql(pred_sql) == normalize_sql(gold_sql):
226
- metrics["exact_match"] = 1.0
227
 
228
- # Execution-based evaluation
229
- gold_success, gold_results = execute_sql(db_path, gold_sql)
230
- pred_success, pred_results = execute_sql(db_path, pred_sql)
231
-
232
- if gold_success and pred_success:
233
- # Set match (results match)
234
- if compare_sql_results(gold_results, pred_results):
235
- metrics["set_match"] = 1.0
236
- metrics["exec_accuracy"] = 1.0
237
  else:
238
- # Partial credit for successful execution
239
- metrics["exec_accuracy"] = 0.5
240
-
241
- return metrics
242
 
243
 
244
- # ==================== Pipeline Runner ====================
245
 
246
 
247
  @dataclass
248
  class SpiderSample:
249
- """Spider dataset sample."""
250
-
251
  question: str
252
  db_id: str
253
  db_path: Path
254
  gold_sql: str
255
 
256
 
257
  def run_pipeline_on_sample(
258
  pipeline: Any,
259
  sample: SpiderSample,
260
  schema_cache: Dict[str, str],
261
  debug: bool = False,
262
  ) -> Dict[str, Any]:
263
- """Run NL2SQL pipeline on a single sample."""
264
-
265
- # Get/cache schema
266
  if sample.db_id not in schema_cache:
267
  schema_dict = get_database_schema(sample.db_path)
268
- schema_str = format_schema_for_prompt(schema_dict)
269
- schema_cache[sample.db_id] = schema_str
270
  if debug:
271
- print(f" [schema] Loaded {len(schema_str)} chars for {sample.db_id}")
 
 
272
 
273
- schema: str = schema_cache[sample.db_id]
274
 
275
- # Run pipeline
276
  try:
277
- result = pipeline.run(user_query=sample.question, schema_preview=schema)
278
-
279
- # Extract SQL from result
280
- if hasattr(result, "sql") and result.sql:
281
- pred_sql = extract_clean_sql(result.sql)
282
  else:
283
- # Try to extract from various fields
284
- for attr in ["final_sql", "generated_sql", "answer"]:
285
- if hasattr(result, attr):
286
- val = getattr(result, attr)
287
- if val:
288
- pred_sql = extract_clean_sql(str(val))
289
- if pred_sql:
290
- break
291
- else:
292
- pred_sql = ""
293
-
294
  return {
295
- "ok": bool(getattr(result, "ok", True)),
296
  "sql": pred_sql,
297
- "raw_response": getattr(result, "sql", ""),
298
- "traces": getattr(result, "traces", []),
299
  "error": None,
300
  }
301
-
302
  except Exception as e:
303
  if debug:
304
  import traceback
305
 
306
  traceback.print_exc()
307
- return {
308
- "ok": False,
309
- "sql": "",
310
- "raw_response": "",
311
- "traces": [],
312
- "error": str(e),
313
- }
314
-
315
 
316
- # ==================== Main Evaluation ====================
317
 
 
318
 
319
- def main():
320
- parser = argparse.ArgumentParser(description="Evaluate NL2SQL on Spider")
321
- parser.add_argument("--spider", action="store_true", help="Run Spider evaluation")
322
- parser.add_argument("--split", default="dev", choices=["dev", "train"])
323
- parser.add_argument("--limit", type=int, help="Limit number of samples")
324
- parser.add_argument("--debug", action="store_true", help="Enable debug output")
325
- parser.add_argument("--config", default="configs/sqlite_pipeline.yaml")
326
 
327
- args = parser.parse_args()
328
 
329
  if not args.spider:
330
- print("Please use --spider flag to run Spider evaluation")
331
  return
332
 
333
- # Load Spider samples
334
  print(f"Loading Spider {args.split} split...")
335
- samples = load_spider_sqlite(split=args.split, limit=args.limit)
336
-
337
- if not samples:
338
- print("❌ No samples loaded. Check SPIDER_ROOT environment variable.")
339
  return
 
340
 
341
- print(f"✔ Loaded {len(samples)} samples")
342
-
343
- # Prepare results directory
344
  RESULT_DIR.mkdir(parents=True, exist_ok=True)
 
 
345
 
346
- # Initialize schema cache
347
- schema_cache = {}
348
-
349
- # Process each sample
350
- results = []
351
- for i, spider_item in enumerate(samples, 1):
352
- # Convert to our sample format
353
  sample = SpiderSample(
354
- question=spider_item.question,
355
- db_id=spider_item.db_id,
356
- db_path=Path(spider_item.db_path),
357
- gold_sql=spider_item.gold_sql,
358
  )
 
359
 
360
- print(f"\n🧠 [{i}/{len(samples)}] [{sample.db_id}] {sample.question}")
361
-
362
- # Create adapter and pipeline for this database
363
- adapter = SQLiteAdapter(sample.db_path)
364
  pipeline = pipeline_from_config_with_adapter(args.config, adapter=adapter)
365
 
366
- # Run pipeline
367
  t0 = time.perf_counter()
368
- result = run_pipeline_on_sample(pipeline, sample, schema_cache, args.debug)
369
  latency_ms = int((time.perf_counter() - t0) * 1000)
370
 
371
- # Evaluate
372
- metrics = evaluate_sql_match(result["sql"], sample.gold_sql, sample.db_path)
373
-
374
- # Store result
375
- eval_result = {
376
  "source": "spider",
377
  "db_id": sample.db_id,
378
  "query": sample.question,
379
  "gold_sql": sample.gold_sql,
380
- "pred_sql": result["sql"],
381
- "ok": result["ok"],
382
  "latency_ms": latency_ms,
383
- "em": metrics["exact_match"],
384
- "sm": metrics["set_match"],
385
- "exec_acc": metrics["exec_accuracy"],
386
- "error": result.get("error"),
387
- "trace": result.get("traces", []),
388
  }
389
- results.append(eval_result)
390
 
391
- # Debug output
392
  if args.debug:
393
- status = "✅" if result["ok"] and metrics["exact_match"] == 1 else "⚠️"
394
  print(
395
- f"{status} ({latency_ms} ms) | EM={metrics['exact_match']:.0f} SM={metrics['set_match']:.0f} ExecAcc={metrics['exec_accuracy']:.1f}"
396
  )
397
- if metrics["exact_match"] < 1:
398
- print(f" gold: {sample.gold_sql[:100]}")
399
- print(f" pred: {result['sql'][:100] if result['sql'] else 'EMPTY'}")
400
-
401
- # Calculate aggregates
402
- total = len(results)
403
- successful = sum(1 for r in results if r["ok"])
404
- avg_em = sum(r["em"] for r in results) / total if total > 0 else 0
405
- avg_sm = sum(r["sm"] for r in results) / total if total > 0 else 0
406
- avg_ea = sum(r["exec_acc"] for r in results) / total if total > 0 else 0
407
- avg_latency = sum(r["latency_ms"] for r in results) / total if total > 0 else 0
408
-
409
- # Save results
410
- eval_jsonl = RESULT_DIR / "eval.jsonl"
411
- with open(eval_jsonl, "w") as f:
412
- for r in results:
413
  json.dump(r, f, ensure_ascii=False)
414
  f.write("\n")
415
 
416
  summary = {
417
  "timestamp": datetime.now().isoformat(timespec="seconds"),
 
 
418
  "total": total,
419
- "success": successful,
420
- "success_rate": round(successful / total, 3) if total else 0,
421
- "avg_latency_ms": round(avg_latency, 1),
 
 
422
  "EM": round(avg_em, 3),
423
  "SM": round(avg_sm, 3),
424
- "ExecAcc": round(avg_ea, 3),
425
- "split": args.split,
426
- "config": args.config,
427
  }
428
 
429
  (RESULT_DIR / "summary.json").write_text(
430
  json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8"
431
  )
432
 
 
433
  print("\n================== Evaluation Summary ==================")
434
  print(f"Total samples: {total}")
435
- print(f"Successful runs: {successful} ({summary['success_rate'] * 100:.1f}%)")
436
  print(f"Avg EM: {summary['EM']}")
437
  print(f"Avg SM: {summary['SM']}")
438
  print(f"Avg ExecAcc: {summary['ExecAcc']}")
439
- print(f"Avg Latency: {summary['avg_latency_ms']} ms")
 
 
440
  print(f"Results saved to {RESULT_DIR}")
441
  print("========================================================")
442
 
 
 
1
  """
2
+ Spider benchmark evaluator (pro):
3
+ - Computes EM / SM / ExecAcc vs. gold SQL
4
+ - Records per-sample latency and (if present) per-stage timings from pipeline traces
5
+ - Persists eval.jsonl (per-sample), summary.json (aggregates incl. p50/p95, per-stage means), results.csv
6
+ - No external deps; percentile and normalization are implemented locally.
7
  """
8
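# Example invocation (a sketch, assuming Spider data is reachable via SPIDER_ROOT and the command
# is run from the repository root; flags as defined in main() below):
#   python benchmarks/evaluate_spider_pro.py --spider --split dev --limit 20 --debug
# Outputs land in benchmarks/results_pro/<timestamp>/ as eval.jsonl, summary.json and results.csv.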
 
9
  from __future__ import annotations
 
22
  from adapters.db.sqlite_adapter import SQLiteAdapter
23
  from benchmarks.spider_loader import load_spider_sqlite
24
 
25
+ # -------------------------- Config --------------------------
26
 
27
  RESULT_ROOT = Path("benchmarks/results_pro")
28
  TIMESTAMP = time.strftime("%Y%m%d-%H%M%S")
29
  RESULT_DIR = RESULT_ROOT / TIMESTAMP
30
+ STAGES = [
31
+ "detector",
32
+ "planner",
33
+ "generator",
34
+ "safety",
35
+ "executor",
36
+ "verifier",
37
+ "repair",
38
+ ]
39
 
40
+ # -------------------------- SQL utils -----------------------
 
41
 
42
 
43
  def extract_clean_sql(text: str | None) -> str:
44
+ """Extract a clean SQL string from LLM-ish output (may include fences/JSON)."""
45
+ sql = (text or "").strip()
 
46
 
47
+ # strip ```sql fences
48
+ sql = re.sub(r"```(?:sql)?\s*", "", sql, flags=re.I)
49
+ sql = sql.replace("```", "")
50
 
51
+ # JSON-like {"sql": "..."}
52
+ m = re.search(r'"sql"\s*:\s*"([^"]+)"', sql)
53
+ if m:
54
+ sql = m.group(1)
55
 
56
+ # unescape
57
  sql = sql.replace('\\"', '"').replace("\\n", " ").replace("\\t", " ")
58
 
59
+ # find first SQL-ish keyword
60
+ m2 = re.search(r"\b(select|with|insert|update|delete)\b[\s\S]+", sql, re.I)
61
+ if m2:
62
+ sql = m2.group(0)
63
+
 
64
  sql = re.sub(r"\s+", " ", sql).strip().rstrip(";")
65
  return sql
66
 
67
 
68
  def normalize_sql(sql: str) -> str:
69
+ """Light normalization so exact-match comparison tolerates cosmetic differences (literals kept intact)."""
70
  if not sql:
71
  return ""
72
+ s = sql.strip()
73
+ # collapse whitespace; identifier/literal case is preserved (keywords are uppercased below)
74
+ s = re.sub(r"\s+", " ", s).strip()
75
+ s = s.rstrip(";")
76
+
77
+ # drop table prefixes a.b -> b
78
+ s = re.sub(r"\b\w+\.(\w+)\b", r"\1", s)
79
+
80
+ # collapse quotes around identifiers
81
+ s = re.sub(r"`([A-Za-z_]\w*)`", r"\1", s)
82
+ s = re.sub(r'"([A-Za-z_]\w*)"', r"\1", s)
83
+
84
+ # COUNT(foo) -> COUNT(*), DISTINCT inside COUNT -> COUNT(*)
85
+ s = re.sub(r"(?i)COUNT\s*\(\s*DISTINCT\s+[^)]+\)", "COUNT(*)", s)
86
+ s = re.sub(r"(?i)COUNT\s*\(\s*[A-Za-z_]\w*\s*\)", "COUNT(*)", s)
87
+
88
+ # strip trailing LIMIT n
89
+ s = re.sub(r"(?i)\s+LIMIT\s+\d+\s*$", "", s)
90
+
91
+ # canonical whitespace + upper keywords for stability
92
+ s = re.sub(r"\s+", " ", s).strip()
93
+ # keyword upper (a bit heuristic)
94
+ for kw in [
95
+ "select",
96
+ "from",
97
+ "where",
98
+ "group by",
99
+ "order by",
100
+ "having",
101
+ "limit",
102
+ "join",
103
+ "on",
104
+ "and",
105
+ "or",
106
+ "asc",
107
+ "desc",
108
+ ]:
109
+ s = re.sub(rf"(?i)\b{kw}\b", kw.upper(), s)
110
+ return s
111
+
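# Worked example, using values from the first row of the committed eval.jsonl below:
#   normalize_sql("select count(*) from singer limit 1")  -> "SELECT count(*) FROM singer"
#   normalize_sql("SELECT count(*) FROM singer")          -> "SELECT count(*) FROM singer"
# so that pair scores EM = 1.0, while identifier case is left untouched on purpose
# (e.g. "select Name, Country, Age ..." vs. the lowercase gold still scores EM = 0.0).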
112
+
113
+ # ---------------------- Schema extraction -------------------
114
 
115
 
116
  def get_database_schema(db_path: Path) -> Dict[str, Any]:
117
+ """Extract schema from SQLite database (tables, columns, FKs)."""
118
+ schema: Dict[str, Any] = {"tables": {}}
119
  if not db_path.exists():
120
+ return schema
121
 
122
  conn = sqlite3.connect(str(db_path))
123
+ cur = conn.cursor()
 
 
 
124
  try:
125
+ cur.execute(
 
126
  "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'"
127
  )
128
+ for (table,) in cur.fetchall():
129
+ cur.execute(f"PRAGMA table_info('{table}')")
130
+ cols = [
131
+ {"name": c[1], "type": c[2], "primary_key": bool(c[5])}
132
+ for c in cur.fetchall()
133
+ ]
134
+ cur.execute(f"PRAGMA foreign_key_list('{table}')")
135
+ fks = [
136
+ {"column": fk[3], "referenced_table": fk[2], "referenced_column": fk[4]}
137
+ for fk in cur.fetchall()
138
+ ]
139
+ schema["tables"][table] = {"columns": cols, "foreign_keys": fks}
140
  finally:
141
  conn.close()
 
142
  return schema
143
 
144
 
145
  def format_schema_for_prompt(schema: Dict[str, Any]) -> str:
146
+ """Plain-text schema for prompt (minimal but helpful)."""
147
+ if not schema.get("tables"):
148
  return ""
149
+ lines: List[str] = []
150
+ for t, info in schema["tables"].items():
151
+ cols = [
152
+ f"{c['name']} {c['type']}{' PK' if c.get('primary_key') else ''}"
153
+ for c in info.get("columns", [])
154
+ ]
155
+ lines.append(f"Table: {t}")
 
 
 
 
156
  lines.append(f"Columns: {', '.join(cols)}")
157
+ fks = info.get("foreign_keys") or []
158
+ if fks:
159
+ lines.append(
160
+ "FKs: "
161
+ + ", ".join(
162
  f"{fk['column']} -> {fk['referenced_table']}.{fk['referenced_column']}"
163
+ for fk in fks
164
  )
165
+ )
166
+ lines.append("")
 
 
167
  return "\n".join(lines).strip()
168
 
169
 
170
+ # ---------------------- Exec/eval metrics -------------------
171
 
172
 
173
+ def _exec_sql(db: Path, sql: str) -> Tuple[bool, List[Tuple]]:
 
174
  if not sql:
175
  return False, []
 
176
  try:
177
+ conn = sqlite3.connect(str(db))
178
+ cur = conn.cursor()
179
+ cur.execute(sql)
180
+ rows = cur.fetchall()
181
  conn.close()
182
+ return True, rows
183
  except Exception:
184
  return False, []
185
 
186
 
187
+ def _same_rows(a: List[Tuple], b: List[Tuple]) -> bool:
188
+ return set(a) == set(b) and len(a) == len(b)
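# Order-insensitive row comparison; a couple of hypothetical checks:
#   _same_rows([(1,), (2,)], [(2,), (1,)]) -> True
#   _same_rows([(1,), (1,)], [(1,), (2,)]) -> False  (sets differ)
# Caveat: equal-length results that differ only in duplicate multiplicities still compare equal.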
189
 
 
 
 
190
 
191
+ def evaluate_sql(pred: str, gold: str, db: Path) -> Dict[str, float]:
192
+ """Return {'em', 'sm', 'exec'}: em/sm are 0 or 1; exec is 1.0 on matching rows, 0.5 if both run but rows differ, else 0.0."""
193
+ em = 1.0 if normalize_sql(pred) == normalize_sql(gold) else 0.0
194
 
195
+ gold_ok, gold_rows = _exec_sql(db, gold)
196
+ pred_ok, pred_rows = _exec_sql(db, pred)
 
197
 
198
+ sm = 0.0
199
+ exec_acc = 0.0
200
+ if gold_ok and pred_ok:
201
+ if _same_rows(gold_rows, pred_rows):
202
+ sm = 1.0
203
+ exec_acc = 1.0
 
 
 
204
  else:
205
+ exec_acc = 0.5 # partial credit for executing but mismatched rows
206
+ return {"em": em, "sm": sm, "exec": exec_acc}
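# Illustration with the committed samples below: a prediction that runs and returns the gold
# rows but differs textually (extra LIMIT, different identifier case) gets em=0.0, sm=1.0,
# exec=1.0; one that runs but returns different rows gets exec=0.5; an empty or failing
# prediction gets all zeros.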
 
 
207
 
208
 
209
+ # ---------------------- Dataclass + runner ------------------
210
 
211
 
212
  @dataclass
213
  class SpiderSample:
 
 
214
  question: str
215
  db_id: str
216
  db_path: Path
217
  gold_sql: str
218
 
219
 
220
+ def _percentile(values: List[float], p: float) -> float:
221
+ """Compute p-th percentile (0..100) without numpy."""
222
+ if not values:
223
+ return 0.0
224
+ vals = sorted(values)
225
+ k = (len(vals) - 1) * (p / 100.0)
226
+ f = int(k)
227
+ c = min(f + 1, len(vals) - 1)
228
+ if f == c:
229
+ return float(vals[int(k)])
230
+ return float(vals[f] * (c - k) + vals[c] * (k - f))
231
+
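# Quick sanity check of the interpolation (hypothetical values):
#   _percentile([1, 2, 3, 4], 50.0) == 2.5
#   _percentile([1, 2, 3, 4], 95.0) == 3.85   # 3*(3-2.85) + 4*(2.85-2)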
232
+
233
+ def _stage_ms_from_trace(trace_item: Dict[str, Any]) -> float:
234
+ """Accepts {'stage':..., 'ms':...} OR {'stage':..., 'duration_ms':...}."""
235
+ if not trace_item:
236
+ return 0.0
237
+ if "ms" in trace_item:
238
+ try:
239
+ return float(trace_item["ms"])
240
+ except Exception:
241
+ return 0.0
242
+ if "duration_ms" in trace_item:
243
+ try:
244
+ return float(trace_item["duration_ms"])
245
+ except Exception:
246
+ return 0.0
247
+ return 0.0
248
+
249
+
250
+ def _collect_stage_means(eval_rows: List[Dict[str, Any]]) -> Dict[str, float]:
251
+ """Average per-stage ms across all records (0 if absent)."""
252
+ totals = {s: 0.0 for s in STAGES}
253
+ counts = {s: 0 for s in STAGES}
254
+ for r in eval_rows:
255
+ trace_list = r.get("trace") or r.get("traces") or []
256
+ for t in trace_list:
257
+ s = t.get("stage")
258
+ if s in totals:
259
+ ms = _stage_ms_from_trace(t)
260
+ totals[s] += ms
261
+ counts[s] += 1
262
+ return {s: round(totals[s] / counts[s], 2) if counts[s] else 0.0 for s in STAGES}
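# Trace items recorded by the pipeline (see the committed eval.jsonl below) look like
#   {"stage": "planner", "duration_ms": 8989, "summary": "ok", ...}
# so the means computed here are what main() writes into summary.json as <stage>_avg_ms.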
263
+
264
+
265
  def run_pipeline_on_sample(
266
  pipeline: Any,
267
  sample: SpiderSample,
268
  schema_cache: Dict[str, str],
269
  debug: bool = False,
270
  ) -> Dict[str, Any]:
271
+ """Run pipeline on one sample and extract normalized prediction + traces."""
272
+ # cache schema
 
273
  if sample.db_id not in schema_cache:
274
  schema_dict = get_database_schema(sample.db_path)
275
+ schema_cache[sample.db_id] = format_schema_for_prompt(schema_dict)
 
276
  if debug:
277
+ print(
278
+ f" [schema] Loaded {len(schema_cache[sample.db_id])} chars for {sample.db_id}"
279
+ )
280
 
281
+ schema = schema_cache[sample.db_id]
282
 
 
283
  try:
284
+ res = pipeline.run(user_query=sample.question, schema_preview=schema)
285
+ # extract SQL
286
+ pred_sql = ""
287
+ if hasattr(res, "sql") and res.sql:
288
+ pred_sql = extract_clean_sql(res.sql)
289
  else:
290
+ for attr in ("final_sql", "generated_sql", "answer"):
291
+ if getattr(res, attr, None):
292
+ pred_sql = extract_clean_sql(str(getattr(res, attr)))
293
+ if pred_sql:
294
+ break
 
 
 
 
 
 
295
  return {
296
+ "ok": bool(getattr(res, "ok", True)),
297
  "sql": pred_sql,
298
+ "trace": getattr(res, "traces", []) or getattr(res, "trace", []),
 
299
  "error": None,
300
  }
 
301
  except Exception as e:
302
  if debug:
303
  import traceback
304
 
305
  traceback.print_exc()
306
+ return {"ok": False, "sql": "", "trace": [], "error": str(e)}
307
 
 
308
 
309
+ # --------------------------- Main --------------------------
310
311
 
312
+ def main() -> None:
313
+ ap = argparse.ArgumentParser(description="Evaluate NL2SQL on Spider (pro)")
314
+ ap.add_argument("--spider", action="store_true", help="Use Spider dataset loader")
315
+ ap.add_argument("--split", default="dev", choices=["dev", "train"])
316
+ ap.add_argument("--limit", type=int, default=20)
317
+ ap.add_argument("--debug", action="store_true")
318
+ ap.add_argument("--config", default="configs/sqlite_pipeline.yaml")
319
+ args = ap.parse_args()
320
 
321
  if not args.spider:
322
+ print("Use --spider to run Spider evaluation.")
323
  return
324
 
325
+ # load items
326
  print(f"Loading Spider {args.split} split...")
327
+ items = load_spider_sqlite(split=args.split, limit=args.limit)
328
+ if not items:
329
+ print("❌ No samples loaded. Check SPIDER_ROOT.")
 
330
  return
331
+ print(f"✔ Loaded {len(items)} samples")
332
 
 
 
 
333
  RESULT_DIR.mkdir(parents=True, exist_ok=True)
334
+ schema_cache: Dict[str, str] = {}
335
+ eval_rows: List[Dict[str, Any]] = []
336
 
337
+ for i, it in enumerate(items, 1):
338
  sample = SpiderSample(
339
+ question=it.question,
340
+ db_id=it.db_id,
341
+ db_path=Path(it.db_path),
342
+ gold_sql=it.gold_sql,
343
  )
344
+ print(f"\n🧠 [{i}/{len(items)}] [{sample.db_id}] {sample.question}")
345
 
346
+ adapter = SQLiteAdapter(str(sample.db_path))
 
 
 
347
  pipeline = pipeline_from_config_with_adapter(args.config, adapter=adapter)
348
 
 
349
  t0 = time.perf_counter()
350
+ out = run_pipeline_on_sample(pipeline, sample, schema_cache, args.debug)
351
  latency_ms = int((time.perf_counter() - t0) * 1000)
352
 
353
+ metrics = evaluate_sql(out["sql"], sample.gold_sql, sample.db_path)
354
+ row = {
 
 
 
355
  "source": "spider",
356
  "db_id": sample.db_id,
357
  "query": sample.question,
358
  "gold_sql": sample.gold_sql,
359
+ "pred_sql": out["sql"],
360
+ "ok": out["ok"],
361
  "latency_ms": latency_ms,
362
+ "em": metrics["em"],
363
+ "sm": metrics["sm"],
364
+ "exec_acc": metrics["exec"],
365
+ "error": out.get("error"),
366
+ "trace": out.get("trace", []),
367
  }
368
+ eval_rows.append(row)
369
 
 
370
  if args.debug:
371
+ status = "✅" if row["ok"] and row["em"] == 1.0 else "⚠️"
372
  print(
373
+ f"{status} ({latency_ms} ms) | EM={row['em']} SM={row['sm']} ExecAcc={row['exec_acc']}"
374
  )
375
+ if row["em"] < 1.0:
376
+ print(f" gold: {sample.gold_sql}")
377
+ print(f" pred: {out['sql'] or 'EMPTY'}")
378
+
379
+ # persist eval.jsonl
380
+ RESULT_ROOT.mkdir(parents=True, exist_ok=True)
381
+ RESULT_DIR.mkdir(parents=True, exist_ok=True)
382
+ with (RESULT_DIR / "eval.jsonl").open("w", encoding="utf-8") as f:
383
+ for r in eval_rows:
384
  json.dump(r, f, ensure_ascii=False)
385
  f.write("\n")
386
 
387
+ # aggregates
388
+ total = len(eval_rows)
389
+ success = sum(1 for r in eval_rows if r["ok"])
390
+ avg_em = sum(r["em"] for r in eval_rows) / total if total else 0.0
391
+ avg_sm = sum(r["sm"] for r in eval_rows) / total if total else 0.0
392
+ avg_exec = sum(r["exec_acc"] for r in eval_rows) / total if total else 0.0
393
+ avg_lat = sum(r["latency_ms"] for r in eval_rows) / total if total else 0.0
394
+ p50 = _percentile([r["latency_ms"] for r in eval_rows], 50.0)
395
+ p95 = _percentile([r["latency_ms"] for r in eval_rows], 95.0)
396
+
397
+ stage_means = _collect_stage_means(eval_rows)
398
+
399
  summary = {
400
  "timestamp": datetime.now().isoformat(timespec="seconds"),
401
+ "split": args.split,
402
+ "config": args.config,
403
  "total": total,
404
+ "success": success,
405
+ "success_rate": round(success / total, 3) if total else 0.0,
406
+ "avg_latency_ms": round(avg_lat, 1),
407
+ "p50_latency_ms": round(p50, 1),
408
+ "p95_latency_ms": round(p95, 1),
409
  "EM": round(avg_em, 3),
410
  "SM": round(avg_sm, 3),
411
+ "ExecAcc": round(avg_exec, 3),
412
+ **{f"{s}_avg_ms": stage_means[s] for s in STAGES},
 
413
  }
414
 
415
  (RESULT_DIR / "summary.json").write_text(
416
  json.dumps(summary, indent=2, ensure_ascii=False), encoding="utf-8"
417
  )
418
 
419
+ # CSV
420
+ with (RESULT_DIR / "results.csv").open("w", encoding="utf-8") as f:
421
+ f.write("db_id,query,ok,em,sm,exec_acc,latency_ms\n")
422
+ for r in eval_rows:
423
+ f.write(
424
+ f"{r['db_id']},{json.dumps(r['query'])},{'✅' if r['ok'] else '❌'},"
425
+ f"{r['em']},{r['sm']},{r['exec_acc']},{r['latency_ms']}\n"
426
+ )
427
+
428
  print("\n================== Evaluation Summary ==================")
429
  print(f"Total samples: {total}")
430
+ print(f"Successful runs: {success} ({summary['success_rate'] * 100:.1f}%)")
431
  print(f"Avg EM: {summary['EM']}")
432
  print(f"Avg SM: {summary['SM']}")
433
  print(f"Avg ExecAcc: {summary['ExecAcc']}")
434
+ print(
435
+ f"Avg Latency: {summary['avg_latency_ms']} ms | p50={summary['p50_latency_ms']} ms | p95={summary['p95_latency_ms']} ms"
436
+ )
437
  print(f"Results saved to {RESULT_DIR}")
438
  print("========================================================")
439
 
benchmarks/plot_results.py CHANGED
@@ -1,101 +1,141 @@
1
  """
2
- Plot evaluation summaries for NL2SQL Copilot benchmark runs.
3
 
4
- Automatically detects the latest results folder under benchmarks/results_pro/,
5
- reads summary.json + eval.jsonl, and plots:
6
- 1. Average latency per pipeline stage (ms)
7
- 2. EM / SM / ExecAcc overview
8
-
9
- If summary.json lacks per-stage averages, they are derived from eval.jsonl traces.
10
  """
11
 
 
 
12
  import json
13
- import time
14
  from pathlib import Path
15
  import matplotlib.pyplot as plt
16
 
17
- # -------------------------------------------------------------------
18
- # Locate latest results directory
19
- # -------------------------------------------------------------------
20
-
21
  ROOT = Path("benchmarks/results_pro")
22
- run_dirs = sorted(
23
- ROOT.glob("*/summary.json"), key=lambda p: p.stat().st_mtime, reverse=True
24
- )
25
- if not run_dirs:
26
- raise SystemExit("❌ No benchmark results found under benchmarks/results_pro/")
27
- summary_path = run_dirs[0]
28
- run_dir = summary_path.parent
29
- print(f"📂 Using latest run: {run_dir.name}")
30
-
31
- # -------------------------------------------------------------------
32
- # Load summary
33
- # -------------------------------------------------------------------
34
- with summary_path.open(encoding="utf-8") as f:
35
- summary = json.load(f)
36
-
37
- # -------------------------------------------------------------------
38
- # Derive per-stage averages if not present
39
- # -------------------------------------------------------------------
40
- STAGES = ["detector", "planner", "generator", "safety", "executor", "verifier"]
41
- stage_means = {s: summary.get(f"{s}_avg_ms") for s in STAGES}
42
- need_fallback = any(v is None for v in stage_means.values())
43
-
44
- if need_fallback:
45
- eval_path = run_dir / "eval.jsonl"
46
- totals = {s: 0.0 for s in STAGES}
47
- counts = {s: 0 for s in STAGES}
48
- if eval_path.exists():
49
- with eval_path.open(encoding="utf-8") as f:
50
- for line in f:
51
- rec = json.loads(line)
52
- for t in rec.get("trace", []) or []:
53
- s = t.get("stage")
54
  ms = t.get("ms", t.get("duration_ms", 0.0))
55
- if s in totals:
56
  totals[s] += float(ms)
57
  counts[s] += 1
58
- stage_means = {
59
- s: round(totals[s] / max(counts[s], 1), 2) if counts[s] else 0.0 for s in STAGES
60
- }
61
-
62
- latencies = [stage_means[s] for s in STAGES]
63
-
64
- # -------------------------------------------------------------------
65
- # Plot average latency per stage
66
- # -------------------------------------------------------------------
67
- plt.figure(figsize=(7, 5))
68
- plt.bar(STAGES, latencies, color="#6fa8dc")
69
- plt.title("Average Latency per Stage (ms)")
70
- plt.xlabel("Stage")
71
- plt.ylabel("Latency (ms)")
72
- plt.tight_layout()
73
- plt.savefig(run_dir / "latency_per_stage.png")
74
- print(f"📊 Saved latency chart → {run_dir / 'latency_per_stage.png'}")
75
-
76
- # -------------------------------------------------------------------
77
- # Plot EM / SM / ExecAcc metrics
78
- # -------------------------------------------------------------------
79
- metrics = ["EM", "SM", "ExecAcc"]
80
- scores = [summary.get(k, 0.0) for k in metrics]
81
-
82
- plt.figure(figsize=(7, 5))
83
- plt.bar(metrics, scores, color="#93c47d")
84
- plt.title("EM / SM / ExecAcc")
85
- plt.xlabel("Metric")
86
- plt.ylabel("Score")
87
- plt.ylim(0, 1)
88
- plt.tight_layout()
89
- plt.savefig(run_dir / "metrics_overview.png")
90
- print(f"📊 Saved metrics chart → {run_dir / 'metrics_overview.png'}")
91
-
92
- # -------------------------------------------------------------------
93
- # Quick textual summary
94
- # -------------------------------------------------------------------
95
- print(
96
- f"\n✅ Summary for {run_dir.name}\n"
97
- f"Avg latency: {summary.get('avg_latency_ms', 'n/a')} ms\n"
98
- f"Success rate: {summary.get('success_rate', 0.0):.0%}\n"
99
- f"EM: {summary.get('EM', 0.0):.3f} | SM: {summary.get('SM', 0.0):.3f} | ExecAcc: {summary.get('ExecAcc', 0.0):.3f}\n"
100
- )
101
- time.sleep(0.2)
 
1
  """
2
+ Plot latest Spider benchmark results.
3
 
4
+ Outputs in the latest folder under benchmarks/results_pro/:
5
+ - metrics_overview.png: EM/SM/ExecAcc + latency (avg, p50, p95)
6
+ - latency_per_stage.png: bar of average per-stage latency
7
+ - latency_histogram.png: latency distribution across samples
 
 
8
  """
9
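# Typical use, assuming the evaluator above has produced at least one run:
#   python benchmarks/plot_results.py
# The newest benchmarks/results_pro/<timestamp>/ folder is picked automatically via _latest_run_dir().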
 
10
+ from __future__ import annotations
11
+
12
  import json
 
13
  from pathlib import Path
14
  import matplotlib.pyplot as plt
15
 
 
 
 
 
16
  ROOT = Path("benchmarks/results_pro")
17
+
18
+
19
+ def _latest_run_dir() -> Path:
20
+ summaries = sorted(
21
+ ROOT.glob("*/summary.json"), key=lambda p: p.stat().st_mtime, reverse=True
22
+ )
23
+ if not summaries:
24
+ raise SystemExit("❌ No benchmark results found under benchmarks/results_pro/")
25
+ return summaries[0].parent
26
+
27
+
28
+ def _load_summary(run: Path) -> dict:
29
+ return json.loads((run / "summary.json").read_text(encoding="utf-8"))
30
+
31
+
32
+ def _load_eval_rows(run: Path) -> list[dict]:
33
+ lines = (run / "eval.jsonl").read_text(encoding="utf-8").splitlines()
34
+ return [json.loads(x) for x in lines]
35
+
36
+
37
+ def plot_metrics_overview(run: Path, summary: dict) -> None:
38
+ # EM/SM/ExecAcc on [0,1]; latency in ms (show as seconds for scale)
39
+ labels = ["EM", "SM", "ExecAcc", "avg(s)", "p50(s)", "p95(s)"]
40
+ values = [
41
+ summary.get("EM", 0.0),
42
+ summary.get("SM", 0.0),
43
+ summary.get("ExecAcc", 0.0),
44
+ summary.get("avg_latency_ms", 0.0) / 1000.0,
45
+ summary.get("p50_latency_ms", 0.0) / 1000.0,
46
+ summary.get("p95_latency_ms", 0.0) / 1000.0,
47
+ ]
48
+
49
+ plt.figure(figsize=(9, 5))
50
+ bars = plt.bar(labels, values)
51
+ for b, v in zip(bars, values):
52
+ plt.text(b.get_x() + b.get_width() / 2, v, f"{v:.2f}", ha="center", va="bottom")
53
+ plt.title("Metrics Overview (Spider)")
54
+ plt.ylim(0, max(1.0, max(values) * 1.15 if values else 1.0))
55
+ plt.tight_layout()
56
+ plt.savefig(run / "metrics_overview.png")
57
+ plt.close()
58
+
59
+
60
+ def plot_latency_hist(run: Path, rows: list[dict]) -> None:
61
+ latencies = [
62
+ r.get("latency_ms", 0)
63
+ for r in rows
64
+ if isinstance(r.get("latency_ms"), (int, float))
65
+ ]
66
+ if not latencies:
67
+ return
68
+ plt.figure(figsize=(9, 4))
69
+ plt.hist(latencies, bins=min(20, max(5, int(len(latencies) ** 0.5))))
70
+ plt.title("Latency Distribution (ms)")
71
+ plt.xlabel("Latency (ms)")
72
+ plt.ylabel("Count")
73
+ plt.tight_layout()
74
+ plt.savefig(run / "latency_histogram.png")
75
+ plt.close()
76
+
77
+
78
+ def plot_latency_per_stage(run: Path, summary: dict, rows: list[dict]) -> None:
79
+ stages = [
80
+ "detector",
81
+ "planner",
82
+ "generator",
83
+ "safety",
84
+ "executor",
85
+ "verifier",
86
+ "repair",
87
+ ]
88
+ # prefer summary keys if available; else derive from traces
89
+ raw_values = [summary.get(f"{s}_avg_ms") for s in stages]
90
+ # convert Nones to 0.0
91
+ values: list[float] = [float(v or 0.0) for v in raw_values]
92
+
93
+ if not any(values):
94
+ totals = {s: 0.0 for s in stages}
95
+ counts = {s: 0 for s in stages}
96
+ for r in rows:
97
+ trace = r.get("trace") or r.get("traces") or []
98
+ for t in trace:
99
+ s = t.get("stage")
100
+ if s in totals:
101
  ms = t.get("ms", t.get("duration_ms", 0.0))
102
+ try:
103
  totals[s] += float(ms)
104
  counts[s] += 1
105
+ except Exception:
106
+ pass
107
+ values = [round(totals[s] / counts[s], 2) if counts[s] else 0.0 for s in stages]
108
+
109
+ plt.figure(figsize=(10, 5))
110
+ bars = plt.bar(stages, values)
111
+ for b, v in zip(bars, values):
112
+ plt.text(
113
+ b.get_x() + b.get_width() / 2,
114
+ float(v),
115
+ f"{v:.1f}",
116
+ ha="center",
117
+ va="bottom",
118
+ )
119
+ plt.title("Average Latency per Stage (ms)")
120
+ plt.xlabel("Stage")
121
+ plt.ylabel("Latency (ms)")
122
+ plt.tight_layout()
123
+ plt.savefig(run / "latency_per_stage.png")
124
+ plt.close()
125
+
126
+
127
+ def main() -> None:
128
+ run = _latest_run_dir()
129
+ print(f"📂 Using latest run: {run.name}")
130
+ summary = _load_summary(run)
131
+ rows = _load_eval_rows(run)
132
+ plot_metrics_overview(run, summary)
133
+ plot_latency_hist(run, rows)
134
+ plot_latency_per_stage(run, summary, rows)
135
+ print(
136
+ "✅ Saved: metrics_overview.png, latency_histogram.png, latency_per_stage.png"
137
+ )
138
+
139
+
140
+ if __name__ == "__main__":
141
+ main()
benchmarks/results_pro/20251109-095552/eval.jsonl ADDED
@@ -0,0 +1,5 @@
1
+ {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11661, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8989, "summary": "ok", "notes": {"len_plan": 1451}, "token_in": 270, "token_out": 347, "cost_usd": 0.0002487}, {"stage": "generator", "duration_ms": 977, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 834, "token_out": 19, "cost_usd": 0.00013649999999999998}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 745, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 937, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
2
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 9786, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6574, "summary": "ok", "notes": {"len_plan": 1479}, "token_in": 271, "token_out": 343, "cost_usd": 0.00024645}, {"stage": "generator", "duration_ms": 955, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 831, "token_out": 19, "cost_usd": 0.00013605}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 986, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1262, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
3
+ {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
4
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 8674, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5293, "summary": "ok", "notes": {"len_plan": 1333}, "token_in": 281, "token_out": 305, "cost_usd": 0.00022514999999999997}, {"stage": "generator", "duration_ms": 1510, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 803, "token_out": 37, "cost_usd": 0.00014265}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 857, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 325, "token_out": 21, "cost_usd": 6.135e-05}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1004, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 328, "token_out": 21, "cost_usd": 6.18e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
5
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 11247, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7296, "summary": "ok", "notes": {"len_plan": 1578}, "token_in": 279, "token_out": 425, "cost_usd": 0.00029685}, {"stage": "generator", "duration_ms": 1552, "summary": "ok", "notes": {"rationale_len": 67}, "token_in": 921, "token_out": 42, "cost_usd": 0.00016334999999999999}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1222, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 333, "token_out": 24, "cost_usd": 6.435e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1163, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 337, "token_out": 28, "cost_usd": 6.735e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
benchmarks/results_pro/20251109-095552/summary.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "timestamp": "2025-11-09T09:56:33",
3
+ "total": 5,
4
+ "success": 5,
5
+ "success_rate": 1.0,
6
+ "avg_latency_ms": 8273.6,
7
+ "EM": 0.4,
8
+ "SM": 0.8,
9
+ "ExecAcc": 0.8,
10
+ "split": "dev",
11
+ "config": "configs/sqlite_pipeline.yaml"
12
+ }
benchmarks/results_pro/20251109-100021/eval.jsonl ADDED
@@ -0,0 +1,5 @@
1
+ {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 9656, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7138, "summary": "ok", "notes": {"len_plan": 1287}, "token_in": 265, "token_out": 303, "cost_usd": 0.00022154999999999996}, {"stage": "generator", "duration_ms": 875, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 785, "token_out": 19, "cost_usd": 0.00012915}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 803, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 829, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
2
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11252, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8353, "summary": "ok", "notes": {"len_plan": 1399}, "token_in": 266, "token_out": 330, "cost_usd": 0.00023789999999999998}, {"stage": "generator", "duration_ms": 1048, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 813, "token_out": 19, "cost_usd": 0.00013335}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 794, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1052, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
3
+ {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
4
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 8517, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5263, "summary": "ok", "notes": {"len_plan": 1304}, "token_in": 276, "token_out": 300, "cost_usd": 0.0002214}, {"stage": "generator", "duration_ms": 1022, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 793, "token_out": 37, "cost_usd": 0.00014115}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 977, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 320, "token_out": 21, "cost_usd": 6.0599999999999996e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1249, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 323, "token_out": 21, "cost_usd": 6.104999999999999e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
5
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 15468, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 11390, "summary": "ok", "notes": {"len_plan": 1400}, "token_in": 274, "token_out": 348, "cost_usd": 0.0002499}, {"stage": "generator", "duration_ms": 1252, "summary": "ok", "notes": {"rationale_len": 95}, "token_in": 839, "token_out": 45, "cost_usd": 0.00015285}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1384, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1437, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 332, "token_out": 21, "cost_usd": 6.24e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
benchmarks/results_pro/20251109-100021/results.csv ADDED
@@ -0,0 +1,6 @@
1
+ db_id,query,ok,em,sm,exec_acc,latency_ms
2
+ concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,9656
3
+ concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,11252
4
+ concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
5
+ concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,8517
6
+ concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,15468
benchmarks/results_pro/20251109-100021/summary.json ADDED
@@ -0,0 +1,21 @@
1
+ {
2
+ "timestamp": "2025-11-09T10:01:06",
3
+ "split": "dev",
4
+ "config": "configs/sqlite_pipeline.yaml",
5
+ "total": 5,
6
+ "success": 5,
7
+ "success_rate": 1.0,
8
+ "avg_latency_ms": 8978.6,
9
+ "p50_latency_ms": 9656.0,
10
+ "p95_latency_ms": 14624.8,
11
+ "EM": 0.4,
12
+ "SM": 0.8,
13
+ "ExecAcc": 0.8,
14
+ "detector_avg_ms": 0.0,
15
+ "planner_avg_ms": 8036.0,
16
+ "generator_avg_ms": 1049.25,
17
+ "safety_avg_ms": 0.33,
18
+ "executor_avg_ms": 0.75,
19
+ "verifier_avg_ms": 0.0,
20
+ "repair_avg_ms": 1065.62
21
+ }