Melika Kheirieh committed on
Commit
bf06cf7
·
1 Parent(s): 3b2af0f

feat(core): stabilize benchmark pipeline with accurate latency tracking, retry-empty handling, and refined plots

Browse files
adapters/llm/openai_provider.py CHANGED
@@ -147,6 +147,7 @@ CRITICAL RULES:
147
  6. Use lowercase for SQL keywords (select, from, where, etc.)
148
  7. Do not add unnecessary parentheses or formatting
149
  8. Match exact column and table names from the schema (case-sensitive)
 
150
 
151
  IMPORTANT:
152
  - For counting all rows: Use COUNT(*) not COUNT(column_name)
 
147
  6. Use lowercase for SQL keywords (select, from, where, etc.)
148
  7. Do not add unnecessary parentheses or formatting
149
  8. Match exact column and table names from the schema (case-sensitive)
150
+ 9. NEVER return empty SQL. If unsure, return the simplest valid SQL that answers the question.
151
 
152
  IMPORTANT:
153
  - For counting all rows: Use COUNT(*) not COUNT(column_name)
benchmarks/evaluate_spider_pro.py CHANGED
@@ -66,31 +66,43 @@ def extract_clean_sql(text: str | None) -> str:
66
 
67
 
68
  def normalize_sql(sql: str) -> str:
69
- """Light normalization to make EM stricter-but-fair."""
 
 
 
 
 
 
 
70
  if not sql:
71
  return ""
72
  s = sql.strip()
73
- # unify case but keep literals recognizable
74
- s = re.sub(r"\s+", " ", s).strip()
75
- s = s.rstrip(";")
76
 
77
- # drop table prefixes a.b -> b
78
- s = re.sub(r"\b\w+\.(\w+)\b", r"\1", s)
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- # collapse quotes around identifiers
81
  s = re.sub(r"`([A-Za-z_]\w*)`", r"\1", s)
82
  s = re.sub(r'"([A-Za-z_]\w*)"', r"\1", s)
83
 
84
- # COUNT(foo) -> COUNT(*), DISTINCT inside COUNT -> COUNT(*)
85
- s = re.sub(r"(?i)COUNT\s*\(\s*DISTINCT\s+[^)]+\)", "COUNT(*)", s)
86
- s = re.sub(r"(?i)COUNT\s*\(\s*[A-Za-z_]\w*\s*\)", "COUNT(*)", s)
87
 
88
- # strip trailing LIMIT n
89
- s = re.sub(r"(?i)\s+LIMIT\s+\d+\s*$", "", s)
90
-
91
- # canonical whitespace + upper keywords for stability
92
  s = re.sub(r"\s+", " ", s).strip()
93
- # keyword upper (a bit heuristic)
 
94
  for kw in [
95
  "select",
96
  "from",
@@ -107,6 +119,7 @@ def normalize_sql(sql: str) -> str:
107
  "desc",
108
  ]:
109
  s = re.sub(rf"(?i)\b{kw}\b", kw.upper(), s)
 
110
  return s
111
 
112
 
 
66
 
67
 
68
  def normalize_sql(sql: str) -> str:
69
+ """
70
+ Conservative normalization for exact-match (EM):
71
+ - Trim, collapse spaces, drop trailing ';'
72
+ - Drop trailing 'LIMIT n'
73
+ - Remove table prefixes only in single-table, no-join queries
74
+ - Unquote identifiers like `name` or "name"
75
+ - Uppercase common SQL keywords (string literals unaffected)
76
+ """
77
  if not sql:
78
  return ""
79
  s = sql.strip()
 
 
 
80
 
81
+ # Collapse whitespace early and drop trailing ';'
82
+ s = re.sub(r"\s+", " ", s).strip().rstrip(";")
83
+
84
+ # Drop trailing LIMIT n
85
+ s = re.sub(r"(?i)\s+LIMIT\s+\d+\s*$", "", s)
86
+
87
+ # Remove table prefixes only if single FROM and no JOIN
88
+ lower = s.lower()
89
+ if lower.count(" from ") == 1 and " join " not in lower:
90
+ m = re.search(r"(?i)\bfrom\s+([a-z_][a-z0-9_]*)", s, flags=re.IGNORECASE)
91
+ if m:
92
+ table = m.group(1)
93
+ s = re.sub(rf"\b{re.escape(table)}\.(\w+)\b", r"\1", s)
94
 
95
+ # Unquote identifiers: `foo` -> foo, "foo" -> foo (strings '...' remain)
96
  s = re.sub(r"`([A-Za-z_]\w*)`", r"\1", s)
97
  s = re.sub(r'"([A-Za-z_]\w*)"', r"\1", s)
98
 
99
+ # Normalize comma spacing: "a , b" -> "a, b"
100
+ s = re.sub(r"\s*,\s*", ", ", s)
 
101
 
102
+ # Final whitespace collapse
 
 
 
103
  s = re.sub(r"\s+", " ", s).strip()
104
+
105
+ # Uppercase common keywords (word-boundary safe)
106
  for kw in [
107
  "select",
108
  "from",
 
119
  "desc",
120
  ]:
121
  s = re.sub(rf"(?i)\b{kw}\b", kw.upper(), s)
122
+
123
  return s
124
 
125
 
benchmarks/results_pro/20251109-125209/eval.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 9129, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6689, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 2430, "summary": "failed", "notes": {"rationale_len": 30}}, {"stage": "safety", "duration_ms": 2, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 2, "summary": "failed", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
2
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 10009, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 9046, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 960, "summary": "failed", "notes": {"rationale_len": 30}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 2, "summary": "failed", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
3
+ {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
4
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc", "ok": true, "latency_ms": 9560, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8063, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1494, "summary": "failed", "notes": {"rationale_len": 85}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 6, "col_count": 3, "sql_length": 55}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 55, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": false, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
5
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 12661, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7869, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 2410, "summary": "failed", "notes": {"rationale_len": 67}}, {"stage": "safety", "duration_ms": 3, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 2, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "repair", "duration_ms": 1179, "summary": "failed", "notes": {"old_sql_len": 72, "new_sql_len": 80}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 80}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 80, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "repair", "duration_ms": 1191, "summary": "failed", "notes": {"old_sql_len": 80, "new_sql_len": 72}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, 
"sql_length": 72}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "pipeline", "duration_ms": 1, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
benchmarks/results_pro/20251109-125209/latency_histogram.png ADDED
benchmarks/results_pro/20251109-125209/latency_per_stage.png ADDED
benchmarks/results_pro/20251109-125209/metrics_overview.png ADDED
benchmarks/results_pro/20251109-125209/results.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ db_id,query,ok,em,sm,exec_acc,latency_ms
2
+ concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,9129
3
+ concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,10009
4
+ concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
5
+ concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,9560
6
+ concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,12661
benchmarks/results_pro/20251109-125209/summary.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2025-11-09T12:52:50",
3
+ "split": "dev",
4
+ "config": "configs/sqlite_pipeline.yaml",
5
+ "total": 5,
6
+ "success": 5,
7
+ "success_rate": 1.0,
8
+ "avg_latency_ms": 8271.8,
9
+ "p50_latency_ms": 9560.0,
10
+ "p95_latency_ms": 12130.6,
11
+ "EM": 0.4,
12
+ "SM": 0.8,
13
+ "ExecAcc": 0.8,
14
+ "detector_avg_ms": 1.0,
15
+ "planner_avg_ms": 7916.75,
16
+ "generator_avg_ms": 1823.5,
17
+ "safety_avg_ms": 1.5,
18
+ "executor_avg_ms": 1.5,
19
+ "verifier_avg_ms": 1.0,
20
+ "repair_avg_ms": 1185.0
21
+ }
benchmarks/results_pro/20251109-125509/eval.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 7007, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5977, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1019, "summary": "failed", "notes": {"rationale_len": 30}}, {"stage": "safety", "duration_ms": 3, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 4, "summary": "failed", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
2
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 5715, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 4893, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 819, "summary": "failed", "notes": {"rationale_len": 30}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
3
+ {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
4
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc", "ok": true, "latency_ms": 7982, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6702, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1278, "summary": "failed", "notes": {"rationale_len": 85}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 6, "col_count": 3, "sql_length": 55}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 55, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": false, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
5
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 11942, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8334, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1279, "summary": "failed", "notes": {"rationale_len": 71}}, {"stage": "safety", "duration_ms": 2, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "repair", "duration_ms": 1242, "summary": "failed", "notes": {"old_sql_len": 72, "new_sql_len": 80}}, {"stage": "safety", "duration_ms": 2, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 80}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 80, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "repair", "duration_ms": 1075, "summary": "failed", "notes": {"old_sql_len": 80, "new_sql_len": 72}}, {"stage": "safety", "duration_ms": 2, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, 
"sql_length": 72}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "pipeline", "duration_ms": 1, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
benchmarks/results_pro/20251109-125509/latency_histogram.png ADDED
benchmarks/results_pro/20251109-125509/latency_per_stage.png ADDED
benchmarks/results_pro/20251109-125509/metrics_overview.png ADDED
benchmarks/results_pro/20251109-125509/results.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ db_id,query,ok,em,sm,exec_acc,latency_ms
2
+ concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,7007
3
+ concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,5715
4
+ concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
5
+ concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,7982
6
+ concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,11942
benchmarks/results_pro/20251109-125509/summary.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "timestamp": "2025-11-09T12:55:42",
3
+ "split": "dev",
4
+ "config": "configs/sqlite_pipeline.yaml",
5
+ "total": 5,
6
+ "success": 5,
7
+ "success_rate": 1.0,
8
+ "avg_latency_ms": 6529.2,
9
+ "p50_latency_ms": 7007.0,
10
+ "p95_latency_ms": 11150.0,
11
+ "EM": 0.4,
12
+ "SM": 0.8,
13
+ "ExecAcc": 0.8,
14
+ "detector_avg_ms": 1.0,
15
+ "planner_avg_ms": 6476.5,
16
+ "generator_avg_ms": 1098.75,
17
+ "safety_avg_ms": 1.83,
18
+ "executor_avg_ms": 1.5,
19
+ "verifier_avg_ms": 1.0,
20
+ "repair_avg_ms": 1158.5
21
+ }