Spaces:
Running
Running
Melika Kheirieh
committed on
Commit
·
bf06cf7
1
Parent(s):
3b2af0f
feat(core): stabilize benchmark pipeline with accurate latency tracking, retry-empty handling, and refined plots
Browse files- adapters/llm/openai_provider.py +1 -0
- benchmarks/evaluate_spider_pro.py +28 -15
- benchmarks/results_pro/20251109-125209/eval.jsonl +5 -0
- benchmarks/results_pro/20251109-125209/latency_histogram.png +0 -0
- benchmarks/results_pro/20251109-125209/latency_per_stage.png +0 -0
- benchmarks/results_pro/20251109-125209/metrics_overview.png +0 -0
- benchmarks/results_pro/20251109-125209/results.csv +6 -0
- benchmarks/results_pro/20251109-125209/summary.json +21 -0
- benchmarks/results_pro/20251109-125509/eval.jsonl +5 -0
- benchmarks/results_pro/20251109-125509/latency_histogram.png +0 -0
- benchmarks/results_pro/20251109-125509/latency_per_stage.png +0 -0
- benchmarks/results_pro/20251109-125509/metrics_overview.png +0 -0
- benchmarks/results_pro/20251109-125509/results.csv +6 -0
- benchmarks/results_pro/20251109-125509/summary.json +21 -0
adapters/llm/openai_provider.py
CHANGED
|
@@ -147,6 +147,7 @@ CRITICAL RULES:
|
|
| 147 |
6. Use lowercase for SQL keywords (select, from, where, etc.)
|
| 148 |
7. Do not add unnecessary parentheses or formatting
|
| 149 |
8. Match exact column and table names from the schema (case-sensitive)
|
|
|
|
| 150 |
|
| 151 |
IMPORTANT:
|
| 152 |
- For counting all rows: Use COUNT(*) not COUNT(column_name)
|
|
|
|
| 147 |
6. Use lowercase for SQL keywords (select, from, where, etc.)
|
| 148 |
7. Do not add unnecessary parentheses or formatting
|
| 149 |
8. Match exact column and table names from the schema (case-sensitive)
|
| 150 |
+
9. NEVER return empty SQL. If unsure, return the simplest valid SQL that answers the question.
|
| 151 |
|
| 152 |
IMPORTANT:
|
| 153 |
- For counting all rows: Use COUNT(*) not COUNT(column_name)
|
benchmarks/evaluate_spider_pro.py
CHANGED
|
@@ -66,31 +66,43 @@ def extract_clean_sql(text: str | None) -> str:
|
|
| 66 |
|
| 67 |
|
| 68 |
def normalize_sql(sql: str) -> str:
|
| 69 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
if not sql:
|
| 71 |
return ""
|
| 72 |
s = sql.strip()
|
| 73 |
-
# unify case but keep literals recognizable
|
| 74 |
-
s = re.sub(r"\s+", " ", s).strip()
|
| 75 |
-
s = s.rstrip(";")
|
| 76 |
|
| 77 |
-
#
|
| 78 |
-
s = re.sub(r"\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
#
|
| 81 |
s = re.sub(r"`([A-Za-z_]\w*)`", r"\1", s)
|
| 82 |
s = re.sub(r'"([A-Za-z_]\w*)"', r"\1", s)
|
| 83 |
|
| 84 |
-
#
|
| 85 |
-
s = re.sub(r"
|
| 86 |
-
s = re.sub(r"(?i)COUNT\s*\(\s*[A-Za-z_]\w*\s*\)", "COUNT(*)", s)
|
| 87 |
|
| 88 |
-
#
|
| 89 |
-
s = re.sub(r"(?i)\s+LIMIT\s+\d+\s*$", "", s)
|
| 90 |
-
|
| 91 |
-
# canonical whitespace + upper keywords for stability
|
| 92 |
s = re.sub(r"\s+", " ", s).strip()
|
| 93 |
-
|
|
|
|
| 94 |
for kw in [
|
| 95 |
"select",
|
| 96 |
"from",
|
|
@@ -107,6 +119,7 @@ def normalize_sql(sql: str) -> str:
|
|
| 107 |
"desc",
|
| 108 |
]:
|
| 109 |
s = re.sub(rf"(?i)\b{kw}\b", kw.upper(), s)
|
|
|
|
| 110 |
return s
|
| 111 |
|
| 112 |
|
|
|
|
| 66 |
|
| 67 |
|
| 68 |
def normalize_sql(sql: str) -> str:
|
| 69 |
+
"""
|
| 70 |
+
Conservative normalization for exact-match (EM):
|
| 71 |
+
- Trim, collapse spaces, drop trailing ';'
|
| 72 |
+
- Drop trailing 'LIMIT n'
|
| 73 |
+
- Remove table prefixes only in single-table, no-join queries
|
| 74 |
+
- Unquote identifiers like `name` or "name"
|
| 75 |
+
- Uppercase common SQL keywords (string literals unaffected)
|
| 76 |
+
"""
|
| 77 |
if not sql:
|
| 78 |
return ""
|
| 79 |
s = sql.strip()
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
+
# Collapse whitespace early and drop trailing ';'
|
| 82 |
+
s = re.sub(r"\s+", " ", s).strip().rstrip(";")
|
| 83 |
+
|
| 84 |
+
# Drop trailing LIMIT n
|
| 85 |
+
s = re.sub(r"(?i)\s+LIMIT\s+\d+\s*$", "", s)
|
| 86 |
+
|
| 87 |
+
# Remove table prefixes only if single FROM and no JOIN
|
| 88 |
+
lower = s.lower()
|
| 89 |
+
if lower.count(" from ") == 1 and " join " not in lower:
|
| 90 |
+
m = re.search(r"(?i)\bfrom\s+([a-z_][a-z0-9_]*)", s, flags=re.IGNORECASE)
|
| 91 |
+
if m:
|
| 92 |
+
table = m.group(1)
|
| 93 |
+
s = re.sub(rf"\b{re.escape(table)}\.(\w+)\b", r"\1", s)
|
| 94 |
|
| 95 |
+
# Unquote identifiers: `foo` -> foo, "foo" -> foo (strings '...' remain)
|
| 96 |
s = re.sub(r"`([A-Za-z_]\w*)`", r"\1", s)
|
| 97 |
s = re.sub(r'"([A-Za-z_]\w*)"', r"\1", s)
|
| 98 |
|
| 99 |
+
# Normalize comma spacing: "a , b" -> "a, b"
|
| 100 |
+
s = re.sub(r"\s*,\s*", ", ", s)
|
|
|
|
| 101 |
|
| 102 |
+
# Final whitespace collapse
|
|
|
|
|
|
|
|
|
|
| 103 |
s = re.sub(r"\s+", " ", s).strip()
|
| 104 |
+
|
| 105 |
+
# Uppercase common keywords (word-boundary safe)
|
| 106 |
for kw in [
|
| 107 |
"select",
|
| 108 |
"from",
|
|
|
|
| 119 |
"desc",
|
| 120 |
]:
|
| 121 |
s = re.sub(rf"(?i)\b{kw}\b", kw.upper(), s)
|
| 122 |
+
|
| 123 |
return s
|
| 124 |
|
| 125 |
|
benchmarks/results_pro/20251109-125209/eval.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 9129, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6689, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 2430, "summary": "failed", "notes": {"rationale_len": 30}}, {"stage": "safety", "duration_ms": 2, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 2, "summary": "failed", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 2 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 10009, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 9046, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 960, "summary": "failed", "notes": {"rationale_len": 30}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 2, "summary": "failed", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 3 |
+
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
|
| 4 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc", "ok": true, "latency_ms": 9560, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8063, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1494, "summary": "failed", "notes": {"rationale_len": 85}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 6, "col_count": 3, "sql_length": 55}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 55, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": false, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 5 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 12661, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7869, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 2410, "summary": "failed", "notes": {"rationale_len": 67}}, {"stage": "safety", "duration_ms": 3, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 2, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "repair", "duration_ms": 1179, "summary": "failed", "notes": {"old_sql_len": 72, "new_sql_len": 80}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 80}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 80, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "repair", "duration_ms": 1191, "summary": "failed", "notes": {"old_sql_len": 80, "new_sql_len": 72}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, 
"sql_length": 72}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "pipeline", "duration_ms": 1, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
benchmarks/results_pro/20251109-125209/latency_histogram.png
ADDED
|
benchmarks/results_pro/20251109-125209/latency_per_stage.png
ADDED
|
benchmarks/results_pro/20251109-125209/metrics_overview.png
ADDED
|
benchmarks/results_pro/20251109-125209/results.csv
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
db_id,query,ok,em,sm,exec_acc,latency_ms
|
| 2 |
+
concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,9129
|
| 3 |
+
concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,10009
|
| 4 |
+
concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
|
| 5 |
+
concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,9560
|
| 6 |
+
concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,12661
|
benchmarks/results_pro/20251109-125209/summary.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"timestamp": "2025-11-09T12:52:50",
|
| 3 |
+
"split": "dev",
|
| 4 |
+
"config": "configs/sqlite_pipeline.yaml",
|
| 5 |
+
"total": 5,
|
| 6 |
+
"success": 5,
|
| 7 |
+
"success_rate": 1.0,
|
| 8 |
+
"avg_latency_ms": 8271.8,
|
| 9 |
+
"p50_latency_ms": 9560.0,
|
| 10 |
+
"p95_latency_ms": 12130.6,
|
| 11 |
+
"EM": 0.4,
|
| 12 |
+
"SM": 0.8,
|
| 13 |
+
"ExecAcc": 0.8,
|
| 14 |
+
"detector_avg_ms": 1.0,
|
| 15 |
+
"planner_avg_ms": 7916.75,
|
| 16 |
+
"generator_avg_ms": 1823.5,
|
| 17 |
+
"safety_avg_ms": 1.5,
|
| 18 |
+
"executor_avg_ms": 1.5,
|
| 19 |
+
"verifier_avg_ms": 1.0,
|
| 20 |
+
"repair_avg_ms": 1185.0
|
| 21 |
+
}
|
benchmarks/results_pro/20251109-125509/eval.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 7007, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5977, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1019, "summary": "failed", "notes": {"rationale_len": 30}}, {"stage": "safety", "duration_ms": 3, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 4, "summary": "failed", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 2 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 5715, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 4893, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 819, "summary": "failed", "notes": {"rationale_len": 30}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 3 |
+
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
|
| 4 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc", "ok": true, "latency_ms": 7982, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6702, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1278, "summary": "failed", "notes": {"rationale_len": 85}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 6, "col_count": 3, "sql_length": 55}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 55, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": false, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 5 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 11942, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8334, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1279, "summary": "failed", "notes": {"rationale_len": 71}}, {"stage": "safety", "duration_ms": 2, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "repair", "duration_ms": 1242, "summary": "failed", "notes": {"old_sql_len": 72, "new_sql_len": 80}}, {"stage": "safety", "duration_ms": 2, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 80}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 80, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "repair", "duration_ms": 1075, "summary": "failed", "notes": {"old_sql_len": 80, "new_sql_len": 72}}, {"stage": "safety", "duration_ms": 2, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, 
"sql_length": 72}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "pipeline", "duration_ms": 1, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
benchmarks/results_pro/20251109-125509/latency_histogram.png
ADDED
|
benchmarks/results_pro/20251109-125509/latency_per_stage.png
ADDED
|
benchmarks/results_pro/20251109-125509/metrics_overview.png
ADDED
|
benchmarks/results_pro/20251109-125509/results.csv
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
db_id,query,ok,em,sm,exec_acc,latency_ms
|
| 2 |
+
concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,7007
|
| 3 |
+
concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,5715
|
| 4 |
+
concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
|
| 5 |
+
concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,7982
|
| 6 |
+
concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,11942
|
benchmarks/results_pro/20251109-125509/summary.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"timestamp": "2025-11-09T12:55:42",
|
| 3 |
+
"split": "dev",
|
| 4 |
+
"config": "configs/sqlite_pipeline.yaml",
|
| 5 |
+
"total": 5,
|
| 6 |
+
"success": 5,
|
| 7 |
+
"success_rate": 1.0,
|
| 8 |
+
"avg_latency_ms": 6529.2,
|
| 9 |
+
"p50_latency_ms": 7007.0,
|
| 10 |
+
"p95_latency_ms": 11150.0,
|
| 11 |
+
"EM": 0.4,
|
| 12 |
+
"SM": 0.8,
|
| 13 |
+
"ExecAcc": 0.8,
|
| 14 |
+
"detector_avg_ms": 1.0,
|
| 15 |
+
"planner_avg_ms": 6476.5,
|
| 16 |
+
"generator_avg_ms": 1098.75,
|
| 17 |
+
"safety_avg_ms": 1.83,
|
| 18 |
+
"executor_avg_ms": 1.5,
|
| 19 |
+
"verifier_avg_ms": 1.0,
|
| 20 |
+
"repair_avg_ms": 1158.5
|
| 21 |
+
}
|