Spaces:
Sleeping
Sleeping
Melika Kheirieh
commited on
Commit
Β·
3716701
1
Parent(s):
296a94d
feat(trace): enrich StageTrace (sql_length/row_count/verified/error_type/repair_attempts/skipped) and propagate in normalization; tag EmptySQL; annotate repair attempts
Browse files- benchmarks/results_pro/20251109-092540/summary.json +0 -12
- benchmarks/results_pro/20251109-092823/eval.jsonl +0 -5
- benchmarks/results_pro/20251109-092823/summary.json +0 -12
- benchmarks/results_pro/20251109-093743/eval.jsonl +0 -5
- benchmarks/results_pro/20251109-093743/summary.json +0 -12
- benchmarks/results_pro/20251109-095552/eval.jsonl +0 -5
- benchmarks/results_pro/20251109-095552/summary.json +0 -12
- benchmarks/results_pro/20251109-100021/eval.jsonl +0 -5
- benchmarks/results_pro/{20251109-092540 β 20251109-100618}/eval.jsonl +4 -4
- benchmarks/results_pro/20251109-100618/latency_histogram.png +0 -0
- benchmarks/results_pro/20251109-100618/latency_per_stage.png +0 -0
- benchmarks/results_pro/20251109-100618/metrics_overview.png +0 -0
- benchmarks/results_pro/{20251109-100021 β 20251109-100618}/results.csv +4 -4
- benchmarks/results_pro/20251109-100618/summary.json +21 -0
- benchmarks/results_pro/20251109-103601/eval.jsonl +5 -0
- benchmarks/results_pro/20251109-103601/results.csv +6 -0
- benchmarks/results_pro/{20251109-100021 β 20251109-103601}/summary.json +10 -10
- nl2sql/pipeline.py +21 -2
- nl2sql/types.py +8 -0
benchmarks/results_pro/20251109-092540/summary.json
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"timestamp": "2025-11-09T09:26:21",
|
| 3 |
-
"total": 5,
|
| 4 |
-
"success": 5,
|
| 5 |
-
"success_rate": 1.0,
|
| 6 |
-
"avg_latency_ms": 8215.8,
|
| 7 |
-
"EM": 0.4,
|
| 8 |
-
"SM": 0.8,
|
| 9 |
-
"ExecAcc": 0.8,
|
| 10 |
-
"split": "dev",
|
| 11 |
-
"config": "configs/sqlite_pipeline.yaml"
|
| 12 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/20251109-092823/eval.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 7982, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5384, "summary": "ok", "notes": {"len_plan": 1287}, "token_in": 270, "token_out": 306, "cost_usd": 0.0002241}, {"stage": "generator", "duration_ms": 900, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 794, "token_out": 19, "cost_usd": 0.0001305}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 888, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 797, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 2 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 9717, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6881, "summary": "ok", "notes": {"len_plan": 1352}, "token_in": 271, "token_out": 319, "cost_usd": 0.00023204999999999998}, {"stage": "generator", "duration_ms": 1162, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 808, "token_out": 19, "cost_usd": 0.0001326}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 716, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 950, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 3 |
-
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
|
| 4 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 8523, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5311, "summary": "ok", "notes": {"len_plan": 1449}, "token_in": 281, "token_out": 343, "cost_usd": 0.00024795}, {"stage": "generator", "duration_ms": 1306, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 842, "token_out": 37, "cost_usd": 0.00014849999999999998}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 996, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 325, "token_out": 21, "cost_usd": 6.135e-05}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 900, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 328, "token_out": 21, "cost_usd": 6.18e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 5 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 12291, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8346, "summary": "ok", "notes": {"len_plan": 1363}, "token_in": 279, "token_out": 334, "cost_usd": 0.00024225}, {"stage": "generator", "duration_ms": 1636, "summary": "ok", "notes": {"rationale_len": 87}, "token_in": 831, "token_out": 46, "cost_usd": 0.00015225}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 2, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1137, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 333, "token_out": 25, "cost_usd": 6.495e-05}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 3, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1151, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 337, "token_out": 28, "cost_usd": 6.735e-05}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/20251109-092823/summary.json
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"timestamp": "2025-11-09T09:29:01",
|
| 3 |
-
"total": 5,
|
| 4 |
-
"success": 5,
|
| 5 |
-
"success_rate": 1.0,
|
| 6 |
-
"avg_latency_ms": 7702.6,
|
| 7 |
-
"EM": 0.4,
|
| 8 |
-
"SM": 0.8,
|
| 9 |
-
"ExecAcc": 0.8,
|
| 10 |
-
"split": "dev",
|
| 11 |
-
"config": "configs/sqlite_pipeline.yaml"
|
| 12 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/20251109-093743/eval.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10480, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8010, "summary": "ok", "notes": {"len_plan": 1445}, "token_in": 270, "token_out": 337, "cost_usd": 0.00024270000000000002}, {"stage": "generator", "duration_ms": 1029, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 825, "token_out": 19, "cost_usd": 0.00013514999999999998}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 678, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 750, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 2 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10687, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6978, "summary": "ok", "notes": {"len_plan": 1512}, "token_in": 271, "token_out": 355, "cost_usd": 0.00025364999999999996}, {"stage": "generator", "duration_ms": 2192, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 844, "token_out": 19, "cost_usd": 0.000138}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 652, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 863, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 3 |
-
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
|
| 4 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 16736, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 13205, "summary": "ok", "notes": {"len_plan": 1758}, "token_in": 281, "token_out": 409, "cost_usd": 0.00028754999999999997}, {"stage": "generator", "duration_ms": 1537, "summary": "ok", "notes": {"rationale_len": 83}, "token_in": 908, "token_out": 37, "cost_usd": 0.0001584}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1019, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 325, "token_out": 21, "cost_usd": 6.135e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 968, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 328, "token_out": 21, "cost_usd": 6.18e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 5 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 12440, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7973, "summary": "ok", "notes": {"len_plan": 1377}, "token_in": 279, "token_out": 345, "cost_usd": 0.00024884999999999995}, {"stage": "generator", "duration_ms": 1827, "summary": "ok", "notes": {"rationale_len": 94}, "token_in": 841, "token_out": 47, "cost_usd": 0.00015434999999999998}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1312, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 333, "token_out": 24, "cost_usd": 6.435e-05}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 2, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1313, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 337, "token_out": 21, "cost_usd": 6.315e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/20251109-093743/summary.json
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"timestamp": "2025-11-09T09:38:33",
|
| 3 |
-
"total": 5,
|
| 4 |
-
"success": 5,
|
| 5 |
-
"success_rate": 1.0,
|
| 6 |
-
"avg_latency_ms": 10068.6,
|
| 7 |
-
"EM": 0.4,
|
| 8 |
-
"SM": 0.8,
|
| 9 |
-
"ExecAcc": 0.8,
|
| 10 |
-
"split": "dev",
|
| 11 |
-
"config": "configs/sqlite_pipeline.yaml"
|
| 12 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/20251109-095552/eval.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11661, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8989, "summary": "ok", "notes": {"len_plan": 1451}, "token_in": 270, "token_out": 347, "cost_usd": 0.0002487}, {"stage": "generator", "duration_ms": 977, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 834, "token_out": 19, "cost_usd": 0.00013649999999999998}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 745, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 937, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 2 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 9786, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6574, "summary": "ok", "notes": {"len_plan": 1479}, "token_in": 271, "token_out": 343, "cost_usd": 0.00024645}, {"stage": "generator", "duration_ms": 955, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 831, "token_out": 19, "cost_usd": 0.00013605}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 986, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 318, "token_out": 8, "cost_usd": 5.2499999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1262, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 321, "token_out": 8, "cost_usd": 5.295e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 3 |
-
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
|
| 4 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 8674, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5293, "summary": "ok", "notes": {"len_plan": 1333}, "token_in": 281, "token_out": 305, "cost_usd": 0.00022514999999999997}, {"stage": "generator", "duration_ms": 1510, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 803, "token_out": 37, "cost_usd": 0.00014265}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 857, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 325, "token_out": 21, "cost_usd": 6.135e-05}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1004, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 328, "token_out": 21, "cost_usd": 6.18e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 5 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 11247, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7296, "summary": "ok", "notes": {"len_plan": 1578}, "token_in": 279, "token_out": 425, "cost_usd": 0.00029685}, {"stage": "generator", "duration_ms": 1552, "summary": "ok", "notes": {"rationale_len": 67}, "token_in": 921, "token_out": 42, "cost_usd": 0.00016334999999999999}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1222, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 333, "token_out": 24, "cost_usd": 6.435e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1163, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 337, "token_out": 28, "cost_usd": 6.735e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/20251109-095552/summary.json
DELETED
|
@@ -1,12 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"timestamp": "2025-11-09T09:56:33",
|
| 3 |
-
"total": 5,
|
| 4 |
-
"success": 5,
|
| 5 |
-
"success_rate": 1.0,
|
| 6 |
-
"avg_latency_ms": 8273.6,
|
| 7 |
-
"EM": 0.4,
|
| 8 |
-
"SM": 0.8,
|
| 9 |
-
"ExecAcc": 0.8,
|
| 10 |
-
"split": "dev",
|
| 11 |
-
"config": "configs/sqlite_pipeline.yaml"
|
| 12 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/20251109-100021/eval.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 9656, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7138, "summary": "ok", "notes": {"len_plan": 1287}, "token_in": 265, "token_out": 303, "cost_usd": 0.00022154999999999996}, {"stage": "generator", "duration_ms": 875, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 785, "token_out": 19, "cost_usd": 0.00012915}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 803, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 829, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 2 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11252, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8353, "summary": "ok", "notes": {"len_plan": 1399}, "token_in": 266, "token_out": 330, "cost_usd": 0.00023789999999999998}, {"stage": "generator", "duration_ms": 1048, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 813, "token_out": 19, "cost_usd": 0.00013335}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 794, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1052, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 3 |
-
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
|
| 4 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 8517, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5263, "summary": "ok", "notes": {"len_plan": 1304}, "token_in": 276, "token_out": 300, "cost_usd": 0.0002214}, {"stage": "generator", "duration_ms": 1022, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 793, "token_out": 37, "cost_usd": 0.00014115}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 977, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 320, "token_out": 21, "cost_usd": 6.0599999999999996e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1249, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 323, "token_out": 21, "cost_usd": 6.104999999999999e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 5 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 15468, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 11390, "summary": "ok", "notes": {"len_plan": 1400}, "token_in": 274, "token_out": 348, "cost_usd": 0.0002499}, {"stage": "generator", "duration_ms": 1252, "summary": "ok", "notes": {"rationale_len": 95}, "token_in": 839, "token_out": 45, "cost_usd": 0.00015285}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1384, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1437, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 332, "token_out": 21, "cost_usd": 6.24e-05}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/{20251109-092540 β 20251109-100618}/eval.jsonl
RENAMED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms":
|
| 2 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms":
|
| 3 |
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
|
| 4 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms":
|
| 5 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms":
|
|
|
|
| 1 |
+
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10217, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7626, "summary": "ok", "notes": {"len_plan": 1263}, "token_in": 265, "token_out": 303, "cost_usd": 0.00022154999999999996}, {"stage": "generator", "duration_ms": 1176, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 785, "token_out": 19, "cost_usd": 0.00012915}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 3, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 2, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 751, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 27}, "token_in": 313, "token_out": 6, "cost_usd": 5.0549999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 643, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 2 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10180, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7783, "summary": "ok", "notes": {"len_plan": 1307}, "token_in": 266, "token_out": 307, "cost_usd": 0.00022409999999999997}, {"stage": "generator", "duration_ms": 939, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 790, "token_out": 19, "cost_usd": 0.00012989999999999999}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 2, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 711, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 732, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 3 |
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
|
| 4 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 7726, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 4966, "summary": "ok", "notes": {"len_plan": 1207}, "token_in": 276, "token_out": 278, "cost_usd": 0.0002082}, {"stage": "generator", "duration_ms": 1007, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 771, "token_out": 37, "cost_usd": 0.00013785}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 938, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 320, "token_out": 21, "cost_usd": 6.0599999999999996e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 807, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 323, "token_out": 21, "cost_usd": 6.104999999999999e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 5 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 16635, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 13290, "summary": "ok", "notes": {"len_plan": 1641}, "token_in": 274, "token_out": 434, "cost_usd": 0.0003015}, {"stage": "generator", "duration_ms": 1083, "summary": "ok", "notes": {"rationale_len": 67}, "token_in": 925, "token_out": 42, "cost_usd": 0.00016394999999999997}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1072, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1179, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 332, "token_out": 21, "cost_usd": 6.24e-05}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
benchmarks/results_pro/20251109-100618/latency_histogram.png
ADDED
|
benchmarks/results_pro/20251109-100618/latency_per_stage.png
ADDED
|
benchmarks/results_pro/20251109-100618/metrics_overview.png
ADDED
|
benchmarks/results_pro/{20251109-100021 β 20251109-100618}/results.csv
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
db_id,query,ok,em,sm,exec_acc,latency_ms
|
| 2 |
-
concert_singer,"How many singers do we have?",β
,1.0,1.0,1.0,
|
| 3 |
-
concert_singer,"What is the total number of singers?",β
,1.0,1.0,1.0,
|
| 4 |
concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",β
,0.0,0.0,0.0,0
|
| 5 |
-
concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",β
,0.0,1.0,1.0,
|
| 6 |
-
concert_singer,"What is the average, minimum, and maximum age of all singers from France?",β
,0.0,1.0,1.0,
|
|
|
|
| 1 |
db_id,query,ok,em,sm,exec_acc,latency_ms
|
| 2 |
+
concert_singer,"How many singers do we have?",β
,1.0,1.0,1.0,10217
|
| 3 |
+
concert_singer,"What is the total number of singers?",β
,1.0,1.0,1.0,10180
|
| 4 |
concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",β
,0.0,0.0,0.0,0
|
| 5 |
+
concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",β
,0.0,1.0,1.0,7726
|
| 6 |
+
concert_singer,"What is the average, minimum, and maximum age of all singers from France?",β
,0.0,1.0,1.0,16635
|
benchmarks/results_pro/20251109-100618/summary.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"timestamp": "2025-11-09T10:07:03",
|
| 3 |
+
"split": "dev",
|
| 4 |
+
"config": "configs/sqlite_pipeline.yaml",
|
| 5 |
+
"total": 5,
|
| 6 |
+
"success": 5,
|
| 7 |
+
"success_rate": 1.0,
|
| 8 |
+
"avg_latency_ms": 8951.6,
|
| 9 |
+
"p50_latency_ms": 10180.0,
|
| 10 |
+
"p95_latency_ms": 15351.4,
|
| 11 |
+
"EM": 0.4,
|
| 12 |
+
"SM": 0.8,
|
| 13 |
+
"ExecAcc": 0.8,
|
| 14 |
+
"detector_avg_ms": 0.0,
|
| 15 |
+
"planner_avg_ms": 8416.25,
|
| 16 |
+
"generator_avg_ms": 1051.25,
|
| 17 |
+
"safety_avg_ms": 1.25,
|
| 18 |
+
"executor_avg_ms": 1.33,
|
| 19 |
+
"verifier_avg_ms": 0.58,
|
| 20 |
+
"repair_avg_ms": 854.12
|
| 21 |
+
}
|
benchmarks/results_pro/20251109-103601/eval.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10975, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 8073, "summary": "ok", "notes": {"len_plan": 1533}, "token_in": 265, "token_out": 384, "cost_usd": 0.00027015, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 959, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 866, "token_out": 19, "cost_usd": 0.0001413, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 884, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35, "attempt": 1}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1040, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35, "attempt": 2}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
|
| 2 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11792, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 8633, "summary": "ok", "notes": {"len_plan": 1444}, "token_in": 266, "token_out": 354, "cost_usd": 0.00025229999999999995, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 1406, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 837, "token_out": 19, "cost_usd": 0.00013695, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 988, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35, "attempt": 1}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 755, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35, "attempt": 2}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
|
| 3 |
+
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}, "skipped": false}]}
|
| 4 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 9181, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 5624, "summary": "ok", "notes": {"len_plan": 1296}, "token_in": 276, "token_out": 297, "cost_usd": 0.00021959999999999997, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 1398, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 790, "token_out": 37, "cost_usd": 0.0001407, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 948, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64, "attempt": 1}, "token_in": 320, "token_out": 21, "cost_usd": 6.0599999999999996e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1200, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64, "attempt": 2}, "token_in": 323, "token_out": 21, "cost_usd": 6.104999999999999e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
|
| 5 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 14419, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 9792, "summary": "ok", "notes": {"len_plan": 1406}, "token_in": 274, "token_out": 348, "cost_usd": 0.0002499, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 1727, "summary": "ok", "notes": {"rationale_len": 90}, "token_in": 839, "token_out": 46, "cost_usd": 0.00015345, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 4, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1130, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80, "attempt": 1}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1752, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72, "attempt": 2}, "token_in": 332, "token_out": 25, "cost_usd": 6.48e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
|
benchmarks/results_pro/20251109-103601/results.csv
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
db_id,query,ok,em,sm,exec_acc,latency_ms
|
| 2 |
+
concert_singer,"How many singers do we have?",β
,1.0,1.0,1.0,10975
|
| 3 |
+
concert_singer,"What is the total number of singers?",β
,1.0,1.0,1.0,11792
|
| 4 |
+
concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",β
,0.0,0.0,0.0,0
|
| 5 |
+
concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",β
,0.0,1.0,1.0,9181
|
| 6 |
+
concert_singer,"What is the average, minimum, and maximum age of all singers from France?",β
,0.0,1.0,1.0,14419
|
benchmarks/results_pro/{20251109-100021 β 20251109-103601}/summary.json
RENAMED
|
@@ -1,21 +1,21 @@
|
|
| 1 |
{
|
| 2 |
-
"timestamp": "2025-11-09T10:
|
| 3 |
"split": "dev",
|
| 4 |
"config": "configs/sqlite_pipeline.yaml",
|
| 5 |
"total": 5,
|
| 6 |
"success": 5,
|
| 7 |
"success_rate": 1.0,
|
| 8 |
-
"avg_latency_ms":
|
| 9 |
-
"p50_latency_ms":
|
| 10 |
-
"p95_latency_ms":
|
| 11 |
"EM": 0.4,
|
| 12 |
"SM": 0.8,
|
| 13 |
"ExecAcc": 0.8,
|
| 14 |
"detector_avg_ms": 0.0,
|
| 15 |
-
"planner_avg_ms":
|
| 16 |
-
"generator_avg_ms":
|
| 17 |
-
"safety_avg_ms":
|
| 18 |
-
"executor_avg_ms":
|
| 19 |
-
"verifier_avg_ms": 0.
|
| 20 |
-
"repair_avg_ms":
|
| 21 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"timestamp": "2025-11-09T10:36:47",
|
| 3 |
"split": "dev",
|
| 4 |
"config": "configs/sqlite_pipeline.yaml",
|
| 5 |
"total": 5,
|
| 6 |
"success": 5,
|
| 7 |
"success_rate": 1.0,
|
| 8 |
+
"avg_latency_ms": 9273.4,
|
| 9 |
+
"p50_latency_ms": 10975.0,
|
| 10 |
+
"p95_latency_ms": 13893.6,
|
| 11 |
"EM": 0.4,
|
| 12 |
"SM": 0.8,
|
| 13 |
"ExecAcc": 0.8,
|
| 14 |
"detector_avg_ms": 0.0,
|
| 15 |
+
"planner_avg_ms": 8030.5,
|
| 16 |
+
"generator_avg_ms": 1372.5,
|
| 17 |
+
"safety_avg_ms": 1.5,
|
| 18 |
+
"executor_avg_ms": 1.08,
|
| 19 |
+
"verifier_avg_ms": 0.25,
|
| 20 |
+
"repair_avg_ms": 1087.12
|
| 21 |
}
|
nl2sql/pipeline.py
CHANGED
|
@@ -74,12 +74,14 @@ class Pipeline:
|
|
| 74 |
duration_ms: float,
|
| 75 |
summary: str,
|
| 76 |
notes: Optional[Dict[str, Any]] = None,
|
|
|
|
| 77 |
) -> dict:
|
| 78 |
return {
|
| 79 |
"stage": stage,
|
| 80 |
"duration_ms": float(duration_ms),
|
| 81 |
"summary": summary,
|
| 82 |
"notes": notes or {},
|
|
|
|
| 83 |
}
|
| 84 |
|
| 85 |
@staticmethod
|
|
@@ -102,7 +104,17 @@ class Pipeline:
|
|
| 102 |
"summary": summary,
|
| 103 |
"notes": notes,
|
| 104 |
}
|
| 105 |
-
for k in (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
if k in t:
|
| 107 |
payload[k] = t[k]
|
| 108 |
norm.append(payload)
|
|
@@ -231,7 +243,12 @@ class Pipeline:
|
|
| 231 |
if not sql or not str(sql).strip():
|
| 232 |
pipeline_runs_total.labels(status="error").inc()
|
| 233 |
traces.append(
|
| 234 |
-
self._mk_trace(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
)
|
| 236 |
return FinalResult(
|
| 237 |
ok=False,
|
|
@@ -318,6 +335,8 @@ class Pipeline:
|
|
| 318 |
traces.extend(self._trace_list(r_fix))
|
| 319 |
if not getattr(r_fix, "trace", None):
|
| 320 |
_fallback_trace("repair", dt, r_fix.ok)
|
|
|
|
|
|
|
| 321 |
if not r_fix.ok:
|
| 322 |
break
|
| 323 |
|
|
|
|
| 74 |
duration_ms: float,
|
| 75 |
summary: str,
|
| 76 |
notes: Optional[Dict[str, Any]] = None,
|
| 77 |
+
skipped: bool = False,
|
| 78 |
) -> dict:
|
| 79 |
return {
|
| 80 |
"stage": stage,
|
| 81 |
"duration_ms": float(duration_ms),
|
| 82 |
"summary": summary,
|
| 83 |
"notes": notes or {},
|
| 84 |
+
"skipped": bool(skipped),
|
| 85 |
}
|
| 86 |
|
| 87 |
@staticmethod
|
|
|
|
| 104 |
"summary": summary,
|
| 105 |
"notes": notes,
|
| 106 |
}
|
| 107 |
+
for k in (
|
| 108 |
+
"token_in",
|
| 109 |
+
"token_out",
|
| 110 |
+
"cost_usd",
|
| 111 |
+
"sql_length",
|
| 112 |
+
"row_count",
|
| 113 |
+
"verified",
|
| 114 |
+
"error_type",
|
| 115 |
+
"repair_attempts",
|
| 116 |
+
"skipped",
|
| 117 |
+
):
|
| 118 |
if k in t:
|
| 119 |
payload[k] = t[k]
|
| 120 |
norm.append(payload)
|
|
|
|
| 243 |
if not sql or not str(sql).strip():
|
| 244 |
pipeline_runs_total.labels(status="error").inc()
|
| 245 |
traces.append(
|
| 246 |
+
self._mk_trace(
|
| 247 |
+
"generator",
|
| 248 |
+
0.0,
|
| 249 |
+
"failed",
|
| 250 |
+
{"reason": "empty_sql", "error_type": "EmptySQL"},
|
| 251 |
+
)
|
| 252 |
)
|
| 253 |
return FinalResult(
|
| 254 |
ok=False,
|
|
|
|
| 335 |
traces.extend(self._trace_list(r_fix))
|
| 336 |
if not getattr(r_fix, "trace", None):
|
| 337 |
_fallback_trace("repair", dt, r_fix.ok)
|
| 338 |
+
# annotate attempt
|
| 339 |
+
traces[-1]["notes"]["attempt"] = _attempt + 1
|
| 340 |
if not r_fix.ok:
|
| 341 |
break
|
| 342 |
|
nl2sql/types.py
CHANGED
|
@@ -12,6 +12,14 @@ class StageTrace:
|
|
| 12 |
token_out: Optional[int] = None
|
| 13 |
cost_usd: Optional[float] = None
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
@dataclass(frozen=True)
|
| 17 |
class StageResult:
|
|
|
|
| 12 |
token_out: Optional[int] = None
|
| 13 |
cost_usd: Optional[float] = None
|
| 14 |
|
| 15 |
+
# Enriched fields
|
| 16 |
+
sql_length: Optional[int] = None
|
| 17 |
+
row_count: Optional[int] = None
|
| 18 |
+
verified: Optional[bool] = None
|
| 19 |
+
error_type: Optional[str] = None
|
| 20 |
+
repair_attempts: Optional[int] = None
|
| 21 |
+
skipped: bool = False
|
| 22 |
+
|
| 23 |
|
| 24 |
@dataclass(frozen=True)
|
| 25 |
class StageResult:
|