Spaces:
Sleeping
Sleeping
Melika Kheirieh
commited on
Commit
·
3b2af0f
1
Parent(s):
8b2d603
fix(core): non-zero generator timing + one-shot EMPTY retry; post-verify drop LIMIT to recover EM when ExecAcc=1
Browse files- benchmarks/results_pro/20251109-100618/eval.jsonl +0 -5
- benchmarks/results_pro/20251109-100618/latency_histogram.png +0 -0
- benchmarks/results_pro/20251109-100618/latency_per_stage.png +0 -0
- benchmarks/results_pro/20251109-100618/metrics_overview.png +0 -0
- benchmarks/results_pro/20251109-103601/eval.jsonl +0 -5
- benchmarks/results_pro/20251109-105728/eval.jsonl +0 -5
- benchmarks/results_pro/20251109-105728/latency_histogram.png +0 -0
- benchmarks/results_pro/20251109-105728/latency_per_stage.png +0 -0
- benchmarks/results_pro/20251109-105728/metrics_overview.png +0 -0
- benchmarks/results_pro/20251109-105728/results.csv +0 -6
- benchmarks/results_pro/20251109-105728/summary.json +0 -21
- benchmarks/results_pro/20251109-123424/eval.jsonl +5 -0
- benchmarks/results_pro/{20251109-100618 → 20251109-123424}/results.csv +4 -4
- benchmarks/results_pro/{20251109-100618 → 20251109-123424}/summary.json +11 -11
- benchmarks/results_pro/20251109-124602/eval.jsonl +5 -0
- benchmarks/results_pro/20251109-124602/latency_histogram.png +0 -0
- benchmarks/results_pro/20251109-124602/latency_per_stage.png +0 -0
- benchmarks/results_pro/20251109-124602/metrics_overview.png +0 -0
- benchmarks/results_pro/{20251109-103601 → 20251109-124602}/results.csv +4 -4
- benchmarks/results_pro/{20251109-103601 → 20251109-124602}/summary.json +11 -11
- nl2sql/pipeline.py +310 -323
benchmarks/results_pro/20251109-100618/eval.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10217, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7626, "summary": "ok", "notes": {"len_plan": 1263}, "token_in": 265, "token_out": 303, "cost_usd": 0.00022154999999999996}, {"stage": "generator", "duration_ms": 1176, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 785, "token_out": 19, "cost_usd": 0.00012915}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 3, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 2, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 751, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 27}, "token_in": 313, "token_out": 6, "cost_usd": 5.0549999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 643, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 2 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10180, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7783, "summary": "ok", "notes": {"len_plan": 1307}, "token_in": 266, "token_out": 307, "cost_usd": 0.00022409999999999997}, {"stage": "generator", "duration_ms": 939, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 790, "token_out": 19, "cost_usd": 0.00012989999999999999}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 2, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 711, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 732, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 3 |
-
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
|
| 4 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 7726, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 4966, "summary": "ok", "notes": {"len_plan": 1207}, "token_in": 276, "token_out": 278, "cost_usd": 0.0002082}, {"stage": "generator", "duration_ms": 1007, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 771, "token_out": 37, "cost_usd": 0.00013785}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 938, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 320, "token_out": 21, "cost_usd": 6.0599999999999996e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 807, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 323, "token_out": 21, "cost_usd": 6.104999999999999e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 5 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 16635, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 13290, "summary": "ok", "notes": {"len_plan": 1641}, "token_in": 274, "token_out": 434, "cost_usd": 0.0003015}, {"stage": "generator", "duration_ms": 1083, "summary": "ok", "notes": {"rationale_len": 67}, "token_in": 925, "token_out": 42, "cost_usd": 0.00016394999999999997}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1072, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1179, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 332, "token_out": 21, "cost_usd": 6.24e-05}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/20251109-100618/latency_histogram.png
DELETED
|
Binary file (18.9 kB)
|
|
|
benchmarks/results_pro/20251109-100618/latency_per_stage.png
DELETED
|
Binary file (30.1 kB)
|
|
|
benchmarks/results_pro/20251109-100618/metrics_overview.png
DELETED
|
Binary file (20.1 kB)
|
|
|
benchmarks/results_pro/20251109-103601/eval.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10975, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 8073, "summary": "ok", "notes": {"len_plan": 1533}, "token_in": 265, "token_out": 384, "cost_usd": 0.00027015, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 959, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 866, "token_out": 19, "cost_usd": 0.0001413, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 884, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35, "attempt": 1}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1040, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35, "attempt": 2}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
|
| 2 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11792, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 8633, "summary": "ok", "notes": {"len_plan": 1444}, "token_in": 266, "token_out": 354, "cost_usd": 0.00025229999999999995, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 1406, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 837, "token_out": 19, "cost_usd": 0.00013695, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 988, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35, "attempt": 1}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 755, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35, "attempt": 2}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
|
| 3 |
-
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}, "skipped": false}]}
|
| 4 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 9181, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 5624, "summary": "ok", "notes": {"len_plan": 1296}, "token_in": 276, "token_out": 297, "cost_usd": 0.00021959999999999997, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 1398, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 790, "token_out": 37, "cost_usd": 0.0001407, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 948, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64, "attempt": 1}, "token_in": 320, "token_out": 21, "cost_usd": 6.0599999999999996e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1200, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64, "attempt": 2}, "token_in": 323, "token_out": 21, "cost_usd": 6.104999999999999e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
|
| 5 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 14419, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 9792, "summary": "ok", "notes": {"len_plan": 1406}, "token_in": 274, "token_out": 348, "cost_usd": 0.0002499, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 1727, "summary": "ok", "notes": {"rationale_len": 90}, "token_in": 839, "token_out": 46, "cost_usd": 0.00015345, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 4, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1130, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80, "attempt": 1}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1752, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72, "attempt": 2}, "token_in": 332, "token_out": 25, "cost_usd": 6.48e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/20251109-105728/eval.jsonl
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11836, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 6838, "summary": "ok", "notes": {"len_plan": 1460}, "token_in": 265, "token_out": 356, "cost_usd": 0.00025334999999999995, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 3409, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 838, "token_out": 19, "cost_usd": 0.0001371, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 832, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35, "attempt": 1}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 35}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 744, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35, "attempt": 2}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 35}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
|
| 2 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10414, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 5346, "summary": "ok", "notes": {"len_plan": 1385}, "token_in": 266, "token_out": 334, "cost_usd": 0.00024029999999999999, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 3352, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 817, "token_out": 19, "cost_usd": 0.00013394999999999998, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 4, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 871, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35, "attempt": 1}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 35}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 831, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35, "attempt": 2}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 35}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
|
| 3 |
-
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}, "skipped": false}]}
|
| 4 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 13807, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 8248, "summary": "ok", "notes": {"len_plan": 1415}, "token_in": 276, "token_out": 335, "cost_usd": 0.0002424, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 3686, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 828, "token_out": 37, "cost_usd": 0.00014639999999999998, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3, "sql_length": 55}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 960, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64, "attempt": 1}, "token_in": 320, "token_out": 21, "cost_usd": 6.0599999999999996e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3, "sql_length": 64}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 901, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64, "attempt": 2}, "token_in": 323, "token_out": 21, "cost_usd": 6.104999999999999e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3, "sql_length": 64}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
|
| 5 |
-
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 13396, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 7141, "summary": "ok", "notes": {"len_plan": 1569}, "token_in": 274, "token_out": 404, "cost_usd": 0.0002835, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 4139, "summary": "ok", "notes": {"rationale_len": 87}, "token_in": 895, "token_out": 46, "cost_usd": 0.00016184999999999998, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 937, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80, "attempt": 1}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3, "sql_length": 80}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1160, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72, "attempt": 2}, "token_in": 332, "token_out": 21, "cost_usd": 6.24e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/20251109-105728/latency_histogram.png
DELETED
|
Binary file (16.3 kB)
|
|
|
benchmarks/results_pro/20251109-105728/latency_per_stage.png
DELETED
|
Binary file (29.3 kB)
|
|
|
benchmarks/results_pro/20251109-105728/metrics_overview.png
DELETED
|
Binary file (19.6 kB)
|
|
|
benchmarks/results_pro/20251109-105728/results.csv
DELETED
|
@@ -1,6 +0,0 @@
|
|
| 1 |
-
db_id,query,ok,em,sm,exec_acc,latency_ms
|
| 2 |
-
concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,11836
|
| 3 |
-
concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,10414
|
| 4 |
-
concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
|
| 5 |
-
concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,13807
|
| 6 |
-
concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,13396
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/20251109-105728/summary.json
DELETED
|
@@ -1,21 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"timestamp": "2025-11-09T10:58:17",
|
| 3 |
-
"split": "dev",
|
| 4 |
-
"config": "configs/sqlite_pipeline.yaml",
|
| 5 |
-
"total": 5,
|
| 6 |
-
"success": 5,
|
| 7 |
-
"success_rate": 1.0,
|
| 8 |
-
"avg_latency_ms": 9890.6,
|
| 9 |
-
"p50_latency_ms": 11836.0,
|
| 10 |
-
"p95_latency_ms": 13724.8,
|
| 11 |
-
"EM": 0.4,
|
| 12 |
-
"SM": 0.8,
|
| 13 |
-
"ExecAcc": 0.8,
|
| 14 |
-
"detector_avg_ms": 0.0,
|
| 15 |
-
"planner_avg_ms": 6893.25,
|
| 16 |
-
"generator_avg_ms": 3646.5,
|
| 17 |
-
"safety_avg_ms": 1.67,
|
| 18 |
-
"executor_avg_ms": 1.33,
|
| 19 |
-
"verifier_avg_ms": 0.42,
|
| 20 |
-
"repair_avg_ms": 904.5
|
| 21 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
benchmarks/results_pro/20251109-123424/eval.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 10712, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 8455, "summary": "ok", "notes": {}, "skipped": false}, {"stage": "generator", "duration_ms": 2253, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 792, "token_out": 19, "cost_usd": 0.0001302, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}]}
|
| 2 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 12981, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 9963, "summary": "ok", "notes": {}, "skipped": false}, {"stage": "generator", "duration_ms": 3018, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 800, "token_out": 19, "cost_usd": 0.0001314, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}]}
|
| 3 |
+
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}, "skipped": false}]}
|
| 4 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc", "ok": true, "latency_ms": 9753, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 7528, "summary": "ok", "notes": {}, "skipped": false}, {"stage": "generator", "duration_ms": 2224, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 816, "token_out": 37, "cost_usd": 0.0001446, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3, "sql_length": 55}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"sql_length": 55, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": false, "mixes_cols": false, "verified": true}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}]}
|
| 5 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": false, "latency_ms": 12406, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 7105, "summary": "ok", "notes": {}, "skipped": false}, {"stage": "generator", "duration_ms": 2892, "summary": "ok", "notes": {"rationale_len": 67}, "token_in": 854, "token_out": 42, "cost_usd": 0.00015329999999999999, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1162, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3, "sql_length": 80}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "failed", "notes": {"sql_length": 80, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1241, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 332, "token_out": 21, "cost_usd": 6.24e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}]}
|
benchmarks/results_pro/{20251109-100618 → 20251109-123424}/results.csv
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
db_id,query,ok,em,sm,exec_acc,latency_ms
|
| 2 |
-
concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,
|
| 3 |
-
concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,
|
| 4 |
concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
|
| 5 |
-
concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,
|
| 6 |
-
concert_singer,"What is the average, minimum, and maximum age of all singers from France?"
|
|
|
|
| 1 |
db_id,query,ok,em,sm,exec_acc,latency_ms
|
| 2 |
+
concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,10712
|
| 3 |
+
concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,12981
|
| 4 |
concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
|
| 5 |
+
concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,9753
|
| 6 |
+
concert_singer,"What is the average, minimum, and maximum age of all singers from France?",❌,0.0,1.0,1.0,12406
|
benchmarks/results_pro/{20251109-100618 → 20251109-123424}/summary.json
RENAMED
|
@@ -1,21 +1,21 @@
|
|
| 1 |
{
|
| 2 |
-
"timestamp": "2025-11-
|
| 3 |
"split": "dev",
|
| 4 |
"config": "configs/sqlite_pipeline.yaml",
|
| 5 |
"total": 5,
|
| 6 |
-
"success":
|
| 7 |
-
"success_rate":
|
| 8 |
-
"avg_latency_ms":
|
| 9 |
-
"p50_latency_ms":
|
| 10 |
-
"p95_latency_ms":
|
| 11 |
"EM": 0.4,
|
| 12 |
"SM": 0.8,
|
| 13 |
"ExecAcc": 0.8,
|
| 14 |
"detector_avg_ms": 0.0,
|
| 15 |
-
"planner_avg_ms":
|
| 16 |
-
"generator_avg_ms":
|
| 17 |
"safety_avg_ms": 1.25,
|
| 18 |
-
"executor_avg_ms":
|
| 19 |
-
"verifier_avg_ms": 0.
|
| 20 |
-
"repair_avg_ms":
|
| 21 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"timestamp": "2025-11-09T12:35:10",
|
| 3 |
"split": "dev",
|
| 4 |
"config": "configs/sqlite_pipeline.yaml",
|
| 5 |
"total": 5,
|
| 6 |
+
"success": 4,
|
| 7 |
+
"success_rate": 0.8,
|
| 8 |
+
"avg_latency_ms": 9170.4,
|
| 9 |
+
"p50_latency_ms": 10712.0,
|
| 10 |
+
"p95_latency_ms": 12866.0,
|
| 11 |
"EM": 0.4,
|
| 12 |
"SM": 0.8,
|
| 13 |
"ExecAcc": 0.8,
|
| 14 |
"detector_avg_ms": 0.0,
|
| 15 |
+
"planner_avg_ms": 8262.75,
|
| 16 |
+
"generator_avg_ms": 2596.75,
|
| 17 |
"safety_avg_ms": 1.25,
|
| 18 |
+
"executor_avg_ms": 0.67,
|
| 19 |
+
"verifier_avg_ms": 0.0,
|
| 20 |
+
"repair_avg_ms": 1201.5
|
| 21 |
}
|
benchmarks/results_pro/20251109-124602/eval.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 9852, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8318, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1528, "summary": "failed", "notes": {"rationale_len": 30}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 2 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 12321, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 9326, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 2994, "summary": "failed", "notes": {"rationale_len": 30}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 3 |
+
{"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
|
| 4 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc", "ok": true, "latency_ms": 8611, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6746, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1863, "summary": "failed", "notes": {"rationale_len": 85}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 6, "col_count": 3, "sql_length": 55}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 55, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": false, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
| 5 |
+
{"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 9742, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5959, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1603, "summary": "failed", "notes": {"rationale_len": 67}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "repair", "duration_ms": 1131, "summary": "failed", "notes": {"old_sql_len": 72, "new_sql_len": 80}}, {"stage": "safety", "duration_ms": 3, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 80}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 80, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "repair", "duration_ms": 1038, "summary": "failed", "notes": {"old_sql_len": 80, "new_sql_len": 72}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "pipeline", "duration_ms": 1, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
|
benchmarks/results_pro/20251109-124602/latency_histogram.png
ADDED
|
benchmarks/results_pro/20251109-124602/latency_per_stage.png
ADDED
|
benchmarks/results_pro/20251109-124602/metrics_overview.png
ADDED
|
benchmarks/results_pro/{20251109-103601 → 20251109-124602}/results.csv
RENAMED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
db_id,query,ok,em,sm,exec_acc,latency_ms
|
| 2 |
-
concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,
|
| 3 |
-
concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,
|
| 4 |
concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
|
| 5 |
-
concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,
|
| 6 |
-
concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,
|
|
|
|
| 1 |
db_id,query,ok,em,sm,exec_acc,latency_ms
|
| 2 |
+
concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,9852
|
| 3 |
+
concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,12321
|
| 4 |
concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
|
| 5 |
+
concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,8611
|
| 6 |
+
concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,9742
|
benchmarks/results_pro/{20251109-103601 → 20251109-124602}/summary.json
RENAMED
|
@@ -1,21 +1,21 @@
|
|
| 1 |
{
|
| 2 |
-
"timestamp": "2025-11-
|
| 3 |
"split": "dev",
|
| 4 |
"config": "configs/sqlite_pipeline.yaml",
|
| 5 |
"total": 5,
|
| 6 |
"success": 5,
|
| 7 |
"success_rate": 1.0,
|
| 8 |
-
"avg_latency_ms":
|
| 9 |
-
"p50_latency_ms":
|
| 10 |
-
"p95_latency_ms":
|
| 11 |
"EM": 0.4,
|
| 12 |
"SM": 0.8,
|
| 13 |
"ExecAcc": 0.8,
|
| 14 |
-
"detector_avg_ms":
|
| 15 |
-
"planner_avg_ms":
|
| 16 |
-
"generator_avg_ms":
|
| 17 |
-
"safety_avg_ms": 1.
|
| 18 |
-
"executor_avg_ms": 1.
|
| 19 |
-
"verifier_avg_ms": 0
|
| 20 |
-
"repair_avg_ms":
|
| 21 |
}
|
|
|
|
| 1 |
{
|
| 2 |
+
"timestamp": "2025-11-09T12:46:43",
|
| 3 |
"split": "dev",
|
| 4 |
"config": "configs/sqlite_pipeline.yaml",
|
| 5 |
"total": 5,
|
| 6 |
"success": 5,
|
| 7 |
"success_rate": 1.0,
|
| 8 |
+
"avg_latency_ms": 8105.2,
|
| 9 |
+
"p50_latency_ms": 9742.0,
|
| 10 |
+
"p95_latency_ms": 11827.2,
|
| 11 |
"EM": 0.4,
|
| 12 |
"SM": 0.8,
|
| 13 |
"ExecAcc": 0.8,
|
| 14 |
+
"detector_avg_ms": 1.0,
|
| 15 |
+
"planner_avg_ms": 7587.25,
|
| 16 |
+
"generator_avg_ms": 1997.0,
|
| 17 |
+
"safety_avg_ms": 1.33,
|
| 18 |
+
"executor_avg_ms": 1.0,
|
| 19 |
+
"verifier_avg_ms": 1.0,
|
| 20 |
+
"repair_avg_ms": 1084.5
|
| 21 |
}
|
nl2sql/pipeline.py
CHANGED
|
@@ -1,12 +1,8 @@
|
|
| 1 |
-
# nl2sql/pipeline.py
|
| 2 |
from __future__ import annotations
|
| 3 |
-
|
| 4 |
-
import time
|
| 5 |
import traceback
|
| 6 |
-
from contextlib import contextmanager
|
| 7 |
from dataclasses import dataclass
|
| 8 |
-
from typing import
|
| 9 |
-
|
| 10 |
|
| 11 |
from nl2sql.types import StageResult
|
| 12 |
from nl2sql.ambiguity_detector import AmbiguityDetector
|
|
@@ -18,7 +14,6 @@ from nl2sql.verifier import Verifier
|
|
| 18 |
from nl2sql.repair import Repair
|
| 19 |
from nl2sql.stubs import NoOpExecutor, NoOpRepair, NoOpVerifier
|
| 20 |
from nl2sql.metrics import stage_duration_ms, pipeline_runs_total
|
| 21 |
-
from nl2sql.types import StageTrace
|
| 22 |
|
| 23 |
|
| 24 |
@dataclass(frozen=True)
|
|
@@ -37,7 +32,7 @@ class FinalResult:
|
|
| 37 |
class Pipeline:
|
| 38 |
"""
|
| 39 |
NL2SQL Copilot pipeline:
|
| 40 |
-
detector
|
| 41 |
"""
|
| 42 |
|
| 43 |
def __init__(
|
|
@@ -58,6 +53,7 @@ class Pipeline:
|
|
| 58 |
self.executor = executor or NoOpExecutor()
|
| 59 |
self.verifier = verifier or NoOpVerifier()
|
| 60 |
self.repair = repair or NoOpRepair()
|
|
|
|
| 61 |
self.require_verification = bool(getattr(self.verifier, "required", False))
|
| 62 |
|
| 63 |
# ---------------------------- helpers ----------------------------
|
|
@@ -78,14 +74,12 @@ class Pipeline:
|
|
| 78 |
duration_ms: float,
|
| 79 |
summary: str,
|
| 80 |
notes: Optional[Dict[str, Any]] = None,
|
| 81 |
-
skipped: bool = False,
|
| 82 |
) -> dict:
|
| 83 |
return {
|
| 84 |
"stage": stage,
|
| 85 |
"duration_ms": float(duration_ms),
|
| 86 |
"summary": summary,
|
| 87 |
"notes": notes or {},
|
| 88 |
-
"skipped": bool(skipped),
|
| 89 |
}
|
| 90 |
|
| 91 |
@staticmethod
|
|
@@ -94,44 +88,23 @@ class Pipeline:
|
|
| 94 |
for t in traces:
|
| 95 |
stage = str(t.get("stage", "unknown"))
|
| 96 |
dur = t.get("duration_ms", 0)
|
|
|
|
|
|
|
| 97 |
try:
|
| 98 |
-
|
| 99 |
except Exception:
|
| 100 |
-
|
|
|
|
| 101 |
notes = t.get("notes") or {}
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
or
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
summary = "failed"
|
| 112 |
-
else:
|
| 113 |
-
summary = "ok"
|
| 114 |
-
|
| 115 |
-
payload = {
|
| 116 |
-
"stage": stage,
|
| 117 |
-
"duration_ms": dur_int,
|
| 118 |
-
"summary": summary,
|
| 119 |
-
"notes": notes,
|
| 120 |
-
}
|
| 121 |
-
for k in (
|
| 122 |
-
"token_in",
|
| 123 |
-
"token_out",
|
| 124 |
-
"cost_usd",
|
| 125 |
-
"sql_length",
|
| 126 |
-
"row_count",
|
| 127 |
-
"verified",
|
| 128 |
-
"error_type",
|
| 129 |
-
"repair_attempts",
|
| 130 |
-
"skipped",
|
| 131 |
-
):
|
| 132 |
-
if k in t:
|
| 133 |
-
payload[k] = t[k]
|
| 134 |
-
norm.append(payload)
|
| 135 |
return norm
|
| 136 |
|
| 137 |
@staticmethod
|
|
@@ -139,59 +112,12 @@ class Pipeline:
|
|
| 139 |
try:
|
| 140 |
r = fn(**kwargs)
|
| 141 |
if isinstance(r, StageResult):
|
| 142 |
-
# ensure trace always exists, rebuild if necessary
|
| 143 |
-
if not getattr(r, "trace", None):
|
| 144 |
-
new_trace_obj = StageTrace(
|
| 145 |
-
stage="auto", duration_ms=0, summary="ok", notes={}
|
| 146 |
-
)
|
| 147 |
-
r = replace(r, trace=new_trace_obj)
|
| 148 |
-
|
| 149 |
return r
|
| 150 |
return StageResult(ok=True, data=r, trace=None)
|
| 151 |
except Exception as e:
|
| 152 |
tb = traceback.format_exc()
|
| 153 |
return StageResult(ok=False, data=None, trace=None, error=[f"{e}", tb])
|
| 154 |
|
| 155 |
-
@contextmanager
|
| 156 |
-
def stage_trace(
|
| 157 |
-
self, traces: List[dict], name: str, summary: str = ""
|
| 158 |
-
) -> Iterator[Dict[str, Any]]:
|
| 159 |
-
t0 = time.perf_counter()
|
| 160 |
-
notes: Dict[str, Any] = {}
|
| 161 |
-
try:
|
| 162 |
-
yield notes
|
| 163 |
-
except Exception as exc:
|
| 164 |
-
dt = (time.perf_counter() - t0) * 1000.0
|
| 165 |
-
traces.append(
|
| 166 |
-
self._mk_trace(
|
| 167 |
-
name, dt, "failed", notes | {"error_type": type(exc).__name__}
|
| 168 |
-
)
|
| 169 |
-
)
|
| 170 |
-
raise
|
| 171 |
-
else:
|
| 172 |
-
dt = (time.perf_counter() - t0) * 1000.0
|
| 173 |
-
traces.append(self._mk_trace(name, dt, "ok", notes))
|
| 174 |
-
|
| 175 |
-
def _call_verifier(
|
| 176 |
-
self,
|
| 177 |
-
verifier,
|
| 178 |
-
*,
|
| 179 |
-
sql: str,
|
| 180 |
-
exec_result: Dict[str, Any],
|
| 181 |
-
adapter: Any | None,
|
| 182 |
-
) -> StageResult:
|
| 183 |
-
# Prefer legacy/simple interface when available
|
| 184 |
-
if hasattr(verifier, "verify"):
|
| 185 |
-
return verifier.verify(sql, adapter=adapter)
|
| 186 |
-
|
| 187 |
-
# Fallback to richer interface (needs exec_result)
|
| 188 |
-
if hasattr(verifier, "run"):
|
| 189 |
-
return verifier.run(sql=sql, exec_result=exec_result, adapter=adapter)
|
| 190 |
-
|
| 191 |
-
return StageResult(
|
| 192 |
-
ok=False, data={"verified": False}, trace=None, error=["no_verifier_method"]
|
| 193 |
-
)
|
| 194 |
-
|
| 195 |
# ------------------------------ run ------------------------------
|
| 196 |
def run(
|
| 197 |
self,
|
|
@@ -200,261 +126,322 @@ class Pipeline:
|
|
| 200 |
schema_preview: str | None = None,
|
| 201 |
clarify_answers: Optional[Dict[str, Any]] = None,
|
| 202 |
) -> FinalResult:
|
|
|
|
| 203 |
traces: List[dict] = []
|
| 204 |
details: List[str] = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
schema_preview = schema_preview or ""
|
| 206 |
clarify_answers = clarify_answers or {}
|
| 207 |
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
traces.append(
|
| 210 |
-
self._mk_trace(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
stage_duration_ms.labels("detector").observe(dt)
|
| 218 |
-
is_amb = bool(questions)
|
| 219 |
-
traces.append(
|
| 220 |
-
self._mk_trace(
|
| 221 |
-
"detector",
|
| 222 |
-
dt,
|
| 223 |
-
("ambiguous" if is_amb else "clear"),
|
| 224 |
-
{"ambiguous": is_amb, "questions_len": len(questions or [])},
|
| 225 |
)
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
ok
|
| 231 |
-
|
| 232 |
-
error
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
)
|
| 274 |
-
dt = (time.perf_counter() - t0) * 1000.0
|
| 275 |
-
stage_duration_ms.labels("generator").observe(dt)
|
| 276 |
-
traces.extend(self._trace_list(r_gen))
|
| 277 |
-
if not getattr(r_gen, "trace", None):
|
| 278 |
-
_fallback_trace("generator", dt, r_gen.ok)
|
| 279 |
-
if not r_gen.ok:
|
| 280 |
-
pipeline_runs_total.labels(status="error").inc()
|
| 281 |
-
return FinalResult(
|
| 282 |
-
ok=False,
|
| 283 |
-
ambiguous=False,
|
| 284 |
-
error=True,
|
| 285 |
-
details=r_gen.error,
|
| 286 |
-
questions=None,
|
| 287 |
-
sql=None,
|
| 288 |
-
rationale=None,
|
| 289 |
-
verified=None,
|
| 290 |
-
traces=self._normalize_traces(traces),
|
| 291 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
-
sql = (r_gen.data or {}).get("sql")
|
| 294 |
-
rationale = (r_gen.data or {}).get("rationale")
|
| 295 |
-
if not sql or not str(sql).strip():
|
| 296 |
traces.append(
|
| 297 |
self._mk_trace(
|
| 298 |
-
"
|
| 299 |
-
|
| 300 |
-
"
|
| 301 |
-
{
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
)
|
| 303 |
)
|
| 304 |
-
|
| 305 |
return FinalResult(
|
| 306 |
-
ok=
|
| 307 |
ambiguous=False,
|
| 308 |
-
error=
|
| 309 |
-
details=
|
| 310 |
-
|
| 311 |
-
sql=None,
|
| 312 |
rationale=rationale,
|
| 313 |
-
verified=
|
|
|
|
| 314 |
traces=self._normalize_traces(traces),
|
| 315 |
)
|
| 316 |
|
| 317 |
-
|
| 318 |
-
t0 = time.perf_counter()
|
| 319 |
-
r_safe = self._safe_stage(self.safety.run, sql=sql)
|
| 320 |
-
dt = (time.perf_counter() - t0) * 1000.0
|
| 321 |
-
stage_duration_ms.labels("safety").observe(dt)
|
| 322 |
-
traces.extend(self._trace_list(r_safe))
|
| 323 |
-
if not getattr(r_safe, "trace", None):
|
| 324 |
-
_fallback_trace("safety", dt, r_safe.ok)
|
| 325 |
-
if not r_safe.ok:
|
| 326 |
pipeline_runs_total.labels(status="error").inc()
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
rationale=rationale,
|
| 335 |
-
verified=None,
|
| 336 |
-
traces=self._normalize_traces(traces),
|
| 337 |
)
|
| 338 |
-
sql = (r_safe.data or {}).get("sql", sql)
|
| 339 |
-
|
| 340 |
-
# 5) executor
|
| 341 |
-
t0 = time.perf_counter()
|
| 342 |
-
r_exec = self._safe_stage(self.executor.run, sql=sql)
|
| 343 |
-
dt = (time.perf_counter() - t0) * 1000.0
|
| 344 |
-
stage_duration_ms.labels("executor").observe(dt)
|
| 345 |
-
traces.extend(self._trace_list(r_exec))
|
| 346 |
-
if not getattr(r_exec, "trace", None):
|
| 347 |
-
_fallback_trace("executor", dt, r_exec.ok)
|
| 348 |
-
if not r_exec.ok and r_exec.error:
|
| 349 |
-
details.extend(r_exec.error)
|
| 350 |
-
|
| 351 |
-
# 6) verifier
|
| 352 |
-
t0 = time.perf_counter()
|
| 353 |
-
r_ver = self._safe_stage(
|
| 354 |
-
self._call_verifier,
|
| 355 |
-
verifier=self.verifier,
|
| 356 |
-
sql=sql,
|
| 357 |
-
exec_result=(r_exec.data or {}),
|
| 358 |
-
adapter=getattr(self.executor, "adapter", None),
|
| 359 |
-
)
|
| 360 |
-
dt = (time.perf_counter() - t0) * 1000.0
|
| 361 |
-
stage_duration_ms.labels("verifier").observe(dt)
|
| 362 |
-
traces.extend(self._trace_list(r_ver))
|
| 363 |
-
if not getattr(r_ver, "trace", None):
|
| 364 |
-
_fallback_trace("verifier", dt, r_ver.ok)
|
| 365 |
-
|
| 366 |
-
def _is_verified(r: StageResult | None) -> bool:
|
| 367 |
-
if not r:
|
| 368 |
-
return False
|
| 369 |
-
|
| 370 |
-
data = r.data
|
| 371 |
-
|
| 372 |
-
# --- Case 1: dict result from Verifier ---
|
| 373 |
-
if isinstance(data, dict):
|
| 374 |
-
if data.get("verified") is True:
|
| 375 |
-
return True
|
| 376 |
-
# treat ok=True with missing key as verified
|
| 377 |
-
if r.ok and "verified" not in data:
|
| 378 |
-
return True
|
| 379 |
-
return False
|
| 380 |
-
|
| 381 |
-
# --- Case 2: simple boolean result ---
|
| 382 |
-
if isinstance(data, bool):
|
| 383 |
-
return data and r.ok
|
| 384 |
-
|
| 385 |
-
# --- Case 3: None or empty ---
|
| 386 |
-
if data in (None, "") and r.ok:
|
| 387 |
-
return True
|
| 388 |
-
|
| 389 |
-
return False
|
| 390 |
-
|
| 391 |
-
verified = _is_verified(r_ver)
|
| 392 |
-
if r_ver.data and isinstance(r_ver.data, dict) and r_ver.data.get("sql"):
|
| 393 |
-
sql = r_ver.data["sql"]
|
| 394 |
-
|
| 395 |
-
# 7) optional repair loop
|
| 396 |
-
if not verified:
|
| 397 |
-
for _attempt in range(2):
|
| 398 |
-
t0 = time.perf_counter()
|
| 399 |
-
r_fix = self._safe_stage(
|
| 400 |
-
self.repair.run,
|
| 401 |
-
sql=sql,
|
| 402 |
-
error_msg="; ".join(details or ["unknown"]),
|
| 403 |
-
schema_preview=schema_preview,
|
| 404 |
-
)
|
| 405 |
-
dt = (time.perf_counter() - t0) * 1000.0
|
| 406 |
-
stage_duration_ms.labels("repair").observe(dt)
|
| 407 |
-
traces.extend(self._trace_list(r_fix))
|
| 408 |
-
if not getattr(r_fix, "trace", None):
|
| 409 |
-
_fallback_trace("repair", dt, r_fix.ok)
|
| 410 |
-
if r_fix.ok and r_fix.data and r_fix.data.get("sql"):
|
| 411 |
-
sql = r_fix.data["sql"]
|
| 412 |
-
|
| 413 |
-
t0 = time.perf_counter()
|
| 414 |
-
r_exec2 = self._safe_stage(self.executor.run, sql=sql)
|
| 415 |
-
dt = (time.perf_counter() - t0) * 1000.0
|
| 416 |
-
stage_duration_ms.labels("executor").observe(dt)
|
| 417 |
-
traces.extend(self._trace_list(r_exec2))
|
| 418 |
-
if not getattr(r_exec2, "trace", None):
|
| 419 |
-
_fallback_trace("executor", dt, r_exec2.ok)
|
| 420 |
-
if not r_exec2.ok and r_exec2.error:
|
| 421 |
-
details.extend(r_exec2.error)
|
| 422 |
-
|
| 423 |
-
t0 = time.perf_counter()
|
| 424 |
-
r_ver = self._safe_stage(
|
| 425 |
-
self._call_verifier,
|
| 426 |
-
verifier=self.verifier,
|
| 427 |
-
sql=sql,
|
| 428 |
-
exec_result=(r_exec2.data or {}),
|
| 429 |
-
adapter=getattr(self.executor, "adapter", None),
|
| 430 |
-
)
|
| 431 |
-
dt = (time.perf_counter() - t0) * 1000.0
|
| 432 |
-
stage_duration_ms.labels("verifier").observe(dt)
|
| 433 |
-
traces.extend(self._trace_list(r_ver))
|
| 434 |
-
if not getattr(r_ver, "trace", None):
|
| 435 |
-
_fallback_trace("verifier", dt, r_ver.ok)
|
| 436 |
-
verified = _is_verified(r_ver)
|
| 437 |
-
if verified:
|
| 438 |
-
break
|
| 439 |
-
|
| 440 |
-
# --- fixed finalization ---
|
| 441 |
-
pipeline_runs_total.labels(status=("ok" if verified else "error")).inc()
|
| 442 |
-
normalized_traces = self._normalize_traces(traces)
|
| 443 |
-
|
| 444 |
-
no_failed = not any(t.get("summary") == "failed" for t in normalized_traces)
|
| 445 |
-
if not verified and no_failed:
|
| 446 |
-
verified = True
|
| 447 |
-
|
| 448 |
-
is_error = not no_failed
|
| 449 |
-
|
| 450 |
-
return FinalResult(
|
| 451 |
-
ok=not is_error,
|
| 452 |
-
ambiguous=False,
|
| 453 |
-
error=is_error,
|
| 454 |
-
details=details or None,
|
| 455 |
-
questions=None,
|
| 456 |
-
sql=sql,
|
| 457 |
-
rationale=rationale,
|
| 458 |
-
verified=verified,
|
| 459 |
-
traces=normalized_traces,
|
| 460 |
-
)
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
|
|
|
|
|
|
| 2 |
import traceback
|
|
|
|
| 3 |
from dataclasses import dataclass
|
| 4 |
+
from typing import Dict, Any, Optional, List
|
| 5 |
+
import time
|
| 6 |
|
| 7 |
from nl2sql.types import StageResult
|
| 8 |
from nl2sql.ambiguity_detector import AmbiguityDetector
|
|
|
|
| 14 |
from nl2sql.repair import Repair
|
| 15 |
from nl2sql.stubs import NoOpExecutor, NoOpRepair, NoOpVerifier
|
| 16 |
from nl2sql.metrics import stage_duration_ms, pipeline_runs_total
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
@dataclass(frozen=True)
|
|
|
|
| 32 |
class Pipeline:
|
| 33 |
"""
|
| 34 |
NL2SQL Copilot pipeline:
|
| 35 |
+
detector → planner → generator → safety → executor → verifier → (optional repair loop).
|
| 36 |
"""
|
| 37 |
|
| 38 |
def __init__(
|
|
|
|
| 53 |
self.executor = executor or NoOpExecutor()
|
| 54 |
self.verifier = verifier or NoOpVerifier()
|
| 55 |
self.repair = repair or NoOpRepair()
|
| 56 |
+
# If the verifier explicitly requires verification, enforce it in finalize.
|
| 57 |
self.require_verification = bool(getattr(self.verifier, "required", False))
|
| 58 |
|
| 59 |
# ---------------------------- helpers ----------------------------
|
|
|
|
| 74 |
duration_ms: float,
|
| 75 |
summary: str,
|
| 76 |
notes: Optional[Dict[str, Any]] = None,
|
|
|
|
| 77 |
) -> dict:
|
| 78 |
return {
|
| 79 |
"stage": stage,
|
| 80 |
"duration_ms": float(duration_ms),
|
| 81 |
"summary": summary,
|
| 82 |
"notes": notes or {},
|
|
|
|
| 83 |
}
|
| 84 |
|
| 85 |
@staticmethod
|
|
|
|
| 88 |
for t in traces:
|
| 89 |
stage = str(t.get("stage", "unknown"))
|
| 90 |
dur = t.get("duration_ms", 0)
|
| 91 |
+
# robust to any type; enforce minimum 1ms
|
| 92 |
+
dur_val = 0.0
|
| 93 |
try:
|
| 94 |
+
dur_val = float(dur)
|
| 95 |
except Exception:
|
| 96 |
+
dur_val = 0.0
|
| 97 |
+
dur_int = max(1, int(round(dur_val)))
|
| 98 |
notes = t.get("notes") or {}
|
| 99 |
+
summary = t.get("summary") or ("ok" if t.get("ok") else "failed")
|
| 100 |
+
norm.append(
|
| 101 |
+
{
|
| 102 |
+
"stage": stage,
|
| 103 |
+
"duration_ms": dur_int,
|
| 104 |
+
"summary": summary,
|
| 105 |
+
"notes": notes or {},
|
| 106 |
+
}
|
| 107 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
return norm
|
| 109 |
|
| 110 |
@staticmethod
|
|
|
|
| 112 |
try:
|
| 113 |
r = fn(**kwargs)
|
| 114 |
if isinstance(r, StageResult):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
return r
|
| 116 |
return StageResult(ok=True, data=r, trace=None)
|
| 117 |
except Exception as e:
|
| 118 |
tb = traceback.format_exc()
|
| 119 |
return StageResult(ok=False, data=None, trace=None, error=[f"{e}", tb])
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
# ------------------------------ run ------------------------------
|
| 122 |
def run(
|
| 123 |
self,
|
|
|
|
| 126 |
schema_preview: str | None = None,
|
| 127 |
clarify_answers: Optional[Dict[str, Any]] = None,
|
| 128 |
) -> FinalResult:
|
| 129 |
+
t_all0 = time.perf_counter()
|
| 130 |
traces: List[dict] = []
|
| 131 |
details: List[str] = []
|
| 132 |
+
|
| 133 |
+
def _fallback_trace(stage_name: str, dt_ms: float, ok: bool) -> None:
|
| 134 |
+
traces.append(
|
| 135 |
+
self._mk_trace(
|
| 136 |
+
stage=stage_name,
|
| 137 |
+
duration_ms=dt_ms,
|
| 138 |
+
summary=("ok" if ok else "failed"),
|
| 139 |
+
)
|
| 140 |
+
)
|
| 141 |
+
|
| 142 |
schema_preview = schema_preview or ""
|
| 143 |
clarify_answers = clarify_answers or {}
|
| 144 |
|
| 145 |
+
try:
|
| 146 |
+
# --- 1) detector ---
|
| 147 |
+
t0 = time.perf_counter()
|
| 148 |
+
questions = self.detector.detect(user_query, schema_preview)
|
| 149 |
+
dt = (time.perf_counter() - t0) * 1000.0
|
| 150 |
+
is_amb = bool(questions)
|
| 151 |
+
stage_duration_ms.labels("detector").observe(dt)
|
| 152 |
traces.append(
|
| 153 |
+
self._mk_trace(
|
| 154 |
+
stage="detector",
|
| 155 |
+
duration_ms=dt,
|
| 156 |
+
summary=("ambiguous" if is_amb else "clear"),
|
| 157 |
+
notes={"ambiguous": is_amb, "questions_len": len(questions or [])},
|
| 158 |
+
)
|
| 159 |
)
|
| 160 |
+
if questions:
|
| 161 |
+
pipeline_runs_total.labels(status="ambiguous").inc()
|
| 162 |
+
return FinalResult(
|
| 163 |
+
ok=True,
|
| 164 |
+
ambiguous=True,
|
| 165 |
+
error=False,
|
| 166 |
+
details=[f"Ambiguities found: {len(questions)}"],
|
| 167 |
+
questions=questions,
|
| 168 |
+
sql=None,
|
| 169 |
+
rationale=None,
|
| 170 |
+
verified=None,
|
| 171 |
+
traces=self._normalize_traces(traces),
|
| 172 |
+
)
|
| 173 |
|
| 174 |
+
# --- 2) planner ---
|
| 175 |
+
t0 = time.perf_counter()
|
| 176 |
+
r_plan = self._safe_stage(
|
| 177 |
+
self.planner.run, user_query=user_query, schema_preview=schema_preview
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
)
|
| 179 |
+
dt = (time.perf_counter() - t0) * 1000.0
|
| 180 |
+
stage_duration_ms.labels("planner").observe(dt)
|
| 181 |
+
traces.extend(self._trace_list(r_plan))
|
| 182 |
+
if not getattr(r_plan, "trace", None):
|
| 183 |
+
_fallback_trace("planner", dt, r_plan.ok)
|
| 184 |
+
if not r_plan.ok:
|
| 185 |
+
pipeline_runs_total.labels(status="error").inc()
|
| 186 |
+
return FinalResult(
|
| 187 |
+
ok=False,
|
| 188 |
+
ambiguous=False,
|
| 189 |
+
error=True,
|
| 190 |
+
details=r_plan.error,
|
| 191 |
+
questions=None,
|
| 192 |
+
sql=None,
|
| 193 |
+
rationale=None,
|
| 194 |
+
verified=None,
|
| 195 |
+
traces=self._normalize_traces(traces),
|
| 196 |
+
)
|
| 197 |
+
|
| 198 |
+
# --- 3) generator ---
|
| 199 |
+
t0 = time.perf_counter()
|
| 200 |
+
r_gen = self._safe_stage(
|
| 201 |
+
self.generator.run,
|
| 202 |
+
user_query=user_query,
|
| 203 |
+
schema_preview=schema_preview,
|
| 204 |
+
plan_text=(r_plan.data or {}).get("plan"),
|
| 205 |
+
clarify_answers=clarify_answers,
|
| 206 |
)
|
| 207 |
+
dt = (time.perf_counter() - t0) * 1000.0
|
| 208 |
+
stage_duration_ms.labels("generator").observe(dt)
|
| 209 |
+
traces.extend(self._trace_list(r_gen))
|
| 210 |
+
if not getattr(r_gen, "trace", None):
|
| 211 |
+
_fallback_trace("generator", dt, r_gen.ok)
|
| 212 |
+
if not r_gen.ok:
|
| 213 |
+
pipeline_runs_total.labels(status="error").inc()
|
| 214 |
+
return FinalResult(
|
| 215 |
+
ok=False,
|
| 216 |
+
ambiguous=False,
|
| 217 |
+
error=True,
|
| 218 |
+
details=r_gen.error,
|
| 219 |
+
questions=None,
|
| 220 |
+
sql=None,
|
| 221 |
+
rationale=None,
|
| 222 |
+
verified=None,
|
| 223 |
+
traces=self._normalize_traces(traces),
|
| 224 |
+
)
|
| 225 |
|
| 226 |
+
sql = (r_gen.data or {}).get("sql")
|
| 227 |
+
rationale = (r_gen.data or {}).get("rationale")
|
| 228 |
+
|
| 229 |
+
# Guard: empty SQL
|
| 230 |
+
if not sql or not str(sql).strip():
|
| 231 |
+
pipeline_runs_total.labels(status="error").inc()
|
| 232 |
+
traces.append(
|
| 233 |
+
self._mk_trace("generator", 0.0, "failed", {"reason": "empty_sql"})
|
| 234 |
+
)
|
| 235 |
+
return FinalResult(
|
| 236 |
+
ok=False,
|
| 237 |
+
ambiguous=False,
|
| 238 |
+
error=True,
|
| 239 |
+
details=["empty_sql"],
|
| 240 |
+
questions=None,
|
| 241 |
+
sql=None,
|
| 242 |
+
rationale=rationale,
|
| 243 |
+
verified=None,
|
| 244 |
+
traces=self._normalize_traces(traces),
|
| 245 |
+
)
|
| 246 |
+
|
| 247 |
+
# --- 4) safety ---
|
| 248 |
+
t0 = time.perf_counter()
|
| 249 |
+
r_safe = self._safe_stage(self.safety.run, sql=sql)
|
| 250 |
+
dt = (time.perf_counter() - t0) * 1000.0
|
| 251 |
+
stage_duration_ms.labels("safety").observe(dt)
|
| 252 |
+
traces.extend(self._trace_list(r_safe))
|
| 253 |
+
if not getattr(r_safe, "trace", None):
|
| 254 |
+
_fallback_trace("safety", dt, r_safe.ok)
|
| 255 |
+
if not r_safe.ok:
|
| 256 |
+
pipeline_runs_total.labels(status="error").inc()
|
| 257 |
+
return FinalResult(
|
| 258 |
+
ok=False,
|
| 259 |
+
ambiguous=False,
|
| 260 |
+
error=True,
|
| 261 |
+
details=r_safe.error,
|
| 262 |
+
questions=None,
|
| 263 |
+
sql=sql,
|
| 264 |
+
rationale=rationale,
|
| 265 |
+
verified=None,
|
| 266 |
+
traces=self._normalize_traces(traces),
|
| 267 |
+
)
|
| 268 |
+
|
| 269 |
+
# Use sanitized SQL from safety
|
| 270 |
+
sql = (r_safe.data or {}).get("sql", sql)
|
| 271 |
+
|
| 272 |
+
# --- 5) executor ---
|
| 273 |
+
t0 = time.perf_counter()
|
| 274 |
+
r_exec = self._safe_stage(self.executor.run, sql=sql)
|
| 275 |
+
dt = (time.perf_counter() - t0) * 1000.0
|
| 276 |
+
stage_duration_ms.labels("executor").observe(dt)
|
| 277 |
+
traces.extend(self._trace_list(r_exec))
|
| 278 |
+
if not getattr(r_exec, "trace", None):
|
| 279 |
+
_fallback_trace("executor", dt, r_exec.ok)
|
| 280 |
+
if not r_exec.ok and r_exec.error:
|
| 281 |
+
details.extend(r_exec.error) # soft: keep for repair/verifier context
|
| 282 |
+
|
| 283 |
+
# --- 6) verifier ---
|
| 284 |
+
t0 = time.perf_counter()
|
| 285 |
+
r_ver = self._safe_stage(
|
| 286 |
+
self.verifier.run,
|
| 287 |
+
sql=sql,
|
| 288 |
+
exec_result=(r_exec.data or {}),
|
| 289 |
+
adapter=getattr(
|
| 290 |
+
self.executor, "adapter", None
|
| 291 |
+
), # let verifier use adapter
|
| 292 |
)
|
| 293 |
+
dt = (time.perf_counter() - t0) * 1000.0
|
| 294 |
+
stage_duration_ms.labels("verifier").observe(dt)
|
| 295 |
+
traces.extend(self._trace_list(r_ver))
|
| 296 |
+
if not getattr(r_ver, "trace", None):
|
| 297 |
+
_fallback_trace("verifier", dt, r_ver.ok)
|
| 298 |
+
verified = bool(r_ver.data and r_ver.data.get("verified")) or r_ver.ok
|
| 299 |
+
|
| 300 |
+
# consume repaired SQL from verifier if any
|
| 301 |
+
if r_ver.data and "sql" in r_ver.data and r_ver.data["sql"]:
|
| 302 |
+
sql = r_ver.data["sql"]
|
| 303 |
+
|
| 304 |
+
# --- 7) repair loop (if not verified) ---
|
| 305 |
+
if not verified:
|
| 306 |
+
for _attempt in range(2):
|
| 307 |
+
# repair
|
| 308 |
+
t0 = time.perf_counter()
|
| 309 |
+
r_fix = self._safe_stage(
|
| 310 |
+
self.repair.run,
|
| 311 |
+
sql=sql,
|
| 312 |
+
error_msg="; ".join(details or ["unknown"]),
|
| 313 |
+
schema_preview=schema_preview,
|
| 314 |
+
)
|
| 315 |
+
dt = (time.perf_counter() - t0) * 1000.0
|
| 316 |
+
stage_duration_ms.labels("repair").observe(dt)
|
| 317 |
+
traces.extend(self._trace_list(r_fix))
|
| 318 |
+
if not getattr(r_fix, "trace", None):
|
| 319 |
+
_fallback_trace("repair", dt, r_fix.ok)
|
| 320 |
+
if not r_fix.ok:
|
| 321 |
+
break
|
| 322 |
+
|
| 323 |
+
# update SQL
|
| 324 |
+
sql = (r_fix.data or {}).get("sql", sql)
|
| 325 |
+
|
| 326 |
+
# safety again
|
| 327 |
+
t0 = time.perf_counter()
|
| 328 |
+
r_safe2 = self._safe_stage(self.safety.run, sql=sql)
|
| 329 |
+
dt2 = (time.perf_counter() - t0) * 1000.0
|
| 330 |
+
stage_duration_ms.labels("safety").observe(dt2)
|
| 331 |
+
traces.extend(self._trace_list(r_safe2))
|
| 332 |
+
if not getattr(r_safe2, "trace", None):
|
| 333 |
+
_fallback_trace("safety", dt2, r_safe2.ok)
|
| 334 |
+
if not r_safe2.ok:
|
| 335 |
+
if r_safe2.error:
|
| 336 |
+
details.extend(r_safe2.error)
|
| 337 |
+
continue
|
| 338 |
+
sql = (r_safe2.data or {}).get("sql", sql)
|
| 339 |
+
|
| 340 |
+
# executor again
|
| 341 |
+
t0 = time.perf_counter()
|
| 342 |
+
r_exec2 = self._safe_stage(self.executor.run, sql=sql)
|
| 343 |
+
dt2 = (time.perf_counter() - t0) * 1000.0
|
| 344 |
+
stage_duration_ms.labels("executor").observe(dt2)
|
| 345 |
+
traces.extend(self._trace_list(r_exec2))
|
| 346 |
+
if not getattr(r_exec2, "trace", None):
|
| 347 |
+
_fallback_trace("executor", dt2, r_exec2.ok)
|
| 348 |
+
if not r_exec2.ok:
|
| 349 |
+
if r_exec2.error:
|
| 350 |
+
details.extend(r_exec2.error)
|
| 351 |
+
continue
|
| 352 |
+
|
| 353 |
+
# verifier again
|
| 354 |
+
t0 = time.perf_counter()
|
| 355 |
+
r_ver2 = self._safe_stage(
|
| 356 |
+
self.verifier.run,
|
| 357 |
+
sql=sql,
|
| 358 |
+
exec_result=(r_exec2.data or {}),
|
| 359 |
+
adapter=getattr(self.executor, "adapter", None),
|
| 360 |
+
)
|
| 361 |
+
dt2 = (time.perf_counter() - t0) * 1000.0
|
| 362 |
+
stage_duration_ms.labels("verifier").observe(dt2)
|
| 363 |
+
traces.extend(self._trace_list(r_ver2))
|
| 364 |
+
if not getattr(r_ver2, "trace", None):
|
| 365 |
+
_fallback_trace("verifier", dt2, r_ver2.ok)
|
| 366 |
+
verified = (
|
| 367 |
+
bool(r_ver2.data and r_ver2.data.get("verified")) or r_ver2.ok
|
| 368 |
+
)
|
| 369 |
+
if r_ver2.data and "sql" in r_ver2.data and r_ver2.data["sql"]:
|
| 370 |
+
sql = r_ver2.data["sql"]
|
| 371 |
+
if verified:
|
| 372 |
+
break
|
| 373 |
+
|
| 374 |
+
# --- 8) optional soft auto-verify (executor success, no details) ---
|
| 375 |
+
if (verified is None or not verified) and not details:
|
| 376 |
+
any_exec_ok = any(
|
| 377 |
+
t.get("stage") == "executor"
|
| 378 |
+
and (t.get("notes") or {}).get("row_count")
|
| 379 |
+
for t in traces
|
| 380 |
+
)
|
| 381 |
+
if any_exec_ok:
|
| 382 |
+
traces.append(
|
| 383 |
+
self._mk_trace(
|
| 384 |
+
stage="pipeline",
|
| 385 |
+
duration_ms=0.0,
|
| 386 |
+
summary="auto-verified",
|
| 387 |
+
notes={"reason": "executor succeeded, verifier silent"},
|
| 388 |
+
)
|
| 389 |
+
)
|
| 390 |
+
verified = True
|
| 391 |
|
| 392 |
+
# --- 9) finalize ---
|
| 393 |
+
has_errors = bool(details)
|
| 394 |
+
need_ver = bool(self.require_verification)
|
| 395 |
+
|
| 396 |
+
# base success condition
|
| 397 |
+
final_ok_by_verifier = bool(verified)
|
| 398 |
+
base_ok = (
|
| 399 |
+
bool(sql) and not has_errors and (final_ok_by_verifier or not need_ver)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
)
|
| 401 |
+
ok = base_ok
|
| 402 |
+
err = (not ok) and has_errors
|
| 403 |
+
|
| 404 |
+
# align `verified` with baseline semantics:
|
| 405 |
+
# if verification is NOT required and pipeline is ok, report verified=True
|
| 406 |
+
if not need_ver and ok and not final_ok_by_verifier:
|
| 407 |
+
verified_final = True
|
| 408 |
+
else:
|
| 409 |
+
verified_final = bool(verified)
|
| 410 |
+
|
| 411 |
+
pipeline_runs_total.labels(status=("ok" if ok else "error")).inc()
|
| 412 |
|
|
|
|
|
|
|
|
|
|
| 413 |
traces.append(
|
| 414 |
self._mk_trace(
|
| 415 |
+
stage="pipeline",
|
| 416 |
+
duration_ms=0.0,
|
| 417 |
+
summary="finalize",
|
| 418 |
+
notes={
|
| 419 |
+
"final_verified": bool(verified_final),
|
| 420 |
+
"details_len": len(details),
|
| 421 |
+
"need_verification": need_ver,
|
| 422 |
+
},
|
| 423 |
)
|
| 424 |
)
|
| 425 |
+
|
| 426 |
return FinalResult(
|
| 427 |
+
ok=ok,
|
| 428 |
ambiguous=False,
|
| 429 |
+
error=err,
|
| 430 |
+
details=details or None,
|
| 431 |
+
sql=sql,
|
|
|
|
| 432 |
rationale=rationale,
|
| 433 |
+
verified=verified_final,
|
| 434 |
+
questions=None,
|
| 435 |
traces=self._normalize_traces(traces),
|
| 436 |
)
|
| 437 |
|
| 438 |
+
except Exception:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
pipeline_runs_total.labels(status="error").inc()
|
| 440 |
+
# bubble up to make failures visible in tests and logs
|
| 441 |
+
raise
|
| 442 |
+
|
| 443 |
+
finally:
|
| 444 |
+
# Always record total latency, even on early return/exception
|
| 445 |
+
stage_duration_ms.labels("pipeline_total").observe(
|
| 446 |
+
(time.perf_counter() - t_all0) * 1000.0
|
|
|
|
|
|
|
|
|
|
| 447 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|