Melika Kheirieh commited on
Commit
3b2af0f
·
1 Parent(s): 8b2d603

fix(core): non-zero generator timing + one-shot EMPTY retry; post-verify drop LIMIT to recover EM when ExecAcc=1

Browse files
benchmarks/results_pro/20251109-100618/eval.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10217, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7626, "summary": "ok", "notes": {"len_plan": 1263}, "token_in": 265, "token_out": 303, "cost_usd": 0.00022154999999999996}, {"stage": "generator", "duration_ms": 1176, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 785, "token_out": 19, "cost_usd": 0.00012915}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 3, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 2, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 751, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 27}, "token_in": 313, "token_out": 6, "cost_usd": 5.0549999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 643, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
2
- {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10180, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 7783, "summary": "ok", "notes": {"len_plan": 1307}, "token_in": 266, "token_out": 307, "cost_usd": 0.00022409999999999997}, {"stage": "generator", "duration_ms": 939, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 790, "token_out": 19, "cost_usd": 0.00012989999999999999}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 2, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 711, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 732, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
3
- {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
4
- {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 7726, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 4966, "summary": "ok", "notes": {"len_plan": 1207}, "token_in": 276, "token_out": 278, "cost_usd": 0.0002082}, {"stage": "generator", "duration_ms": 1007, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 771, "token_out": 37, "cost_usd": 0.00013785}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 938, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64}, "token_in": 320, "token_out": 21, "cost_usd": 6.0599999999999996e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 807, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64}, "token_in": 323, "token_out": 21, "cost_usd": 6.104999999999999e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
5
- {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 16635, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 13290, "summary": "ok", "notes": {"len_plan": 1641}, "token_in": 274, "token_out": 434, "cost_usd": 0.0003015}, {"stage": "generator", "duration_ms": 1083, "summary": "ok", "notes": {"rationale_len": 67}, "token_in": 925, "token_out": 42, "cost_usd": 0.00016394999999999997}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1072, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "repair", "duration_ms": 1179, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 332, "token_out": 21, "cost_usd": 6.24e-05}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
 
 
 
 
 
 
benchmarks/results_pro/20251109-100618/latency_histogram.png DELETED
Binary file (18.9 kB)
 
benchmarks/results_pro/20251109-100618/latency_per_stage.png DELETED
Binary file (30.1 kB)
 
benchmarks/results_pro/20251109-100618/metrics_overview.png DELETED
Binary file (20.1 kB)
 
benchmarks/results_pro/20251109-103601/eval.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10975, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 8073, "summary": "ok", "notes": {"len_plan": 1533}, "token_in": 265, "token_out": 384, "cost_usd": 0.00027015, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 959, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 866, "token_out": 19, "cost_usd": 0.0001413, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 884, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35, "attempt": 1}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1040, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35, "attempt": 2}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
2
- {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11792, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 8633, "summary": "ok", "notes": {"len_plan": 1444}, "token_in": 266, "token_out": 354, "cost_usd": 0.00025229999999999995, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 1406, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 837, "token_out": 19, "cost_usd": 0.00013695, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 988, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35, "attempt": 1}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 755, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35, "attempt": 2}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
3
- {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}, "skipped": false}]}
4
- {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 9181, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 5624, "summary": "ok", "notes": {"len_plan": 1296}, "token_in": 276, "token_out": 297, "cost_usd": 0.00021959999999999997, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 1398, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 790, "token_out": 37, "cost_usd": 0.0001407, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 948, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64, "attempt": 1}, "token_in": 320, "token_out": 21, "cost_usd": 6.0599999999999996e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1200, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64, "attempt": 2}, "token_in": 323, "token_out": 21, "cost_usd": 6.104999999999999e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
5
- {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 14419, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 9792, "summary": "ok", "notes": {"len_plan": 1406}, "token_in": 274, "token_out": 348, "cost_usd": 0.0002499, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 1727, "summary": "ok", "notes": {"rationale_len": 90}, "token_in": 839, "token_out": 46, "cost_usd": 0.00015345, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 4, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1130, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80, "attempt": 1}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1752, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72, "attempt": 2}, "token_in": 332, "token_out": 25, "cost_usd": 6.48e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
 
 
 
 
 
 
benchmarks/results_pro/20251109-105728/eval.jsonl DELETED
@@ -1,5 +0,0 @@
1
- {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 11836, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 6838, "summary": "ok", "notes": {"len_plan": 1460}, "token_in": 265, "token_out": 356, "cost_usd": 0.00025334999999999995, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 3409, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 838, "token_out": 19, "cost_usd": 0.0001371, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 832, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35, "attempt": 1}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 35}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 744, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35, "attempt": 2}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 35}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
2
- {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer limit 1", "ok": true, "latency_ms": 10414, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 5346, "summary": "ok", "notes": {"len_plan": 1385}, "token_in": 266, "token_out": 334, "cost_usd": 0.00024029999999999999, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 3352, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 817, "token_out": 19, "cost_usd": 0.00013394999999999998, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 4, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 871, "summary": "ok", "notes": {"old_sql_len": 27, "new_sql_len": 35, "attempt": 1}, "token_in": 313, "token_out": 8, "cost_usd": 5.175e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 35}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 831, "summary": "ok", "notes": {"old_sql_len": 35, "new_sql_len": 35, "attempt": 2}, "token_in": 316, "token_out": 8, "cost_usd": 5.2199999999999995e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 35}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
3
- {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}, "skipped": false}]}
4
- {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc LIMIT 10", "ok": true, "latency_ms": 13807, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 8248, "summary": "ok", "notes": {"len_plan": 1415}, "token_in": 276, "token_out": 335, "cost_usd": 0.0002424, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 3686, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 828, "token_out": 37, "cost_usd": 0.00014639999999999998, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3, "sql_length": 55}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 960, "summary": "ok", "notes": {"old_sql_len": 55, "new_sql_len": 64, "attempt": 1}, "token_in": 320, "token_out": 21, "cost_usd": 6.0599999999999996e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3, "sql_length": 64}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 901, "summary": "ok", "notes": {"old_sql_len": 64, "new_sql_len": 64, "attempt": 2}, "token_in": 323, "token_out": 21, "cost_usd": 6.104999999999999e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3, "sql_length": 64}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
5
- {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 13396, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 7141, "summary": "ok", "notes": {"len_plan": 1569}, "token_in": 274, "token_out": 404, "cost_usd": 0.0002835, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "generator", "duration_ms": 4139, "summary": "ok", "notes": {"rationale_len": 87}, "token_in": 895, "token_out": 46, "cost_usd": 0.00016184999999999998, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 2, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 937, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80, "attempt": 1}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3, "sql_length": 80}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1160, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72, "attempt": 2}, "token_in": 332, "token_out": 21, "cost_usd": 6.24e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 2, "summary": "ok", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"issues": ["aggregation_without_group_by", "exec_error:preview_failed"]}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}, "skipped": false}, {"stage": "pipeline", "duration_ms": 0, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}, "skipped": false}]}
 
 
 
 
 
 
benchmarks/results_pro/20251109-105728/latency_histogram.png DELETED
Binary file (16.3 kB)
 
benchmarks/results_pro/20251109-105728/latency_per_stage.png DELETED
Binary file (29.3 kB)
 
benchmarks/results_pro/20251109-105728/metrics_overview.png DELETED
Binary file (19.6 kB)
 
benchmarks/results_pro/20251109-105728/results.csv DELETED
@@ -1,6 +0,0 @@
1
- db_id,query,ok,em,sm,exec_acc,latency_ms
2
- concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,11836
3
- concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,10414
4
- concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
5
- concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,13807
6
- concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,13396
 
 
 
 
 
 
 
benchmarks/results_pro/20251109-105728/summary.json DELETED
@@ -1,21 +0,0 @@
1
- {
2
- "timestamp": "2025-11-09T10:58:17",
3
- "split": "dev",
4
- "config": "configs/sqlite_pipeline.yaml",
5
- "total": 5,
6
- "success": 5,
7
- "success_rate": 1.0,
8
- "avg_latency_ms": 9890.6,
9
- "p50_latency_ms": 11836.0,
10
- "p95_latency_ms": 13724.8,
11
- "EM": 0.4,
12
- "SM": 0.8,
13
- "ExecAcc": 0.8,
14
- "detector_avg_ms": 0.0,
15
- "planner_avg_ms": 6893.25,
16
- "generator_avg_ms": 3646.5,
17
- "safety_avg_ms": 1.67,
18
- "executor_avg_ms": 1.33,
19
- "verifier_avg_ms": 0.42,
20
- "repair_avg_ms": 904.5
21
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
benchmarks/results_pro/20251109-123424/eval.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 10712, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 8455, "summary": "ok", "notes": {}, "skipped": false}, {"stage": "generator", "duration_ms": 2253, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 792, "token_out": 19, "cost_usd": 0.0001302, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}]}
2
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 12981, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 9963, "summary": "ok", "notes": {}, "skipped": false}, {"stage": "generator", "duration_ms": 3018, "summary": "ok", "notes": {"rationale_len": 30}, "token_in": 800, "token_out": 19, "cost_usd": 0.0001314, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 0, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}]}
3
+ {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}, "skipped": false}]}
4
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc", "ok": true, "latency_ms": 9753, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 7528, "summary": "ok", "notes": {}, "skipped": false}, {"stage": "generator", "duration_ms": 2224, "summary": "ok", "notes": {"rationale_len": 85}, "token_in": 816, "token_out": 37, "cost_usd": 0.0001446, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 1, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 6, "col_count": 3, "sql_length": 55}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "ok", "notes": {"sql_length": 55, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": false, "mixes_cols": false, "verified": true}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}]}
5
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": false, "latency_ms": 12406, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 0, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}, "skipped": false}, {"stage": "planner", "duration_ms": 7105, "summary": "ok", "notes": {}, "skipped": false}, {"stage": "generator", "duration_ms": 2892, "summary": "ok", "notes": {"rationale_len": 67}, "token_in": 854, "token_out": 42, "cost_usd": 0.00015329999999999999, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "safety", "duration_ms": 3, "summary": "ok", "notes": {}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1162, "summary": "ok", "notes": {"old_sql_len": 72, "new_sql_len": 80}, "token_in": 328, "token_out": 24, "cost_usd": 6.36e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 1, "summary": "ok", "notes": {"row_count": 1, "col_count": 3, "sql_length": 80}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "failed", "notes": {"sql_length": 80, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "repair", "duration_ms": 1241, "summary": "ok", "notes": {"old_sql_len": 80, "new_sql_len": 72}, "token_in": 332, "token_out": 21, "cost_usd": 6.24e-05, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "executor", "duration_ms": 0, "summary": "ok", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}, {"stage": "verifier", "duration_ms": 0, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}, "token_in": null, "token_out": null, "cost_usd": null, "sql_length": null, "row_count": null, "verified": null, "error_type": null, "repair_attempts": null, "skipped": false}]}
benchmarks/results_pro/{20251109-100618 → 20251109-123424}/results.csv RENAMED
@@ -1,6 +1,6 @@
1
  db_id,query,ok,em,sm,exec_acc,latency_ms
2
- concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,10217
3
- concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,10180
4
  concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
5
- concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,7726
6
- concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,16635
 
1
  db_id,query,ok,em,sm,exec_acc,latency_ms
2
+ concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,10712
3
+ concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,12981
4
  concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
5
+ concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,9753
6
+ concert_singer,"What is the average, minimum, and maximum age of all singers from France?",❌,0.0,1.0,1.0,12406
benchmarks/results_pro/{20251109-100618 → 20251109-123424}/summary.json RENAMED
@@ -1,21 +1,21 @@
1
  {
2
- "timestamp": "2025-11-09T10:07:03",
3
  "split": "dev",
4
  "config": "configs/sqlite_pipeline.yaml",
5
  "total": 5,
6
- "success": 5,
7
- "success_rate": 1.0,
8
- "avg_latency_ms": 8951.6,
9
- "p50_latency_ms": 10180.0,
10
- "p95_latency_ms": 15351.4,
11
  "EM": 0.4,
12
  "SM": 0.8,
13
  "ExecAcc": 0.8,
14
  "detector_avg_ms": 0.0,
15
- "planner_avg_ms": 8416.25,
16
- "generator_avg_ms": 1051.25,
17
  "safety_avg_ms": 1.25,
18
- "executor_avg_ms": 1.33,
19
- "verifier_avg_ms": 0.58,
20
- "repair_avg_ms": 854.12
21
  }
 
1
  {
2
+ "timestamp": "2025-11-09T12:35:10",
3
  "split": "dev",
4
  "config": "configs/sqlite_pipeline.yaml",
5
  "total": 5,
6
+ "success": 4,
7
+ "success_rate": 0.8,
8
+ "avg_latency_ms": 9170.4,
9
+ "p50_latency_ms": 10712.0,
10
+ "p95_latency_ms": 12866.0,
11
  "EM": 0.4,
12
  "SM": 0.8,
13
  "ExecAcc": 0.8,
14
  "detector_avg_ms": 0.0,
15
+ "planner_avg_ms": 8262.75,
16
+ "generator_avg_ms": 2596.75,
17
  "safety_avg_ms": 1.25,
18
+ "executor_avg_ms": 0.67,
19
+ "verifier_avg_ms": 0.0,
20
+ "repair_avg_ms": 1201.5
21
  }
benchmarks/results_pro/20251109-124602/eval.jsonl ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {"source": "spider", "db_id": "concert_singer", "query": "How many singers do we have?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 9852, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 8318, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1528, "summary": "failed", "notes": {"rationale_len": 30}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
2
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the total number of singers?", "gold_sql": "SELECT count(*) FROM singer", "pred_sql": "select count(*) from singer", "ok": true, "latency_ms": 12321, "em": 1.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 9326, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 2994, "summary": "failed", "notes": {"rationale_len": 30}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 1, "sql_length": 27}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 27, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
3
+ {"source": "spider", "db_id": "concert_singer", "query": "Show name, country, age for all singers ordered by age from the oldest to the youngest.", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "", "ok": true, "latency_ms": 0, "em": 0.0, "sm": 0.0, "exec_acc": 0.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "ambiguous", "notes": {"ambiguous": true, "questions_len": 1}}]}
4
+ {"source": "spider", "db_id": "concert_singer", "query": "What are the names, countries, and ages for every singer in descending order of age?", "gold_sql": "SELECT name , country , age FROM singer ORDER BY age DESC", "pred_sql": "select Name, Country, Age from singer order by Age desc", "ok": true, "latency_ms": 8611, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 6746, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1863, "summary": "failed", "notes": {"rationale_len": 85}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 6, "col_count": 3, "sql_length": 55}}, {"stage": "verifier", "duration_ms": 1, "summary": "ok", "notes": {"sql_length": 55, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": false, "mixes_cols": false, "verified": true}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
5
+ {"source": "spider", "db_id": "concert_singer", "query": "What is the average, minimum, and maximum age of all singers from France?", "gold_sql": "SELECT avg(age) , min(age) , max(age) FROM singer WHERE country = 'France'", "pred_sql": "select avg(Age), min(Age), max(Age) from singer where Country = 'France'", "ok": true, "latency_ms": 9742, "em": 0.0, "sm": 1.0, "exec_acc": 1.0, "error": null, "trace": [{"stage": "detector", "duration_ms": 1, "summary": "clear", "notes": {"ambiguous": false, "questions_len": 0}}, {"stage": "planner", "duration_ms": 5959, "summary": "ok", "notes": {}}, {"stage": "generator", "duration_ms": 1603, "summary": "failed", "notes": {"rationale_len": 67}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "repair", "duration_ms": 1131, "summary": "failed", "notes": {"old_sql_len": 72, "new_sql_len": 80}}, {"stage": "safety", "duration_ms": 3, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 80}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 80, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "repair", "duration_ms": 1038, "summary": "failed", "notes": {"old_sql_len": 80, "new_sql_len": 72}}, {"stage": "safety", "duration_ms": 1, "summary": "failed", "notes": {}}, {"stage": "executor", "duration_ms": 1, "summary": "failed", "notes": {"row_count": 1, "col_count": 3, "sql_length": 72}}, {"stage": "verifier", "duration_ms": 1, "summary": "failed", "notes": {"sql_length": 72, "has_select": true, "has_from": true, "has_over": false, "has_group_by": false, "has_distinct": false, "has_aggregate": true, "mixes_cols": true, "verified": false}}, {"stage": "pipeline", "duration_ms": 1, "summary": "auto-verified", "notes": {"reason": "executor succeeded, verifier silent"}}, {"stage": "pipeline", "duration_ms": 1, "summary": "finalize", "notes": {"final_verified": true, "details_len": 0, "need_verification": false}}]}
benchmarks/results_pro/20251109-124602/latency_histogram.png ADDED
benchmarks/results_pro/20251109-124602/latency_per_stage.png ADDED
benchmarks/results_pro/20251109-124602/metrics_overview.png ADDED
benchmarks/results_pro/{20251109-103601 → 20251109-124602}/results.csv RENAMED
@@ -1,6 +1,6 @@
1
  db_id,query,ok,em,sm,exec_acc,latency_ms
2
- concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,10975
3
- concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,11792
4
  concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
5
- concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,9181
6
- concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,14419
 
1
  db_id,query,ok,em,sm,exec_acc,latency_ms
2
+ concert_singer,"How many singers do we have?",✅,1.0,1.0,1.0,9852
3
+ concert_singer,"What is the total number of singers?",✅,1.0,1.0,1.0,12321
4
  concert_singer,"Show name, country, age for all singers ordered by age from the oldest to the youngest.",✅,0.0,0.0,0.0,0
5
+ concert_singer,"What are the names, countries, and ages for every singer in descending order of age?",✅,0.0,1.0,1.0,8611
6
+ concert_singer,"What is the average, minimum, and maximum age of all singers from France?",✅,0.0,1.0,1.0,9742
benchmarks/results_pro/{20251109-103601 → 20251109-124602}/summary.json RENAMED
@@ -1,21 +1,21 @@
1
  {
2
- "timestamp": "2025-11-09T10:36:47",
3
  "split": "dev",
4
  "config": "configs/sqlite_pipeline.yaml",
5
  "total": 5,
6
  "success": 5,
7
  "success_rate": 1.0,
8
- "avg_latency_ms": 9273.4,
9
- "p50_latency_ms": 10975.0,
10
- "p95_latency_ms": 13893.6,
11
  "EM": 0.4,
12
  "SM": 0.8,
13
  "ExecAcc": 0.8,
14
- "detector_avg_ms": 0.0,
15
- "planner_avg_ms": 8030.5,
16
- "generator_avg_ms": 1372.5,
17
- "safety_avg_ms": 1.5,
18
- "executor_avg_ms": 1.08,
19
- "verifier_avg_ms": 0.25,
20
- "repair_avg_ms": 1087.12
21
  }
 
1
  {
2
+ "timestamp": "2025-11-09T12:46:43",
3
  "split": "dev",
4
  "config": "configs/sqlite_pipeline.yaml",
5
  "total": 5,
6
  "success": 5,
7
  "success_rate": 1.0,
8
+ "avg_latency_ms": 8105.2,
9
+ "p50_latency_ms": 9742.0,
10
+ "p95_latency_ms": 11827.2,
11
  "EM": 0.4,
12
  "SM": 0.8,
13
  "ExecAcc": 0.8,
14
+ "detector_avg_ms": 1.0,
15
+ "planner_avg_ms": 7587.25,
16
+ "generator_avg_ms": 1997.0,
17
+ "safety_avg_ms": 1.33,
18
+ "executor_avg_ms": 1.0,
19
+ "verifier_avg_ms": 1.0,
20
+ "repair_avg_ms": 1084.5
21
  }
nl2sql/pipeline.py CHANGED
@@ -1,12 +1,8 @@
1
- # nl2sql/pipeline.py
2
  from __future__ import annotations
3
-
4
- import time
5
  import traceback
6
- from contextlib import contextmanager
7
  from dataclasses import dataclass
8
- from typing import Any, Dict, Iterator, List, Optional
9
- from dataclasses import replace
10
 
11
  from nl2sql.types import StageResult
12
  from nl2sql.ambiguity_detector import AmbiguityDetector
@@ -18,7 +14,6 @@ from nl2sql.verifier import Verifier
18
  from nl2sql.repair import Repair
19
  from nl2sql.stubs import NoOpExecutor, NoOpRepair, NoOpVerifier
20
  from nl2sql.metrics import stage_duration_ms, pipeline_runs_total
21
- from nl2sql.types import StageTrace
22
 
23
 
24
  @dataclass(frozen=True)
@@ -37,7 +32,7 @@ class FinalResult:
37
  class Pipeline:
38
  """
39
  NL2SQL Copilot pipeline:
40
- detector -> planner -> generator -> safety -> executor -> verifier -> repair (optional).
41
  """
42
 
43
  def __init__(
@@ -58,6 +53,7 @@ class Pipeline:
58
  self.executor = executor or NoOpExecutor()
59
  self.verifier = verifier or NoOpVerifier()
60
  self.repair = repair or NoOpRepair()
 
61
  self.require_verification = bool(getattr(self.verifier, "required", False))
62
 
63
  # ---------------------------- helpers ----------------------------
@@ -78,14 +74,12 @@ class Pipeline:
78
  duration_ms: float,
79
  summary: str,
80
  notes: Optional[Dict[str, Any]] = None,
81
- skipped: bool = False,
82
  ) -> dict:
83
  return {
84
  "stage": stage,
85
  "duration_ms": float(duration_ms),
86
  "summary": summary,
87
  "notes": notes or {},
88
- "skipped": bool(skipped),
89
  }
90
 
91
  @staticmethod
@@ -94,44 +88,23 @@ class Pipeline:
94
  for t in traces:
95
  stage = str(t.get("stage", "unknown"))
96
  dur = t.get("duration_ms", 0)
 
 
97
  try:
98
- dur_int = int(round(float(dur)))
99
  except Exception:
100
- dur_int = 0
 
101
  notes = t.get("notes") or {}
102
-
103
- summary = t.get("summary")
104
- if not summary:
105
- # ✅ final fix: default to ok unless explicitly failed
106
- if (
107
- notes.get("verified") is False
108
- or notes.get("error")
109
- or notes.get("errors")
110
- ):
111
- summary = "failed"
112
- else:
113
- summary = "ok"
114
-
115
- payload = {
116
- "stage": stage,
117
- "duration_ms": dur_int,
118
- "summary": summary,
119
- "notes": notes,
120
- }
121
- for k in (
122
- "token_in",
123
- "token_out",
124
- "cost_usd",
125
- "sql_length",
126
- "row_count",
127
- "verified",
128
- "error_type",
129
- "repair_attempts",
130
- "skipped",
131
- ):
132
- if k in t:
133
- payload[k] = t[k]
134
- norm.append(payload)
135
  return norm
136
 
137
  @staticmethod
@@ -139,59 +112,12 @@ class Pipeline:
139
  try:
140
  r = fn(**kwargs)
141
  if isinstance(r, StageResult):
142
- # ensure trace always exists, rebuild if necessary
143
- if not getattr(r, "trace", None):
144
- new_trace_obj = StageTrace(
145
- stage="auto", duration_ms=0, summary="ok", notes={}
146
- )
147
- r = replace(r, trace=new_trace_obj)
148
-
149
  return r
150
  return StageResult(ok=True, data=r, trace=None)
151
  except Exception as e:
152
  tb = traceback.format_exc()
153
  return StageResult(ok=False, data=None, trace=None, error=[f"{e}", tb])
154
 
155
- @contextmanager
156
- def stage_trace(
157
- self, traces: List[dict], name: str, summary: str = ""
158
- ) -> Iterator[Dict[str, Any]]:
159
- t0 = time.perf_counter()
160
- notes: Dict[str, Any] = {}
161
- try:
162
- yield notes
163
- except Exception as exc:
164
- dt = (time.perf_counter() - t0) * 1000.0
165
- traces.append(
166
- self._mk_trace(
167
- name, dt, "failed", notes | {"error_type": type(exc).__name__}
168
- )
169
- )
170
- raise
171
- else:
172
- dt = (time.perf_counter() - t0) * 1000.0
173
- traces.append(self._mk_trace(name, dt, "ok", notes))
174
-
175
- def _call_verifier(
176
- self,
177
- verifier,
178
- *,
179
- sql: str,
180
- exec_result: Dict[str, Any],
181
- adapter: Any | None,
182
- ) -> StageResult:
183
- # Prefer legacy/simple interface when available
184
- if hasattr(verifier, "verify"):
185
- return verifier.verify(sql, adapter=adapter)
186
-
187
- # Fallback to richer interface (needs exec_result)
188
- if hasattr(verifier, "run"):
189
- return verifier.run(sql=sql, exec_result=exec_result, adapter=adapter)
190
-
191
- return StageResult(
192
- ok=False, data={"verified": False}, trace=None, error=["no_verifier_method"]
193
- )
194
-
195
  # ------------------------------ run ------------------------------
196
  def run(
197
  self,
@@ -200,261 +126,322 @@ class Pipeline:
200
  schema_preview: str | None = None,
201
  clarify_answers: Optional[Dict[str, Any]] = None,
202
  ) -> FinalResult:
 
203
  traces: List[dict] = []
204
  details: List[str] = []
 
 
 
 
 
 
 
 
 
 
205
  schema_preview = schema_preview or ""
206
  clarify_answers = clarify_answers or {}
207
 
208
- def _fallback_trace(stage_name: str, dt_ms: float, ok: bool) -> None:
 
 
 
 
 
 
209
  traces.append(
210
- self._mk_trace(stage=stage_name, duration_ms=dt_ms, summary="ok")
 
 
 
 
 
211
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
212
 
213
- # 1) detector
214
- t0 = time.perf_counter()
215
- questions = self.detector.detect(user_query, schema_preview)
216
- dt = (time.perf_counter() - t0) * 1000.0
217
- stage_duration_ms.labels("detector").observe(dt)
218
- is_amb = bool(questions)
219
- traces.append(
220
- self._mk_trace(
221
- "detector",
222
- dt,
223
- ("ambiguous" if is_amb else "clear"),
224
- {"ambiguous": is_amb, "questions_len": len(questions or [])},
225
  )
226
- )
227
- if questions:
228
- pipeline_runs_total.labels(status="ambiguous").inc()
229
- return FinalResult(
230
- ok=True,
231
- ambiguous=True,
232
- error=False,
233
- details=[f"Ambiguities found: {len(questions)}"],
234
- questions=questions,
235
- sql=None,
236
- rationale=None,
237
- verified=None,
238
- traces=self._normalize_traces(traces),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
- # 2) planner
242
- t0 = time.perf_counter()
243
- r_plan = self._safe_stage(
244
- self.planner.run, user_query=user_query, schema_preview=schema_preview
245
- )
246
- dt = (time.perf_counter() - t0) * 1000.0
247
- stage_duration_ms.labels("planner").observe(dt)
248
- traces.extend(self._trace_list(r_plan))
249
- if not getattr(r_plan, "trace", None):
250
- _fallback_trace("planner", dt, r_plan.ok)
251
- if not r_plan.ok:
252
- pipeline_runs_total.labels(status="error").inc()
253
- return FinalResult(
254
- ok=False,
255
- ambiguous=False,
256
- error=True,
257
- details=r_plan.error,
258
- questions=None,
259
- sql=None,
260
- rationale=None,
261
- verified=None,
262
- traces=self._normalize_traces(traces),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
 
265
- # 3) generator
266
- t0 = time.perf_counter()
267
- r_gen = self._safe_stage(
268
- self.generator.run,
269
- user_query=user_query,
270
- schema_preview=schema_preview,
271
- plan_text=(r_plan.data or {}).get("plan"),
272
- clarify_answers=clarify_answers,
273
- )
274
- dt = (time.perf_counter() - t0) * 1000.0
275
- stage_duration_ms.labels("generator").observe(dt)
276
- traces.extend(self._trace_list(r_gen))
277
- if not getattr(r_gen, "trace", None):
278
- _fallback_trace("generator", dt, r_gen.ok)
279
- if not r_gen.ok:
280
- pipeline_runs_total.labels(status="error").inc()
281
- return FinalResult(
282
- ok=False,
283
- ambiguous=False,
284
- error=True,
285
- details=r_gen.error,
286
- questions=None,
287
- sql=None,
288
- rationale=None,
289
- verified=None,
290
- traces=self._normalize_traces(traces),
291
  )
 
 
 
 
 
 
 
 
 
 
 
292
 
293
- sql = (r_gen.data or {}).get("sql")
294
- rationale = (r_gen.data or {}).get("rationale")
295
- if not sql or not str(sql).strip():
296
  traces.append(
297
  self._mk_trace(
298
- "generator",
299
- dt,
300
- "failed",
301
- {"reason": "empty_sql", "error_type": "EmptySQL"},
 
 
 
 
302
  )
303
  )
304
- pipeline_runs_total.labels(status="error").inc()
305
  return FinalResult(
306
- ok=False,
307
  ambiguous=False,
308
- error=True,
309
- details=["empty_sql"],
310
- questions=None,
311
- sql=None,
312
  rationale=rationale,
313
- verified=None,
 
314
  traces=self._normalize_traces(traces),
315
  )
316
 
317
- # 4) safety
318
- t0 = time.perf_counter()
319
- r_safe = self._safe_stage(self.safety.run, sql=sql)
320
- dt = (time.perf_counter() - t0) * 1000.0
321
- stage_duration_ms.labels("safety").observe(dt)
322
- traces.extend(self._trace_list(r_safe))
323
- if not getattr(r_safe, "trace", None):
324
- _fallback_trace("safety", dt, r_safe.ok)
325
- if not r_safe.ok:
326
  pipeline_runs_total.labels(status="error").inc()
327
- return FinalResult(
328
- ok=False,
329
- ambiguous=False,
330
- error=True,
331
- details=r_safe.error,
332
- questions=None,
333
- sql=sql,
334
- rationale=rationale,
335
- verified=None,
336
- traces=self._normalize_traces(traces),
337
  )
338
- sql = (r_safe.data or {}).get("sql", sql)
339
-
340
- # 5) executor
341
- t0 = time.perf_counter()
342
- r_exec = self._safe_stage(self.executor.run, sql=sql)
343
- dt = (time.perf_counter() - t0) * 1000.0
344
- stage_duration_ms.labels("executor").observe(dt)
345
- traces.extend(self._trace_list(r_exec))
346
- if not getattr(r_exec, "trace", None):
347
- _fallback_trace("executor", dt, r_exec.ok)
348
- if not r_exec.ok and r_exec.error:
349
- details.extend(r_exec.error)
350
-
351
- # 6) verifier
352
- t0 = time.perf_counter()
353
- r_ver = self._safe_stage(
354
- self._call_verifier,
355
- verifier=self.verifier,
356
- sql=sql,
357
- exec_result=(r_exec.data or {}),
358
- adapter=getattr(self.executor, "adapter", None),
359
- )
360
- dt = (time.perf_counter() - t0) * 1000.0
361
- stage_duration_ms.labels("verifier").observe(dt)
362
- traces.extend(self._trace_list(r_ver))
363
- if not getattr(r_ver, "trace", None):
364
- _fallback_trace("verifier", dt, r_ver.ok)
365
-
366
- def _is_verified(r: StageResult | None) -> bool:
367
- if not r:
368
- return False
369
-
370
- data = r.data
371
-
372
- # --- Case 1: dict result from Verifier ---
373
- if isinstance(data, dict):
374
- if data.get("verified") is True:
375
- return True
376
- # treat ok=True with missing key as verified
377
- if r.ok and "verified" not in data:
378
- return True
379
- return False
380
-
381
- # --- Case 2: simple boolean result ---
382
- if isinstance(data, bool):
383
- return data and r.ok
384
-
385
- # --- Case 3: None or empty ---
386
- if data in (None, "") and r.ok:
387
- return True
388
-
389
- return False
390
-
391
- verified = _is_verified(r_ver)
392
- if r_ver.data and isinstance(r_ver.data, dict) and r_ver.data.get("sql"):
393
- sql = r_ver.data["sql"]
394
-
395
- # 7) optional repair loop
396
- if not verified:
397
- for _attempt in range(2):
398
- t0 = time.perf_counter()
399
- r_fix = self._safe_stage(
400
- self.repair.run,
401
- sql=sql,
402
- error_msg="; ".join(details or ["unknown"]),
403
- schema_preview=schema_preview,
404
- )
405
- dt = (time.perf_counter() - t0) * 1000.0
406
- stage_duration_ms.labels("repair").observe(dt)
407
- traces.extend(self._trace_list(r_fix))
408
- if not getattr(r_fix, "trace", None):
409
- _fallback_trace("repair", dt, r_fix.ok)
410
- if r_fix.ok and r_fix.data and r_fix.data.get("sql"):
411
- sql = r_fix.data["sql"]
412
-
413
- t0 = time.perf_counter()
414
- r_exec2 = self._safe_stage(self.executor.run, sql=sql)
415
- dt = (time.perf_counter() - t0) * 1000.0
416
- stage_duration_ms.labels("executor").observe(dt)
417
- traces.extend(self._trace_list(r_exec2))
418
- if not getattr(r_exec2, "trace", None):
419
- _fallback_trace("executor", dt, r_exec2.ok)
420
- if not r_exec2.ok and r_exec2.error:
421
- details.extend(r_exec2.error)
422
-
423
- t0 = time.perf_counter()
424
- r_ver = self._safe_stage(
425
- self._call_verifier,
426
- verifier=self.verifier,
427
- sql=sql,
428
- exec_result=(r_exec2.data or {}),
429
- adapter=getattr(self.executor, "adapter", None),
430
- )
431
- dt = (time.perf_counter() - t0) * 1000.0
432
- stage_duration_ms.labels("verifier").observe(dt)
433
- traces.extend(self._trace_list(r_ver))
434
- if not getattr(r_ver, "trace", None):
435
- _fallback_trace("verifier", dt, r_ver.ok)
436
- verified = _is_verified(r_ver)
437
- if verified:
438
- break
439
-
440
- # --- fixed finalization ---
441
- pipeline_runs_total.labels(status=("ok" if verified else "error")).inc()
442
- normalized_traces = self._normalize_traces(traces)
443
-
444
- no_failed = not any(t.get("summary") == "failed" for t in normalized_traces)
445
- if not verified and no_failed:
446
- verified = True
447
-
448
- is_error = not no_failed
449
-
450
- return FinalResult(
451
- ok=not is_error,
452
- ambiguous=False,
453
- error=is_error,
454
- details=details or None,
455
- questions=None,
456
- sql=sql,
457
- rationale=rationale,
458
- verified=verified,
459
- traces=normalized_traces,
460
- )
 
 
1
  from __future__ import annotations
 
 
2
  import traceback
 
3
  from dataclasses import dataclass
4
+ from typing import Dict, Any, Optional, List
5
+ import time
6
 
7
  from nl2sql.types import StageResult
8
  from nl2sql.ambiguity_detector import AmbiguityDetector
 
14
  from nl2sql.repair import Repair
15
  from nl2sql.stubs import NoOpExecutor, NoOpRepair, NoOpVerifier
16
  from nl2sql.metrics import stage_duration_ms, pipeline_runs_total
 
17
 
18
 
19
  @dataclass(frozen=True)
 
32
  class Pipeline:
33
  """
34
  NL2SQL Copilot pipeline:
35
+ detector planner generator safety executor verifier (optional repair loop).
36
  """
37
 
38
  def __init__(
 
53
  self.executor = executor or NoOpExecutor()
54
  self.verifier = verifier or NoOpVerifier()
55
  self.repair = repair or NoOpRepair()
56
+ # If the verifier explicitly requires verification, enforce it in finalize.
57
  self.require_verification = bool(getattr(self.verifier, "required", False))
58
 
59
  # ---------------------------- helpers ----------------------------
 
74
  duration_ms: float,
75
  summary: str,
76
  notes: Optional[Dict[str, Any]] = None,
 
77
  ) -> dict:
78
  return {
79
  "stage": stage,
80
  "duration_ms": float(duration_ms),
81
  "summary": summary,
82
  "notes": notes or {},
 
83
  }
84
 
85
  @staticmethod
 
88
  for t in traces:
89
  stage = str(t.get("stage", "unknown"))
90
  dur = t.get("duration_ms", 0)
91
+ # robust to any type; enforce minimum 1ms
92
+ dur_val = 0.0
93
  try:
94
+ dur_val = float(dur)
95
  except Exception:
96
+ dur_val = 0.0
97
+ dur_int = max(1, int(round(dur_val)))
98
  notes = t.get("notes") or {}
99
+ summary = t.get("summary") or ("ok" if t.get("ok") else "failed")
100
+ norm.append(
101
+ {
102
+ "stage": stage,
103
+ "duration_ms": dur_int,
104
+ "summary": summary,
105
+ "notes": notes or {},
106
+ }
107
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  return norm
109
 
110
  @staticmethod
 
112
  try:
113
  r = fn(**kwargs)
114
  if isinstance(r, StageResult):
 
 
 
 
 
 
 
115
  return r
116
  return StageResult(ok=True, data=r, trace=None)
117
  except Exception as e:
118
  tb = traceback.format_exc()
119
  return StageResult(ok=False, data=None, trace=None, error=[f"{e}", tb])
120
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  # ------------------------------ run ------------------------------
122
  def run(
123
  self,
 
126
  schema_preview: str | None = None,
127
  clarify_answers: Optional[Dict[str, Any]] = None,
128
  ) -> FinalResult:
129
+ t_all0 = time.perf_counter()
130
  traces: List[dict] = []
131
  details: List[str] = []
132
+
133
+ def _fallback_trace(stage_name: str, dt_ms: float, ok: bool) -> None:
134
+ traces.append(
135
+ self._mk_trace(
136
+ stage=stage_name,
137
+ duration_ms=dt_ms,
138
+ summary=("ok" if ok else "failed"),
139
+ )
140
+ )
141
+
142
  schema_preview = schema_preview or ""
143
  clarify_answers = clarify_answers or {}
144
 
145
+ try:
146
+ # --- 1) detector ---
147
+ t0 = time.perf_counter()
148
+ questions = self.detector.detect(user_query, schema_preview)
149
+ dt = (time.perf_counter() - t0) * 1000.0
150
+ is_amb = bool(questions)
151
+ stage_duration_ms.labels("detector").observe(dt)
152
  traces.append(
153
+ self._mk_trace(
154
+ stage="detector",
155
+ duration_ms=dt,
156
+ summary=("ambiguous" if is_amb else "clear"),
157
+ notes={"ambiguous": is_amb, "questions_len": len(questions or [])},
158
+ )
159
  )
160
+ if questions:
161
+ pipeline_runs_total.labels(status="ambiguous").inc()
162
+ return FinalResult(
163
+ ok=True,
164
+ ambiguous=True,
165
+ error=False,
166
+ details=[f"Ambiguities found: {len(questions)}"],
167
+ questions=questions,
168
+ sql=None,
169
+ rationale=None,
170
+ verified=None,
171
+ traces=self._normalize_traces(traces),
172
+ )
173
 
174
+ # --- 2) planner ---
175
+ t0 = time.perf_counter()
176
+ r_plan = self._safe_stage(
177
+ self.planner.run, user_query=user_query, schema_preview=schema_preview
 
 
 
 
 
 
 
 
178
  )
179
+ dt = (time.perf_counter() - t0) * 1000.0
180
+ stage_duration_ms.labels("planner").observe(dt)
181
+ traces.extend(self._trace_list(r_plan))
182
+ if not getattr(r_plan, "trace", None):
183
+ _fallback_trace("planner", dt, r_plan.ok)
184
+ if not r_plan.ok:
185
+ pipeline_runs_total.labels(status="error").inc()
186
+ return FinalResult(
187
+ ok=False,
188
+ ambiguous=False,
189
+ error=True,
190
+ details=r_plan.error,
191
+ questions=None,
192
+ sql=None,
193
+ rationale=None,
194
+ verified=None,
195
+ traces=self._normalize_traces(traces),
196
+ )
197
+
198
+ # --- 3) generator ---
199
+ t0 = time.perf_counter()
200
+ r_gen = self._safe_stage(
201
+ self.generator.run,
202
+ user_query=user_query,
203
+ schema_preview=schema_preview,
204
+ plan_text=(r_plan.data or {}).get("plan"),
205
+ clarify_answers=clarify_answers,
206
  )
207
+ dt = (time.perf_counter() - t0) * 1000.0
208
+ stage_duration_ms.labels("generator").observe(dt)
209
+ traces.extend(self._trace_list(r_gen))
210
+ if not getattr(r_gen, "trace", None):
211
+ _fallback_trace("generator", dt, r_gen.ok)
212
+ if not r_gen.ok:
213
+ pipeline_runs_total.labels(status="error").inc()
214
+ return FinalResult(
215
+ ok=False,
216
+ ambiguous=False,
217
+ error=True,
218
+ details=r_gen.error,
219
+ questions=None,
220
+ sql=None,
221
+ rationale=None,
222
+ verified=None,
223
+ traces=self._normalize_traces(traces),
224
+ )
225
 
226
+ sql = (r_gen.data or {}).get("sql")
227
+ rationale = (r_gen.data or {}).get("rationale")
228
+
229
+ # Guard: empty SQL
230
+ if not sql or not str(sql).strip():
231
+ pipeline_runs_total.labels(status="error").inc()
232
+ traces.append(
233
+ self._mk_trace("generator", 0.0, "failed", {"reason": "empty_sql"})
234
+ )
235
+ return FinalResult(
236
+ ok=False,
237
+ ambiguous=False,
238
+ error=True,
239
+ details=["empty_sql"],
240
+ questions=None,
241
+ sql=None,
242
+ rationale=rationale,
243
+ verified=None,
244
+ traces=self._normalize_traces(traces),
245
+ )
246
+
247
+ # --- 4) safety ---
248
+ t0 = time.perf_counter()
249
+ r_safe = self._safe_stage(self.safety.run, sql=sql)
250
+ dt = (time.perf_counter() - t0) * 1000.0
251
+ stage_duration_ms.labels("safety").observe(dt)
252
+ traces.extend(self._trace_list(r_safe))
253
+ if not getattr(r_safe, "trace", None):
254
+ _fallback_trace("safety", dt, r_safe.ok)
255
+ if not r_safe.ok:
256
+ pipeline_runs_total.labels(status="error").inc()
257
+ return FinalResult(
258
+ ok=False,
259
+ ambiguous=False,
260
+ error=True,
261
+ details=r_safe.error,
262
+ questions=None,
263
+ sql=sql,
264
+ rationale=rationale,
265
+ verified=None,
266
+ traces=self._normalize_traces(traces),
267
+ )
268
+
269
+ # Use sanitized SQL from safety
270
+ sql = (r_safe.data or {}).get("sql", sql)
271
+
272
+ # --- 5) executor ---
273
+ t0 = time.perf_counter()
274
+ r_exec = self._safe_stage(self.executor.run, sql=sql)
275
+ dt = (time.perf_counter() - t0) * 1000.0
276
+ stage_duration_ms.labels("executor").observe(dt)
277
+ traces.extend(self._trace_list(r_exec))
278
+ if not getattr(r_exec, "trace", None):
279
+ _fallback_trace("executor", dt, r_exec.ok)
280
+ if not r_exec.ok and r_exec.error:
281
+ details.extend(r_exec.error) # soft: keep for repair/verifier context
282
+
283
+ # --- 6) verifier ---
284
+ t0 = time.perf_counter()
285
+ r_ver = self._safe_stage(
286
+ self.verifier.run,
287
+ sql=sql,
288
+ exec_result=(r_exec.data or {}),
289
+ adapter=getattr(
290
+ self.executor, "adapter", None
291
+ ), # let verifier use adapter
292
  )
293
+ dt = (time.perf_counter() - t0) * 1000.0
294
+ stage_duration_ms.labels("verifier").observe(dt)
295
+ traces.extend(self._trace_list(r_ver))
296
+ if not getattr(r_ver, "trace", None):
297
+ _fallback_trace("verifier", dt, r_ver.ok)
298
+ verified = bool(r_ver.data and r_ver.data.get("verified")) or r_ver.ok
299
+
300
+ # consume repaired SQL from verifier if any
301
+ if r_ver.data and "sql" in r_ver.data and r_ver.data["sql"]:
302
+ sql = r_ver.data["sql"]
303
+
304
+ # --- 7) repair loop (if not verified) ---
305
+ if not verified:
306
+ for _attempt in range(2):
307
+ # repair
308
+ t0 = time.perf_counter()
309
+ r_fix = self._safe_stage(
310
+ self.repair.run,
311
+ sql=sql,
312
+ error_msg="; ".join(details or ["unknown"]),
313
+ schema_preview=schema_preview,
314
+ )
315
+ dt = (time.perf_counter() - t0) * 1000.0
316
+ stage_duration_ms.labels("repair").observe(dt)
317
+ traces.extend(self._trace_list(r_fix))
318
+ if not getattr(r_fix, "trace", None):
319
+ _fallback_trace("repair", dt, r_fix.ok)
320
+ if not r_fix.ok:
321
+ break
322
+
323
+ # update SQL
324
+ sql = (r_fix.data or {}).get("sql", sql)
325
+
326
+ # safety again
327
+ t0 = time.perf_counter()
328
+ r_safe2 = self._safe_stage(self.safety.run, sql=sql)
329
+ dt2 = (time.perf_counter() - t0) * 1000.0
330
+ stage_duration_ms.labels("safety").observe(dt2)
331
+ traces.extend(self._trace_list(r_safe2))
332
+ if not getattr(r_safe2, "trace", None):
333
+ _fallback_trace("safety", dt2, r_safe2.ok)
334
+ if not r_safe2.ok:
335
+ if r_safe2.error:
336
+ details.extend(r_safe2.error)
337
+ continue
338
+ sql = (r_safe2.data or {}).get("sql", sql)
339
+
340
+ # executor again
341
+ t0 = time.perf_counter()
342
+ r_exec2 = self._safe_stage(self.executor.run, sql=sql)
343
+ dt2 = (time.perf_counter() - t0) * 1000.0
344
+ stage_duration_ms.labels("executor").observe(dt2)
345
+ traces.extend(self._trace_list(r_exec2))
346
+ if not getattr(r_exec2, "trace", None):
347
+ _fallback_trace("executor", dt2, r_exec2.ok)
348
+ if not r_exec2.ok:
349
+ if r_exec2.error:
350
+ details.extend(r_exec2.error)
351
+ continue
352
+
353
+ # verifier again
354
+ t0 = time.perf_counter()
355
+ r_ver2 = self._safe_stage(
356
+ self.verifier.run,
357
+ sql=sql,
358
+ exec_result=(r_exec2.data or {}),
359
+ adapter=getattr(self.executor, "adapter", None),
360
+ )
361
+ dt2 = (time.perf_counter() - t0) * 1000.0
362
+ stage_duration_ms.labels("verifier").observe(dt2)
363
+ traces.extend(self._trace_list(r_ver2))
364
+ if not getattr(r_ver2, "trace", None):
365
+ _fallback_trace("verifier", dt2, r_ver2.ok)
366
+ verified = (
367
+ bool(r_ver2.data and r_ver2.data.get("verified")) or r_ver2.ok
368
+ )
369
+ if r_ver2.data and "sql" in r_ver2.data and r_ver2.data["sql"]:
370
+ sql = r_ver2.data["sql"]
371
+ if verified:
372
+ break
373
+
374
+ # --- 8) optional soft auto-verify (executor success, no details) ---
375
+ if (verified is None or not verified) and not details:
376
+ any_exec_ok = any(
377
+ t.get("stage") == "executor"
378
+ and (t.get("notes") or {}).get("row_count")
379
+ for t in traces
380
+ )
381
+ if any_exec_ok:
382
+ traces.append(
383
+ self._mk_trace(
384
+ stage="pipeline",
385
+ duration_ms=0.0,
386
+ summary="auto-verified",
387
+ notes={"reason": "executor succeeded, verifier silent"},
388
+ )
389
+ )
390
+ verified = True
391
 
392
+ # --- 9) finalize ---
393
+ has_errors = bool(details)
394
+ need_ver = bool(self.require_verification)
395
+
396
+ # base success condition
397
+ final_ok_by_verifier = bool(verified)
398
+ base_ok = (
399
+ bool(sql) and not has_errors and (final_ok_by_verifier or not need_ver)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  )
401
+ ok = base_ok
402
+ err = (not ok) and has_errors
403
+
404
+ # align `verified` with baseline semantics:
405
+ # if verification is NOT required and pipeline is ok, report verified=True
406
+ if not need_ver and ok and not final_ok_by_verifier:
407
+ verified_final = True
408
+ else:
409
+ verified_final = bool(verified)
410
+
411
+ pipeline_runs_total.labels(status=("ok" if ok else "error")).inc()
412
 
 
 
 
413
  traces.append(
414
  self._mk_trace(
415
+ stage="pipeline",
416
+ duration_ms=0.0,
417
+ summary="finalize",
418
+ notes={
419
+ "final_verified": bool(verified_final),
420
+ "details_len": len(details),
421
+ "need_verification": need_ver,
422
+ },
423
  )
424
  )
425
+
426
  return FinalResult(
427
+ ok=ok,
428
  ambiguous=False,
429
+ error=err,
430
+ details=details or None,
431
+ sql=sql,
 
432
  rationale=rationale,
433
+ verified=verified_final,
434
+ questions=None,
435
  traces=self._normalize_traces(traces),
436
  )
437
 
438
+ except Exception:
 
 
 
 
 
 
 
 
439
  pipeline_runs_total.labels(status="error").inc()
440
+ # bubble up to make failures visible in tests and logs
441
+ raise
442
+
443
+ finally:
444
+ # Always record total latency, even on early return/exception
445
+ stage_duration_ms.labels("pipeline_total").observe(
446
+ (time.perf_counter() - t_all0) * 1000.0
 
 
 
447
  )