Spaces:

md896
/

sql-debug-env

Running

md896 committed 14 days ago

Commit

0730521

verified ·

1 Parent(s): 4ca17fb

Upload folder using huggingface_hub

Files changed (6) hide show

artifacts/runs/20260426-034616-final-corrected-eval/api_errors.json ADDED Viewed

+{
+  "base_errors": [],
+  "trained_errors": []
+}

artifacts/runs/20260426-034616-final-corrected-eval/comparison_table.csv ADDED Viewed

+task,baseline_reward,post_reward,delta,relative_delta_percent
+easy_syntax_fix,0.111850,0.123100,0.011250,10.06
+medium_logic_fix,0.129350,0.101850,-0.027500,-21.26
+hard_multi_bug,0.100600,0.100600,0.000000,0.00
+hard_finance_explosion,0.103750,0.100413,-0.003337,-3.22
+overall,0.111388,0.106491,-0.004897,-4.40

artifacts/runs/20260426-034616-final-corrected-eval/comparison_table.md ADDED Viewed

+# Corrected Base vs Trained Evaluation
+- samples_per_task: 8
+- base_error_count: 0
+- trained_error_count: 0
+| task | baseline | trained | delta | relative delta % |
+|---|---:|---:|---:|---:|
+| easy_syntax_fix | 0.111850 | 0.123100 | 0.011250 | 10.06% |
+| medium_logic_fix | 0.129350 | 0.101850 | -0.027500 | -21.26% |
+| hard_multi_bug | 0.100600 | 0.100600 | 0.000000 | 0.00% |
+| hard_finance_explosion | 0.103750 | 0.100413 | -0.003337 | -3.22% |
+| overall | 0.111388 | 0.106491 | -0.004897 | -4.40% |

artifacts/runs/20260426-034616-final-corrected-eval/corrected_metrics.json ADDED Viewed

+{
+  "task_ids": [
+    "easy_syntax_fix",
+    "medium_logic_fix",
+    "hard_multi_bug",
+    "hard_finance_explosion"
+  ],
+  "samples_per_task": 8,
+  "per_task_baseline_reward": {
+    "easy_syntax_fix": 0.11185,
+    "medium_logic_fix": 0.12935,
+    "hard_multi_bug": 0.10060000000000001,
+    "hard_finance_explosion": 0.10375
+  },
+  "per_task_post_reward": {
+    "easy_syntax_fix": 0.12310000000000001,
+    "medium_logic_fix": 0.10185000000000001,
+    "hard_multi_bug": 0.10060000000000001,
+    "hard_finance_explosion": 0.1004125
+  },
+  "delta_per_task": {
+    "easy_syntax_fix": 0.01125000000000001,
+    "medium_logic_fix": -0.027499999999999983,
+    "hard_multi_bug": 0.0,
+    "hard_finance_explosion": -0.0033374999999999932
+  },
+  "baseline_avg_reward": 0.11138750000000003,
+  "post_avg_reward": 0.10649062500000002,
+  "base_error_count": 0,
+  "trained_error_count": 0,
+  "delta_avg_reward": -0.004896875000000009
+}

artifacts/runs/20260426-034616-final-corrected-eval/performance_comparison_corrected.png ADDED Viewed

artifacts/runs/20260426-034616-final-corrected-eval/task_delta_corrected.png ADDED Viewed