md896 committed on
Commit
0730521
·
verified ·
1 Parent(s): 4ca17fb

Upload folder using huggingface_hub

Browse files
artifacts/runs/20260426-034616-final-corrected-eval/api_errors.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "base_errors": [],
3
+ "trained_errors": []
4
+ }
artifacts/runs/20260426-034616-final-corrected-eval/comparison_table.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ task,baseline_reward,post_reward,delta,relative_delta_percent
2
+ easy_syntax_fix,0.111850,0.123100,0.011250,10.06
3
+ medium_logic_fix,0.129350,0.101850,-0.027500,-21.26
4
+ hard_multi_bug,0.100600,0.100600,0.000000,0.00
5
+ hard_finance_explosion,0.103750,0.100413,-0.003337,-3.22
6
+ overall,0.111388,0.106491,-0.004897,-4.40
artifacts/runs/20260426-034616-final-corrected-eval/comparison_table.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Corrected Base vs Trained Evaluation
2
+
3
+ - samples_per_task: 8
4
+ - base_error_count: 0
5
+ - trained_error_count: 0
6
+
7
+ | task | baseline | trained | delta | relative delta % |
8
+ |---|---:|---:|---:|---:|
9
+ | easy_syntax_fix | 0.111850 | 0.123100 | 0.011250 | 10.06% |
10
+ | medium_logic_fix | 0.129350 | 0.101850 | -0.027500 | -21.26% |
11
+ | hard_multi_bug | 0.100600 | 0.100600 | 0.000000 | 0.00% |
12
+ | hard_finance_explosion | 0.103750 | 0.100413 | -0.003337 | -3.22% |
13
+ | overall | 0.111388 | 0.106491 | -0.004897 | -4.40% |
artifacts/runs/20260426-034616-final-corrected-eval/corrected_metrics.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task_ids": [
3
+ "easy_syntax_fix",
4
+ "medium_logic_fix",
5
+ "hard_multi_bug",
6
+ "hard_finance_explosion"
7
+ ],
8
+ "samples_per_task": 8,
9
+ "per_task_baseline_reward": {
10
+ "easy_syntax_fix": 0.11185,
11
+ "medium_logic_fix": 0.12935,
12
+ "hard_multi_bug": 0.10060000000000001,
13
+ "hard_finance_explosion": 0.10375
14
+ },
15
+ "per_task_post_reward": {
16
+ "easy_syntax_fix": 0.12310000000000001,
17
+ "medium_logic_fix": 0.10185000000000001,
18
+ "hard_multi_bug": 0.10060000000000001,
19
+ "hard_finance_explosion": 0.1004125
20
+ },
21
+ "delta_per_task": {
22
+ "easy_syntax_fix": 0.01125000000000001,
23
+ "medium_logic_fix": -0.027499999999999983,
24
+ "hard_multi_bug": 0.0,
25
+ "hard_finance_explosion": -0.0033374999999999932
26
+ },
27
+ "baseline_avg_reward": 0.11138750000000003,
28
+ "post_avg_reward": 0.10649062500000002,
29
+ "base_error_count": 0,
30
+ "trained_error_count": 0,
31
+ "delta_avg_reward": -0.004896875000000009
32
+ }
artifacts/runs/20260426-034616-final-corrected-eval/performance_comparison_corrected.png ADDED
artifacts/runs/20260426-034616-final-corrected-eval/task_delta_corrected.png ADDED