bensondccnqwc commited on
Commit
83dc46b
·
verified ·
1 Parent(s): ffd9f88

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  2. eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  3. eval_results_avg32/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  4. eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  5. eval_results_avg32/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  6. eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  7. eval_results_avg32/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  8. eval_results_avg32/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  9. eval_results_avg32/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  10. eval_results_avg32/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  11. eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  12. eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  13. eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  14. eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  15. eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  16. eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  17. eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  18. eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  19. eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  20. eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  21. eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  22. eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  23. eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  24. eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  25. eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  26. eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  27. eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  28. eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  29. eval_results_avg4/eval_results.csv +12 -0
  30. eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  31. eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  32. eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  33. eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  34. eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  35. eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  36. eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  37. eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  38. eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  39. eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  40. eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  41. eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  42. eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  43. eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  44. eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  45. eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  46. eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  47. eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  48. eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  49. eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  50. eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 3,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 3.3,
10
+ "2": 5.8,
11
+ "4": 9.3,
12
+ "8": 13.9,
13
+ "16": 19.3,
14
+ "32": 23.3
15
+ },
16
+ "time_use_in_second": 529.1417696475983,
17
+ "time_use_in_minite": "8:49"
18
+ }
eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 7.3,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 7.3,
10
+ "2": 11.1,
11
+ "4": 15.3,
12
+ "8": 19.4,
13
+ "16": 22.3,
14
+ "32": 23.3
15
+ },
16
+ "time_use_in_second": 819.9961113929749,
17
+ "time_use_in_minite": "13:39"
18
+ }
eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 1,
6
+ "acc": 4.3,
7
+ "pass_acc": 30.0,
8
+ "pass@k": {
9
+ "1": 4.3,
10
+ "2": 7.1,
11
+ "4": 11.0,
12
+ "8": 16.1,
13
+ "16": 22.4,
14
+ "32": 30.0
15
+ },
16
+ "time_use_in_second": 577.8698544502258,
17
+ "time_use_in_minite": "9:37"
18
+ }
eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 6.5,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 6.5,
10
+ "2": 9.0,
11
+ "4": 12.0,
12
+ "8": 15.8,
13
+ "16": 20.1,
14
+ "32": 23.3
15
+ },
16
+ "time_use_in_second": 829.8540849685669,
17
+ "time_use_in_minite": "13:49"
18
+ }
eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 3,
5
+ "empty_samples": 0,
6
+ "acc": 3.6,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 3.6,
10
+ "2": 5.9,
11
+ "4": 8.7,
12
+ "8": 11.9,
13
+ "16": 16.3,
14
+ "32": 23.3
15
+ },
16
+ "time_use_in_second": 546.3630204200745,
17
+ "time_use_in_minite": "9:06"
18
+ }
eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 6.9,
7
+ "pass_acc": 26.7,
8
+ "pass@k": {
9
+ "1": 6.9,
10
+ "2": 10.1,
11
+ "4": 13.8,
12
+ "8": 18.1,
13
+ "16": 22.4,
14
+ "32": 26.7
15
+ },
16
+ "time_use_in_second": 647.9162139892578,
17
+ "time_use_in_minite": "10:47"
18
+ }
eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 3.2,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 3.2,
10
+ "2": 5.4,
11
+ "4": 8.2,
12
+ "8": 11.6,
13
+ "16": 16.3,
14
+ "32": 23.3
15
+ },
16
+ "time_use_in_second": 469.15487265586853,
17
+ "time_use_in_minite": "7:49"
18
+ }
eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 8.3,
7
+ "pass_acc": 33.3,
8
+ "pass@k": {
9
+ "1": 8.3,
10
+ "2": 12.4,
11
+ "4": 17.1,
12
+ "8": 22.3,
13
+ "16": 27.9,
14
+ "32": 33.3
15
+ },
16
+ "time_use_in_second": 582.5299682617188,
17
+ "time_use_in_minite": "9:42"
18
+ }
eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 2,
5
+ "empty_samples": 0,
6
+ "acc": 4.7,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 4.7,
10
+ "2": 7.5,
11
+ "4": 10.8,
12
+ "8": 14.6,
13
+ "16": 19.1,
14
+ "32": 23.3
15
+ },
16
+ "time_use_in_second": 463.4439477920532,
17
+ "time_use_in_minite": "7:43"
18
+ }
eval_results_avg4/eval_results.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,aime24_acc,aime24_pass_acc,aime24_tokens,aime24_keywords,aime24_correct_tokens,aime24_wrong_tokens,aime24_clip_ratio,aime24_stop_tokens,aime24_stop_ratio,aime24_box_ratio,aime24_repeat_ratio,aime25_acc,aime25_pass_acc,aime25_tokens,aime25_keywords,aime25_correct_tokens,aime25_wrong_tokens,aime25_clip_ratio,aime25_stop_tokens,aime25_stop_ratio,aime25_box_ratio,aime25_repeat_ratio,amc23_acc,amc23_pass_acc,amc23_tokens,amc23_keywords,amc23_correct_tokens,amc23_wrong_tokens,amc23_clip_ratio,amc23_stop_tokens,amc23_stop_ratio,amc23_box_ratio,amc23_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
2
+ eval_results_avg4-global_step_0,3.3,13.3,1018.9666666666667,0.13333333333333333,915.0,1022.551724137931,0.0,1018.9666666666667,1.0,0.9,0.6333333333333333,2.5,10.0,1953.8,0.3,0.0,1953.8,0.03333333333333333,1469.448275862069,0.9666666666666667,0.9,0.7333333333333333,22.5,52.5,1373.575,0.275,2476.1,1006.0666666666667,0.025,968.3846153846154,0.975,0.85,0.6,9.433333333333334,25.266666666666666,1448.7805555555553,0.23611111111111113,1130.3666666666666,1327.4727969348658,0.019444444444444445,1152.2665193044504,0.9805555555555556,0.8833333333333333,0.6555555555555556
3
+ eval_results_avg4-global_step_10,8.3,20.0,2643.266666666667,2.966666666666667,728.0,2856.074074074074,0.1,1160.5555555555557,0.9,0.8333333333333334,0.7666666666666667,4.2,13.3,2264.0333333333333,0.3,1845.0,2278.4827586206898,0.06666666666666667,1283.0714285714287,0.9333333333333333,0.8666666666666667,0.6333333333333333,30.0,52.5,1322.3,3.05,738.9166666666666,1572.3214285714287,0.025,943.5641025641025,0.975,0.975,0.775,14.166666666666666,28.599999999999998,2076.5333333333333,2.1055555555555556,1103.9722222222222,2235.6260870887304,0.0638888888888889,1129.0636955636955,0.9361111111111112,0.8916666666666667,0.725
4
+ eval_results_avg4-global_step_20,4.2,10.0,3127.3333333333335,1.2,0.0,3127.3333333333335,0.1,1697.148148148148,0.9,0.8,0.8,1.7,6.7,1809.2333333333333,0.2,0.0,1809.2333333333333,0.06666666666666667,795.6071428571429,0.9333333333333333,0.9333333333333333,0.7,32.5,57.5,1592.275,0.525,2081.0,1429.3666666666666,0.05,834.0263157894736,0.95,0.95,0.675,12.799999999999999,24.733333333333334,2176.2805555555556,0.6416666666666666,693.6666666666666,2121.9777777777776,0.07222222222222223,1108.9272022649216,0.9277777777777777,0.8944444444444445,0.725
5
+ eval_results_avg4-global_step_30,9.2,20.0,3154.3,0.9666666666666667,697.0,3329.8214285714284,0.13333333333333333,1178.1153846153845,0.8666666666666667,0.8,0.8,3.3,10.0,2426.0666666666666,0.3,1031.5,2525.6785714285716,0.1,918.0370370370371,0.9,0.9,0.7,37.5,60.0,2099.325,0.3,1655.1764705882354,2427.608695652174,0.05,1367.7894736842106,0.95,0.9,0.675,16.666666666666668,30.0,2559.8972222222224,0.5222222222222223,1127.892156862745,2761.036231884058,0.09444444444444444,1154.647298445544,0.9055555555555556,0.8666666666666667,0.725
6
+ eval_results_avg4-global_step_40,5.0,13.3,2390.133333333333,0.23333333333333334,738.5,2508.1071428571427,0.1,874.7777777777778,0.9,0.9,0.7666666666666667,3.3,13.3,1361.8333333333333,0.36666666666666664,690.0,1385.0,0.03333333333333333,857.0689655172414,0.9666666666666667,0.9666666666666667,0.8333333333333334,32.5,47.5,1607.7,0.25,720.1,1903.5666666666666,0.05,874.3947368421053,0.95,0.95,0.675,13.6,24.7,1786.5555555555554,0.2833333333333333,716.1999999999999,1932.2246031746029,0.061111111111111116,868.7471600457083,0.9388888888888888,0.9388888888888888,0.7583333333333334
7
+ eval_results_avg4-global_step_50,8.3,13.3,2279.633333333333,5.9,742.5,2389.4285714285716,0.1,761.3703703703703,0.9,0.9,0.6666666666666666,4.2,6.7,1324.0333333333333,0.5333333333333333,1383.0,1322.0,0.03333333333333333,817.9310344827586,0.9666666666666667,0.9666666666666667,0.5333333333333333,34.4,57.5,2250.7,0.225,643.1818181818181,2860.448275862069,0.075,1133.7027027027027,0.925,0.9,0.7,15.633333333333333,25.833333333333332,1951.4555555555555,2.2194444444444446,922.8939393939394,2190.6256157635466,0.06944444444444443,904.3347025186107,0.9305555555555557,0.9222222222222222,0.6333333333333333
8
+ eval_results_avg4-global_step_60,7.5,16.7,2607.233333333333,0.06666666666666667,640.3333333333334,2825.777777777778,0.1,1119.4074074074074,0.9,0.8333333333333334,0.7,3.3,6.7,2862.133333333333,0.36666666666666664,967.0,2997.5,0.13333333333333333,841.0384615384615,0.8666666666666667,0.8666666666666667,0.7,36.2,55.0,763.875,0.175,687.2,809.88,0.0,763.875,1.0,1.0,0.65,15.666666666666666,26.133333333333336,2077.7472222222223,0.20277777777777775,764.8444444444445,2211.0525925925926,0.07777777777777778,908.1069563152896,0.9222222222222222,0.9,0.6833333333333332
9
+ eval_results_avg4-global_step_70,10.0,13.3,2860.3,0.2,633.5,3019.3571428571427,0.1,1400.5185185185185,0.9,0.8666666666666667,0.8333333333333334,6.7,20.0,1896.0666666666666,7.866666666666666,864.5,2054.769230769231,0.06666666666666667,888.6071428571429,0.9333333333333333,0.9333333333333333,0.8333333333333334,33.8,50.0,899.225,0.175,651.1428571428571,1032.8076923076924,0.0,899.225,1.0,0.975,0.75,16.833333333333332,27.766666666666666,1885.1972222222223,2.7472222222222222,716.3809523809523,2035.6446886446886,0.05555555555555556,1062.783553791887,0.9444444444444445,0.9249999999999999,0.8055555555555557
10
+ eval_results_avg4-global_step_80,7.5,10.0,1627.2333333333333,0.43333333333333335,779.3333333333334,1721.4444444444443,0.03333333333333333,1131.655172413793,0.9666666666666667,0.9333333333333333,0.8333333333333334,4.2,10.0,1895.1,0.4,997.5,1959.2142857142858,0.06666666666666667,887.6428571428571,0.9333333333333333,0.9333333333333333,0.7,36.2,52.5,1330.375,0.3,692.25,1755.7916666666667,0.025,954.2564102564103,0.975,0.95,0.725,15.966666666666669,24.166666666666668,1617.5694444444443,0.37777777777777777,823.0277777777778,1812.1501322751321,0.041666666666666664,991.1848132710202,0.9583333333333334,0.9388888888888888,0.7527777777777778
11
+ eval_results_avg4-global_step_90,5.0,13.3,3021.133333333333,0.26666666666666666,1048.75,3324.576923076923,0.1,1579.4444444444443,0.9,0.8333333333333334,0.8666666666666667,4.2,6.7,1086.6333333333334,0.6666666666666666,620.0,1102.7241379310344,0.0,1086.6333333333334,1.0,0.9666666666666667,0.7,39.4,60.0,912.675,0.2,678.6666666666666,1053.08,0.0,912.675,1.0,1.0,0.75,16.2,26.666666666666668,1673.4805555555556,0.37777777777777777,782.4722222222222,1826.7936870026524,0.03333333333333333,1192.9175925925927,0.9666666666666667,0.9333333333333332,0.7722222222222221
12
+ eval_results_avg4-global_step_100,9.2,16.7,2744.766666666667,0.23333333333333334,1203.0,2797.9310344827586,0.1,1271.888888888889,0.9,0.9,0.7333333333333333,5.0,6.7,1921.2666666666667,0.43333333333333335,1063.0,1950.8620689655172,0.06666666666666667,915.6071428571429,0.9333333333333333,0.9333333333333333,0.7,39.4,57.5,1210.475,0.225,691.0,1556.7916666666667,0.025,831.2820512820513,0.975,0.975,0.725,17.866666666666664,26.96666666666667,1958.836111111111,0.2972222222222222,985.6666666666666,2101.8615900383143,0.0638888888888889,1006.259361009361,0.9361111111111112,0.9361111111111112,0.7194444444444444
eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 3.3,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 3.3,
10
+ "2": 6.7,
11
+ "4": 13.3
12
+ },
13
+ "time_use_in_second": 171.29286766052246,
14
+ "time_use_in_minite": "2:51"
15
+ }
eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 2.5,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 2.5,
10
+ "2": 5.0,
11
+ "4": 10.0
12
+ },
13
+ "time_use_in_second": 177.67143607139587,
14
+ "time_use_in_minite": "2:57"
15
+ }
eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 2,
6
+ "acc": 22.5,
7
+ "pass_acc": 52.5,
8
+ "pass@k": {
9
+ "1": 22.5,
10
+ "2": 34.6,
11
+ "4": 52.5
12
+ },
13
+ "time_use_in_second": 174.92642736434937,
14
+ "time_use_in_minite": "2:54"
15
+ }
eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 8.3,
7
+ "pass_acc": 20.0,
8
+ "pass@k": {
9
+ "1": 8.3,
10
+ "2": 12.8,
11
+ "4": 20.0
12
+ },
13
+ "time_use_in_second": 225.51578545570374,
14
+ "time_use_in_minite": "3:45"
15
+ }
eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 4.2,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 4.2,
10
+ "2": 7.8,
11
+ "4": 13.3
12
+ },
13
+ "time_use_in_second": 197.61598801612854,
14
+ "time_use_in_minite": "3:17"
15
+ }
eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 30.0,
7
+ "pass_acc": 52.5,
8
+ "pass@k": {
9
+ "1": 30.0,
10
+ "2": 40.0,
11
+ "4": 52.5
12
+ },
13
+ "time_use_in_second": 197.73797011375427,
14
+ "time_use_in_minite": "3:17"
15
+ }
eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 9.2,
7
+ "pass_acc": 16.7,
8
+ "pass@k": {
9
+ "1": 9.2,
10
+ "2": 12.8,
11
+ "4": 16.7
12
+ },
13
+ "time_use_in_second": 192.01610326766968,
14
+ "time_use_in_minite": "3:12"
15
+ }
eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 5.0,
7
+ "pass_acc": 6.7,
8
+ "pass@k": {
9
+ "1": 5.0,
10
+ "2": 6.1,
11
+ "4": 6.7
12
+ },
13
+ "time_use_in_second": 188.81494522094727,
14
+ "time_use_in_minite": "3:08"
15
+ }
eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 39.4,
7
+ "pass_acc": 57.5,
8
+ "pass@k": {
9
+ "1": 39.4,
10
+ "2": 50.4,
11
+ "4": 57.5
12
+ },
13
+ "time_use_in_second": 175.83723950386047,
14
+ "time_use_in_minite": "2:55"
15
+ }
eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 4.2,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 4.2,
10
+ "2": 6.7,
11
+ "4": 10.0
12
+ },
13
+ "time_use_in_second": 197.9232029914856,
14
+ "time_use_in_minite": "3:17"
15
+ }
eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff