bensondccnqwc commited on
Commit
d8daeaa
·
verified ·
1 Parent(s): 0d07736

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  2. eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  3. eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  4. eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  5. eval_results_avg32/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  6. eval_results_avg32/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  7. eval_results_avg32/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  8. eval_results_avg32/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  9. eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  10. eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  11. eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  12. eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  13. eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  14. eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  15. eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  16. eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  17. eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  18. eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  19. eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  20. eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  21. eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  22. eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  23. eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  24. eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  25. eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  26. eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  27. eval_results_avg4/eval_results.csv +12 -0
  28. eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  29. eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  30. eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  31. eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  32. eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  33. eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  34. eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  35. eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  36. eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  37. eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  38. eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  39. eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  40. eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  41. eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  42. eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  43. eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  44. eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  45. eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  46. eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  47. eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  48. eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  49. eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +15 -0
  50. eval_results_avg4/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 4,
5
+ "empty_samples": 0,
6
+ "acc": 4.6,
7
+ "pass_acc": 33.3,
8
+ "pass@k": {
9
+ "1": 4.6,
10
+ "2": 7.7,
11
+ "4": 12.0,
12
+ "8": 18.2,
13
+ "16": 25.9,
14
+ "32": 33.3
15
+ },
16
+ "time_use_in_second": 515.7602069377899,
17
+ "time_use_in_minite": "8:35"
18
+ }
eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 6.7,
7
+ "pass_acc": 30.0,
8
+ "pass@k": {
9
+ "1": 6.7,
10
+ "2": 9.8,
11
+ "4": 14.1,
12
+ "8": 19.9,
13
+ "16": 26.2,
14
+ "32": 30.0
15
+ },
16
+ "time_use_in_second": 671.4177906513214,
17
+ "time_use_in_minite": "11:11"
18
+ }
eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 4,
5
+ "empty_samples": 0,
6
+ "acc": 3.9,
7
+ "pass_acc": 26.7,
8
+ "pass@k": {
9
+ "1": 3.9,
10
+ "2": 6.4,
11
+ "4": 9.7,
12
+ "8": 14.1,
13
+ "16": 19.9,
14
+ "32": 26.7
15
+ },
16
+ "time_use_in_second": 513.6733276844025,
17
+ "time_use_in_minite": "8:33"
18
+ }
eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 6.7,
7
+ "pass_acc": 36.7,
8
+ "pass@k": {
9
+ "1": 6.7,
10
+ "2": 10.3,
11
+ "4": 14.7,
12
+ "8": 20.5,
13
+ "16": 28.0,
14
+ "32": 36.7
15
+ },
16
+ "time_use_in_second": 720.7702496051788,
17
+ "time_use_in_minite": "12:00"
18
+ }
eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 4.9,
7
+ "pass_acc": 30.0,
8
+ "pass@k": {
9
+ "1": 4.9,
10
+ "2": 8.1,
11
+ "4": 12.3,
12
+ "8": 17.5,
13
+ "16": 24.1,
14
+ "32": 30.0
15
+ },
16
+ "time_use_in_second": 463.51481533050537,
17
+ "time_use_in_minite": "7:43"
18
+ }
eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 6.2,
7
+ "pass_acc": 26.7,
8
+ "pass@k": {
9
+ "1": 6.2,
10
+ "2": 9.7,
11
+ "4": 13.8,
12
+ "8": 18.6,
13
+ "16": 23.5,
14
+ "32": 26.7
15
+ },
16
+ "time_use_in_second": 645.1512818336487,
17
+ "time_use_in_minite": "10:45"
18
+ }
eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 4.6,
7
+ "pass_acc": 30.0,
8
+ "pass@k": {
9
+ "1": 4.6,
10
+ "2": 6.9,
11
+ "4": 9.8,
12
+ "8": 13.9,
13
+ "16": 19.8,
14
+ "32": 30.0
15
+ },
16
+ "time_use_in_second": 432.0119540691376,
17
+ "time_use_in_minite": "7:12"
18
+ }
eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 7.7,
7
+ "pass_acc": 33.3,
8
+ "pass@k": {
9
+ "1": 7.7,
10
+ "2": 11.7,
11
+ "4": 16.2,
12
+ "8": 21.2,
13
+ "16": 26.7,
14
+ "32": 33.3
15
+ },
16
+ "time_use_in_second": 590.3402507305145,
17
+ "time_use_in_minite": "9:50"
18
+ }
eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 2,
5
+ "empty_samples": 0,
6
+ "acc": 5.2,
7
+ "pass_acc": 26.7,
8
+ "pass@k": {
9
+ "1": 5.2,
10
+ "2": 8.3,
11
+ "4": 11.9,
12
+ "8": 16.1,
13
+ "16": 21.0,
14
+ "32": 26.7
15
+ },
16
+ "time_use_in_second": 473.4737560749054,
17
+ "time_use_in_minite": "7:53"
18
+ }
eval_results_avg4/eval_results.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,aime24_acc,aime24_pass_acc,aime24_tokens,aime24_keywords,aime24_correct_tokens,aime24_wrong_tokens,aime24_clip_ratio,aime24_stop_tokens,aime24_stop_ratio,aime24_box_ratio,aime24_repeat_ratio,aime25_acc,aime25_pass_acc,aime25_tokens,aime25_keywords,aime25_correct_tokens,aime25_wrong_tokens,aime25_clip_ratio,aime25_stop_tokens,aime25_stop_ratio,aime25_box_ratio,aime25_repeat_ratio,amc23_acc,amc23_pass_acc,amc23_tokens,amc23_keywords,amc23_correct_tokens,amc23_wrong_tokens,amc23_clip_ratio,amc23_stop_tokens,amc23_stop_ratio,amc23_box_ratio,amc23_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
2
+ eval_results_avg4-global_step_0,5.0,13.3,1426.8666666666666,0.3,611.0,1485.142857142857,0.03333333333333333,924.3448275862069,0.9666666666666667,0.8666666666666667,0.5333333333333333,3.3,10.0,2181.5666666666666,0.9,1284.0,2245.6785714285716,0.06666666666666667,1194.5,0.9333333333333333,0.8,0.7333333333333333,26.9,55.0,1568.425,0.225,777.0,1907.607142857143,0.05,809.0,0.95,0.875,0.725,11.733333333333334,26.099999999999998,1725.6194444444445,0.47500000000000003,890.6666666666666,1879.4761904761906,0.05000000000000001,975.948275862069,0.9499999999999998,0.8472222222222223,0.6638888888888889
3
+ eval_results_avg4-global_step_10,5.8,10.0,1305.3,0.4,786.0,1342.392857142857,0.03333333333333333,798.551724137931,0.9666666666666667,0.8666666666666667,0.8,0.8,3.3,1669.7333333333333,0.43333333333333335,0.0,1669.7333333333333,0.03333333333333333,1177.4137931034484,0.9666666666666667,0.9666666666666667,0.7,27.5,55.0,1846.1,1.4,634.0714285714286,2498.730769230769,0.05,1101.1315789473683,0.95,0.925,0.575,11.366666666666667,22.766666666666666,1607.0444444444445,0.7444444444444445,473.35714285714283,1836.95231990232,0.03888888888888889,1025.6990320629159,0.9611111111111111,0.9194444444444446,0.6916666666666668
4
+ eval_results_avg4-global_step_20,3.3,3.3,3027.3,0.5333333333333333,726.0,3106.655172413793,0.13333333333333333,1035.576923076923,0.8666666666666667,0.8666666666666667,0.8666666666666667,3.3,6.7,2293.4,0.26666666666666666,759.0,2346.310344827586,0.1,770.4814814814815,0.9,0.9,0.7333333333333333,33.1,62.5,1164.075,0.225,692.875,1478.2083333333333,0.0,1164.075,1.0,0.975,0.775,13.233333333333334,24.166666666666668,2161.5916666666667,0.34166666666666673,725.9583333333334,2310.391283524904,0.07777777777777778,990.0444681861348,0.9222222222222222,0.9138888888888889,0.7916666666666666
5
+ eval_results_avg4-global_step_30,4.2,10.0,2811.7,0.43333333333333335,0.0,2811.7,0.13333333333333333,782.7692307692307,0.8666666666666667,0.8666666666666667,0.7666666666666667,0.8,3.3,1411.9333333333334,0.36666666666666664,0.0,1411.9333333333334,0.03333333333333333,908.8965517241379,0.9666666666666667,0.9666666666666667,0.8333333333333334,30.6,57.5,1902.8,0.225,570.7272727272727,2408.0689655172414,0.075,755.7567567567568,0.925,0.925,0.7,11.866666666666667,23.599999999999998,2042.1444444444444,0.34166666666666673,190.24242424242425,2210.5674329501912,0.08055555555555555,815.807513083375,0.9194444444444446,0.9194444444444446,0.7666666666666666
6
+ eval_results_avg4-global_step_40,6.7,10.0,3329.766666666667,0.26666666666666666,699.5,3517.6428571428573,0.13333333333333333,1380.7692307692307,0.8666666666666667,0.8333333333333334,0.8,4.2,10.0,1305.2333333333333,0.5,638.0,1352.892857142857,0.03333333333333333,798.551724137931,0.9666666666666667,0.9666666666666667,0.6666666666666666,37.5,60.0,884.425,0.325,629.3571428571429,1021.7692307692307,0.0,884.425,1.0,0.975,0.575,16.133333333333333,26.666666666666668,1839.8083333333334,0.3638888888888889,655.6190476190476,1964.1016483516485,0.05555555555555555,1021.2486516357206,0.9444444444444445,0.9249999999999999,0.6805555555555557
7
+ eval_results_avg4-global_step_50,7.5,13.3,2812.766666666667,0.5,651.0,2887.310344827586,0.13333333333333333,784.1153846153846,0.8666666666666667,0.8666666666666667,0.7,4.2,10.0,2894.366666666667,0.26666666666666666,0.0,2894.366666666667,0.13333333333333333,878.3076923076923,0.8666666666666667,0.8666666666666667,0.7666666666666667,32.5,50.0,1760.95,0.15,625.7,2139.366666666667,0.05,1011.578947368421,0.95,0.925,0.775,14.733333333333334,24.433333333333334,2489.361111111111,0.3055555555555555,425.56666666666666,2640.3478927203064,0.10555555555555556,891.334008097166,0.8944444444444445,0.8861111111111111,0.7472222222222222
8
+ eval_results_avg4-global_step_60,6.7,10.0,2675.5666666666666,0.3,617.0,2822.6071428571427,0.1,1195.037037037037,0.9,0.8666666666666667,0.8666666666666667,4.2,6.7,1860.5,7.566666666666666,960.0,1891.551724137931,0.06666666666666667,850.5357142857143,0.9333333333333333,0.9333333333333333,0.7666666666666667,37.5,52.5,1172.65,0.3,729.0769230769231,1386.2222222222222,0.025,792.4102564102565,0.975,0.975,0.6,16.133333333333333,23.066666666666666,1902.9055555555558,2.722222222222222,768.6923076923076,2033.4603630724316,0.0638888888888889,945.9943359110025,0.9361111111111112,0.9249999999999999,0.7444444444444445
9
+ eval_results_avg4-global_step_70,7.5,13.3,2323.1,2.6666666666666665,663.0,2507.5555555555557,0.1,804.8518518518518,0.9,0.9,0.8666666666666667,4.2,10.0,1029.3666666666666,0.36666666666666664,956.5,1034.5714285714287,0.0,1029.3666666666666,1.0,0.9666666666666667,0.6333333333333333,38.1,57.5,2180.425,0.275,685.8461538461538,2900.037037037037,0.075,1059.918918918919,0.925,0.9,0.75,16.599999999999998,26.933333333333334,1844.2972222222222,1.1027777777777776,768.4487179487179,2147.3880070546734,0.05833333333333333,964.7124791458124,0.9416666666666668,0.9222222222222222,0.75
10
+ eval_results_avg4-global_step_80,6.7,16.7,3168.5,0.3,773.0,3251.103448275862,0.13333333333333333,1194.4615384615386,0.8666666666666667,0.8333333333333334,0.7666666666666667,3.3,3.3,967.0333333333333,0.5333333333333333,1042.0,964.448275862069,0.0,967.0333333333333,1.0,1.0,0.6333333333333333,38.1,60.0,1234.025,2.8,685.1538461538462,1498.2962962962963,0.025,855.4102564102565,0.975,0.975,0.55,16.033333333333335,26.666666666666668,1789.8527777777774,1.211111111111111,833.3846153846154,1904.6160068114093,0.05277777777777778,1005.6350427350427,0.9472222222222223,0.9361111111111112,0.65
11
+ eval_results_avg4-global_step_90,9.2,20.0,3377.366666666667,0.3,795.75,3774.5384615384614,0.16666666666666666,852.76,0.8333333333333334,0.8333333333333334,0.9,3.3,6.7,1786.9333333333334,0.5333333333333333,0.0,1786.9333333333334,0.06666666666666667,783.1785714285714,0.9333333333333333,0.9333333333333333,0.7,38.1,60.0,1326.225,0.275,689.5,1599.107142857143,0.025,949.974358974359,0.975,0.975,0.775,16.866666666666667,28.900000000000002,2163.508333333333,0.36944444444444446,495.0833333333333,2386.859645909646,0.08611111111111112,861.9709768009767,0.9138888888888889,0.9138888888888889,0.7916666666666666
12
+ eval_results_avg4-global_step_100,7.5,10.0,2870.633333333333,0.5333333333333333,606.0,3032.3928571428573,0.06666666666666667,1948.892857142857,0.9333333333333333,0.8666666666666667,0.8333333333333334,7.5,16.7,1585.2,0.36666666666666664,922.0,1632.5714285714287,0.03333333333333333,1088.1724137931035,0.9666666666666667,0.9333333333333333,0.7333333333333333,36.9,60.0,1372.1,0.225,655.2307692307693,1717.2592592592594,0.025,996.8205128205128,0.975,0.95,0.625,17.3,28.900000000000002,1942.6444444444442,0.375,727.7435897435898,2127.4078483245153,0.041666666666666664,1344.628594585491,0.9583333333333334,0.9166666666666666,0.7305555555555555
eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 5.0,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 5.0,
10
+ "2": 8.9,
11
+ "4": 13.3
12
+ },
13
+ "time_use_in_second": 185.94950437545776,
14
+ "time_use_in_minite": "3:05"
15
+ }
eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 3.3,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 3.3,
10
+ "2": 6.1,
11
+ "4": 10.0
12
+ },
13
+ "time_use_in_second": 193.96814250946045,
14
+ "time_use_in_minite": "3:13"
15
+ }
eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 4,
6
+ "acc": 26.9,
7
+ "pass_acc": 55.0,
8
+ "pass@k": {
9
+ "1": 26.9,
10
+ "2": 41.2,
11
+ "4": 55.0
12
+ },
13
+ "time_use_in_second": 202.22902059555054,
14
+ "time_use_in_minite": "3:22"
15
+ }
eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 5.8,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 5.8,
10
+ "2": 7.8,
11
+ "4": 10.0
12
+ },
13
+ "time_use_in_second": 172.10288619995117,
14
+ "time_use_in_minite": "2:52"
15
+ }
eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 0.8,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 0.8,
10
+ "2": 1.7,
11
+ "4": 3.3
12
+ },
13
+ "time_use_in_second": 188.8493616580963,
14
+ "time_use_in_minite": "3:08"
15
+ }
eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 27.5,
7
+ "pass_acc": 55.0,
8
+ "pass@k": {
9
+ "1": 27.5,
10
+ "2": 41.2,
11
+ "4": 55.0
12
+ },
13
+ "time_use_in_second": 193.1791069507599,
14
+ "time_use_in_minite": "3:13"
15
+ }
eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 7.5,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 7.5,
10
+ "2": 9.4,
11
+ "4": 10.0
12
+ },
13
+ "time_use_in_second": 186.0230197906494,
14
+ "time_use_in_minite": "3:06"
15
+ }
eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 7.5,
7
+ "pass_acc": 16.7,
8
+ "pass@k": {
9
+ "1": 7.5,
10
+ "2": 11.1,
11
+ "4": 16.7
12
+ },
13
+ "time_use_in_second": 176.03479647636414,
14
+ "time_use_in_minite": "2:56"
15
+ }
eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 160,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 36.9,
7
+ "pass_acc": 60.0,
8
+ "pass@k": {
9
+ "1": 36.9,
10
+ "2": 48.3,
11
+ "4": 60.0
12
+ },
13
+ "time_use_in_second": 170.67507100105286,
14
+ "time_use_in_minite": "2:50"
15
+ }
eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 3.3,
10
+ "2": 3.3,
11
+ "4": 3.3
12
+ },
13
+ "time_use_in_second": 190.46656322479248,
14
+ "time_use_in_minite": "3:10"
15
+ }
eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg4/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 120,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 6.7,
8
+ "pass@k": {
9
+ "1": 3.3,
10
+ "2": 5.0,
11
+ "4": 6.7
12
+ },
13
+ "time_use_in_second": 179.1738040447235,
14
+ "time_use_in_minite": "2:59"
15
+ }
eval_results_avg4/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff