johndoe123345 committed on
Commit
a7d5f62
·
verified ·
1 Parent(s): 8657be2

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. eval_results_avg32/eval_results.csv +14 -0
  3. eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  4. eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  5. eval_results_avg32/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  6. eval_results_avg32/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  7. eval_results_avg32/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  8. eval_results_avg32/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  9. eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  10. eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  11. eval_results_avg32/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  12. eval_results_avg32/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  13. eval_results_avg32/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  14. eval_results_avg32/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  15. eval_results_avg32/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  16. eval_results_avg32/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  17. eval_results_avg32/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  18. eval_results_avg32/global_step_110/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  19. eval_results_avg32/global_step_110/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  20. eval_results_avg32/global_step_110/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  21. eval_results_avg32/global_step_120/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  22. eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  23. eval_results_avg32/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  24. eval_results_avg32/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  25. eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  26. eval_results_avg32/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  27. eval_results_avg32/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  28. eval_results_avg32/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  29. eval_results_avg32/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  30. eval_results_avg32/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  31. eval_results_avg32/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  32. eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  33. eval_results_avg32/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  34. eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  35. eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  36. eval_results_avg32/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  37. eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  38. eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  39. eval_results_avg32/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  40. eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  41. eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  42. eval_results_avg32/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  43. eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  44. eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  45. eval_results_avg32/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +18 -0
  46. global_step_0/actor/huggingface/.cp_done +0 -0
  47. global_step_0/actor/huggingface/README.md +57 -0
  48. global_step_0/actor/huggingface/config.json +30 -0
  49. global_step_0/actor/huggingface/generation_config.json +7 -0
  50. global_step_0/actor/huggingface/merges.txt +0 -0
.gitattributes CHANGED
@@ -37,3 +37,4 @@ global_step_10/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -t
37
  global_step_60/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  global_step_70/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
  global_step_90/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
37
  global_step_60/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  global_step_70/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
  global_step_90/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ global_step_20/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
eval_results_avg32/eval_results.csv ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,amc23_acc,amc23_pass_acc,amc23_tokens,amc23_keywords,amc23_correct_tokens,amc23_wrong_tokens,amc23_clip_ratio,amc23_stop_tokens,amc23_stop_ratio,amc23_box_ratio,amc23_repeat_ratio,aime24_acc,aime24_pass_acc,aime24_tokens,aime24_keywords,aime24_correct_tokens,aime24_wrong_tokens,aime24_clip_ratio,aime24_stop_tokens,aime24_stop_ratio,aime24_box_ratio,aime24_repeat_ratio,aime25_acc,aime25_pass_acc,aime25_tokens,aime25_keywords,aime25_correct_tokens,aime25_wrong_tokens,aime25_clip_ratio,aime25_stop_tokens,aime25_stop_ratio,aime25_box_ratio,aime25_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
2
+ eval_results_avg32-global_step_0,45.2,92.5,2035.525,0.375,682.8823529411765,3035.304347826087,0.05,1302.2631578947369,0.95,0.95,0.7,8.6,50.0,1352.6333333333334,1.6333333333333333,1353.8,1352.4,0.0,1352.6333333333334,1.0,0.9666666666666667,0.8,9.0,43.3,3110.4333333333334,2.8666666666666667,1641.3333333333333,3273.6666666666665,0.1,1678.2592592592594,0.9,0.9,0.8,20.933333333333334,61.93333333333334,2166.1972222222225,1.625,1226.00522875817,2553.7903381642514,0.05000000000000001,1444.3852501624433,0.9500000000000001,0.9388888888888888,0.7666666666666666
3
+ eval_results_avg32-global_step_10,57.0,92.5,1043.025,0.425,1009.5,1093.3125,0.0,1043.025,1.0,1.0,0.65,15.4,46.7,3891.133333333333,20.6,1913.3333333333333,4385.583333333333,0.13333333333333333,2018.3846153846155,0.8666666666666667,0.8666666666666667,0.9333333333333333,15.9,46.7,2289.6,2.8,1297.0,2488.12,0.03333333333333333,1817.344827586207,0.9666666666666667,0.9,0.8,29.433333333333337,61.96666666666666,2407.9194444444443,7.941666666666667,1406.611111111111,2655.6719444444443,0.05555555555555555,1626.2514809902743,0.9444444444444445,0.9222222222222222,0.7944444444444446
4
+ eval_results_avg32-global_step_20,65.9,95.0,3410.85,10.175,1599.904761904762,5412.421052631579,0.0,3410.85,1.0,0.9,0.825,20.1,60.0,5245.9,9.9,2209.0,6170.173913043478,0.06666666666666667,4484.285714285715,0.9333333333333333,0.9,0.9333333333333333,19.7,56.7,5815.833333333333,25.1,1198.6,6739.28,0.16666666666666666,3771.84,0.8333333333333334,0.7666666666666667,0.9333333333333333,35.233333333333334,70.56666666666666,4824.194444444444,15.058333333333335,1669.168253968254,6107.291655225018,0.07777777777777778,3888.9919047619046,0.9222222222222222,0.8555555555555556,0.8972222222222221
5
+ eval_results_avg32-global_step_30,65.5,97.5,3705.225,6.975,1677.84,7084.2,0.05,3058.3684210526317,0.95,0.9,0.9,22.6,53.3,6512.466666666666,25.033333333333335,2363.0,7775.347826086957,0.2,4131.666666666667,0.8,0.8333333333333334,0.9333333333333333,19.1,46.7,6655.966666666666,32.266666666666666,2808.8571428571427,7826.826086956522,0.16666666666666666,4794.32,0.8333333333333334,0.7,0.9333333333333333,35.73333333333333,65.83333333333333,5624.552777777778,21.425,2283.232380952381,7562.124637681159,0.13888888888888887,3994.7850292397657,0.8611111111111112,0.8111111111111112,0.9222222222222222
6
+ eval_results_avg32-global_step_40,65.9,97.5,3205.425,6.275,2023.9583333333333,4977.625,0.025,2877.5897435897436,0.975,0.925,0.8,22.7,63.3,6863.566666666667,25.733333333333334,1904.0,7855.48,0.2,4582.291666666667,0.8,0.8,1.0,18.8,46.7,5614.433333333333,19.033333333333335,3041.3333333333335,6717.190476190476,0.06666666666666667,4874.428571428572,0.9333333333333333,0.8333333333333334,0.9666666666666667,35.800000000000004,69.16666666666667,5227.808333333333,17.01388888888889,2323.097222222222,6516.765158730159,0.09722222222222222,4111.436660561661,0.9027777777777777,0.8527777777777779,0.9222222222222222
7
+ eval_results_avg32-global_step_50,67.4,95.0,2904.2,5.85,2216.464285714286,4508.916666666667,0.0,2904.2,1.0,1.0,0.85,24.2,56.7,7269.233333333334,23.833333333333332,2324.1428571428573,8774.260869565218,0.16666666666666666,5532.16,0.8333333333333334,0.7,1.0,20.1,43.3,4984.733333333334,16.3,2275.1666666666665,5662.125,0.1,3759.259259259259,0.9,0.8,0.9666666666666667,37.23333333333334,65.0,5052.722222222223,15.327777777777778,2271.924603174603,6315.100845410628,0.08888888888888889,4065.2064197530867,0.9111111111111111,0.8333333333333334,0.938888888888889
8
+ eval_results_avg32-global_step_60,68.7,100.0,3392.675,11.4,1836.1612903225807,8754.0,0.0,3392.675,1.0,0.925,0.8,25.3,66.7,5566.133333333333,10.666666666666666,2268.3333333333335,6979.476190476191,0.06666666666666667,4820.928571428572,0.9333333333333333,0.7666666666666667,0.9666666666666667,18.6,46.7,5406.566666666667,19.8,1464.5,6392.083333333333,0.03333333333333333,5042.206896551724,0.9666666666666667,0.8333333333333334,1.0,37.53333333333333,71.13333333333333,4788.458333333333,13.955555555555556,1856.3315412186382,7375.186507936508,0.03333333333333333,4418.603489326765,0.9666666666666667,0.8416666666666668,0.9222222222222222
9
+ eval_results_avg32-global_step_70,66.5,97.5,3594.4,8.125,1865.111111111111,7186.0,0.075,2587.0,0.925,0.9,0.85,23.8,66.7,6385.6,21.966666666666665,2025.2,7257.68,0.13333333333333333,4904.846153846154,0.8666666666666667,0.7333333333333333,0.9666666666666667,19.8,53.3,3941.3333333333335,9.966666666666667,1601.0,4409.4,0.06666666666666667,3080.8571428571427,0.9333333333333333,0.9333333333333333,0.9333333333333333,36.699999999999996,72.5,4640.444444444444,13.352777777777776,1830.437037037037,6284.360000000001,0.09166666666666666,3524.2344322344325,0.9083333333333333,0.8555555555555555,0.9166666666666666
10
+ eval_results_avg32-global_step_80,69.2,95.0,3304.975,6.125,1791.52,5827.4,0.025,2979.3076923076924,0.975,0.925,0.875,23.9,63.3,6490.1,15.733333333333333,1745.6,7439.0,0.1,5433.407407407408,0.9,0.8333333333333334,1.0,20.1,40.0,4986.133333333333,37.266666666666666,3472.5,5219.0,0.13333333333333333,3279.230769230769,0.8666666666666667,0.8666666666666667,1.0,37.73333333333333,66.10000000000001,4927.069444444444,19.708333333333332,2336.54,6161.8,0.0861111111111111,3897.315289648623,0.9138888888888889,0.875,0.9583333333333334
11
+ eval_results_avg32-global_step_90,69.5,92.5,3154.3,5.825,1839.344827586207,6621.0,0.05,2478.7894736842104,0.95,0.925,0.85,25.0,60.0,6710.733333333334,10.233333333333333,2679.5555555555557,8438.380952380952,0.1,5690.62962962963,0.9,0.7333333333333333,1.0,21.2,46.7,4466.533333333334,11.3,1602.0,5182.666666666667,0.03333333333333333,4069.7586206896553,0.9666666666666667,0.8666666666666667,0.9666666666666667,38.56666666666667,66.39999999999999,4777.188888888889,9.119444444444445,2040.300127713921,6747.349206349206,0.061111111111111116,4079.725908001165,0.938888888888889,0.8416666666666667,0.938888888888889
12
+ eval_results_avg32-global_step_100,71.7,95.0,3401.55,7.8,1984.7,7652.1,0.025,3078.4615384615386,0.975,0.925,0.825,24.5,63.3,5761.1,33.166666666666664,1925.1666666666667,6720.083333333333,0.13333333333333333,4175.076923076923,0.8666666666666667,0.8,0.9666666666666667,21.4,56.7,5179.433333333333,14.2,1998.2,5815.68,0.1,3982.4074074074074,0.9,0.8666666666666667,0.9666666666666667,39.199999999999996,71.66666666666667,4780.694444444445,18.388888888888886,1969.3555555555556,6729.2877777777785,0.0861111111111111,3745.3152896486226,0.9138888888888889,0.8638888888888889,0.9194444444444444
13
+ eval_results_avg32-global_step_110,70.3,97.5,2541.175,5.325,1793.8214285714287,4285.0,0.0,2541.175,1.0,1.0,0.9,25.3,56.7,6663.333333333333,20.4,2143.3333333333335,7793.333333333333,0.2,4324.708333333333,0.8,0.8333333333333334,1.0,20.9,46.7,5134.666666666667,16.233333333333334,2306.0,5995.565217391304,0.06666666666666667,4356.071428571428,0.9333333333333333,0.9,0.9666666666666667,38.833333333333336,66.96666666666665,4779.724999999999,13.986111111111109,2081.0515873015875,6024.632850241545,0.08888888888888889,3740.6515873015874,0.9111111111111111,0.9111111111111111,0.9555555555555556
14
+ eval_results_avg32-global_step_120,,,,,,,,,,,,24.7,60.0,4787.933333333333,18.533333333333335,2546.25,5603.090909090909,0.1,3534.8518518518517,0.9,0.9,1.0,,,,,,,,,,,,24.7,60.0,4787.933333333333,18.533333333333335,2546.25,5603.090909090909,0.1,3534.8518518518517,0.9,0.9,1.0
eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 8.6,
7
+ "pass_acc": 50.0,
8
+ "pass@k": {
9
+ "1": 8.6,
10
+ "2": 14.5,
11
+ "4": 21.9,
12
+ "8": 30.2,
13
+ "16": 40.2,
14
+ "32": 50.0
15
+ },
16
+ "time_use_in_second": 1784.518862247467,
17
+ "time_use_in_minite": "29:44"
18
+ }
eval_results_avg32/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 9.0,
7
+ "pass_acc": 43.3,
8
+ "pass@k": {
9
+ "1": 9.0,
10
+ "2": 14.7,
11
+ "4": 21.4,
12
+ "8": 28.4,
13
+ "16": 35.9,
14
+ "32": 43.3
15
+ },
16
+ "time_use_in_second": 1362.0080647468567,
17
+ "time_use_in_minite": "22:42"
18
+ }
eval_results_avg32/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 1280,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 45.2,
7
+ "pass_acc": 92.5,
8
+ "pass@k": {
9
+ "1": 45.2,
10
+ "2": 61.1,
11
+ "4": 73.2,
12
+ "8": 81.2,
13
+ "16": 87.4,
14
+ "32": 92.5
15
+ },
16
+ "time_use_in_second": 895.1434328556061,
17
+ "time_use_in_minite": "14:55"
18
+ }
eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 15.4,
7
+ "pass_acc": 46.7,
8
+ "pass@k": {
9
+ "1": 15.4,
10
+ "2": 20.7,
11
+ "4": 26.1,
12
+ "8": 32.6,
13
+ "16": 40.1,
14
+ "32": 46.7
15
+ },
16
+ "time_use_in_second": 2649.3592751026154,
17
+ "time_use_in_minite": "44:09"
18
+ }
eval_results_avg32/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 2,
5
+ "empty_samples": 0,
6
+ "acc": 15.9,
7
+ "pass_acc": 46.7,
8
+ "pass@k": {
9
+ "1": 15.9,
10
+ "2": 22.3,
11
+ "4": 28.5,
12
+ "8": 35.0,
13
+ "16": 41.5,
14
+ "32": 46.7
15
+ },
16
+ "time_use_in_second": 1616.1507635116577,
17
+ "time_use_in_minite": "26:56"
18
+ }
eval_results_avg32/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 1280,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 57.0,
7
+ "pass_acc": 92.5,
8
+ "pass@k": {
9
+ "1": 57.0,
10
+ "2": 69.0,
11
+ "4": 78.5,
12
+ "8": 85.0,
13
+ "16": 89.6,
14
+ "32": 92.5
15
+ },
16
+ "time_use_in_second": 1042.0152261257172,
17
+ "time_use_in_minite": "17:22"
18
+ }
eval_results_avg32/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 24.5,
7
+ "pass_acc": 63.3,
8
+ "pass@k": {
9
+ "1": 24.5,
10
+ "2": 31.4,
11
+ "4": 38.9,
12
+ "8": 47.7,
13
+ "16": 57.0,
14
+ "32": 63.3
15
+ },
16
+ "time_use_in_second": 4571.880812883377,
17
+ "time_use_in_minite": "76:11"
18
+ }
eval_results_avg32/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 21.4,
7
+ "pass_acc": 56.7,
8
+ "pass@k": {
9
+ "1": 21.4,
10
+ "2": 27.5,
11
+ "4": 34.1,
12
+ "8": 40.6,
13
+ "16": 47.3,
14
+ "32": 56.7
15
+ },
16
+ "time_use_in_second": 2078.3840227127075,
17
+ "time_use_in_minite": "34:38"
18
+ }
eval_results_avg32/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 1280,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 71.7,
7
+ "pass_acc": 95.0,
8
+ "pass@k": {
9
+ "1": 71.7,
10
+ "2": 81.6,
11
+ "4": 88.0,
12
+ "8": 91.2,
13
+ "16": 93.1,
14
+ "32": 95.0
15
+ },
16
+ "time_use_in_second": 1446.8758385181427,
17
+ "time_use_in_minite": "24:06"
18
+ }
eval_results_avg32/global_step_110/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 25.3,
7
+ "pass_acc": 56.7,
8
+ "pass@k": {
9
+ "1": 25.3,
10
+ "2": 31.5,
11
+ "4": 37.7,
12
+ "8": 43.7,
13
+ "16": 49.4,
14
+ "32": 56.7
15
+ },
16
+ "time_use_in_second": 4158.005015134811,
17
+ "time_use_in_minite": "69:18"
18
+ }
eval_results_avg32/global_step_110/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 20.9,
7
+ "pass_acc": 46.7,
8
+ "pass@k": {
9
+ "1": 20.9,
10
+ "2": 25.8,
11
+ "4": 31.0,
12
+ "8": 36.5,
13
+ "16": 41.6,
14
+ "32": 46.7
15
+ },
16
+ "time_use_in_second": 1925.1086626052856,
17
+ "time_use_in_minite": "32:05"
18
+ }
eval_results_avg32/global_step_110/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 1280,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 70.3,
7
+ "pass_acc": 97.5,
8
+ "pass@k": {
9
+ "1": 70.3,
10
+ "2": 81.0,
11
+ "4": 88.5,
12
+ "8": 93.2,
13
+ "16": 95.8,
14
+ "32": 97.5
15
+ },
16
+ "time_use_in_second": 1532.4638538360596,
17
+ "time_use_in_minite": "25:32"
18
+ }
eval_results_avg32/global_step_120/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 24.7,
7
+ "pass_acc": 60.0,
8
+ "pass@k": {
9
+ "1": 24.7,
10
+ "2": 31.2,
11
+ "4": 38.2,
12
+ "8": 45.7,
13
+ "16": 53.5,
14
+ "32": 60.0
15
+ },
16
+ "time_use_in_second": 3940.4616816043854,
17
+ "time_use_in_minite": "65:40"
18
+ }
eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 20.1,
7
+ "pass_acc": 60.0,
8
+ "pass@k": {
9
+ "1": 20.1,
10
+ "2": 26.7,
11
+ "4": 33.5,
12
+ "8": 40.9,
13
+ "16": 49.5,
14
+ "32": 60.0
15
+ },
16
+ "time_use_in_second": 5466.352454662323,
17
+ "time_use_in_minite": "91:06"
18
+ }
eval_results_avg32/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 19.7,
7
+ "pass_acc": 56.7,
8
+ "pass@k": {
9
+ "1": 19.7,
10
+ "2": 26.4,
11
+ "4": 33.6,
12
+ "8": 41.3,
13
+ "16": 49.5,
14
+ "32": 56.7
15
+ },
16
+ "time_use_in_second": 4279.769654989243,
17
+ "time_use_in_minite": "71:19"
18
+ }
eval_results_avg32/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 1280,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 65.9,
7
+ "pass_acc": 95.0,
8
+ "pass@k": {
9
+ "1": 65.9,
10
+ "2": 78.0,
11
+ "4": 87.1,
12
+ "8": 92.1,
13
+ "16": 94.4,
14
+ "32": 95.0
15
+ },
16
+ "time_use_in_second": 2676.16796708107,
17
+ "time_use_in_minite": "44:36"
18
+ }
eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 22.6,
7
+ "pass_acc": 53.3,
8
+ "pass@k": {
9
+ "1": 22.6,
10
+ "2": 29.2,
11
+ "4": 35.7,
12
+ "8": 42.0,
13
+ "16": 47.9,
14
+ "32": 53.3
15
+ },
16
+ "time_use_in_second": 5582.685553073883,
17
+ "time_use_in_minite": "93:02"
18
+ }
eval_results_avg32/global_step_30/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 19.1,
7
+ "pass_acc": 46.7,
8
+ "pass@k": {
9
+ "1": 19.1,
10
+ "2": 24.8,
11
+ "4": 31.0,
12
+ "8": 37.5,
13
+ "16": 43.0,
14
+ "32": 46.7
15
+ },
16
+ "time_use_in_second": 4776.448954820633,
17
+ "time_use_in_minite": "79:36"
18
+ }
eval_results_avg32/global_step_30/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 1280,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 65.5,
7
+ "pass_acc": 97.5,
8
+ "pass@k": {
9
+ "1": 65.5,
10
+ "2": 78.1,
11
+ "4": 87.2,
12
+ "8": 92.3,
13
+ "16": 94.9,
14
+ "32": 97.5
15
+ },
16
+ "time_use_in_second": 3280.3285813331604,
17
+ "time_use_in_minite": "54:40"
18
+ }
eval_results_avg32/global_step_40/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 22.7,
7
+ "pass_acc": 63.3,
8
+ "pass@k": {
9
+ "1": 22.7,
10
+ "2": 29.8,
11
+ "4": 37.9,
12
+ "8": 46.6,
13
+ "16": 55.8,
14
+ "32": 63.3
15
+ },
16
+ "time_use_in_second": 5851.321416378021,
17
+ "time_use_in_minite": "97:31"
18
+ }
eval_results_avg32/global_step_40/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 18.8,
7
+ "pass_acc": 46.7,
8
+ "pass@k": {
9
+ "1": 18.8,
10
+ "2": 24.8,
11
+ "4": 30.7,
12
+ "8": 36.4,
13
+ "16": 42.0,
14
+ "32": 46.7
15
+ },
16
+ "time_use_in_second": 4337.850275754929,
17
+ "time_use_in_minite": "72:17"
18
+ }
eval_results_avg32/global_step_40/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 1280,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 65.9,
7
+ "pass_acc": 97.5,
8
+ "pass@k": {
9
+ "1": 65.9,
10
+ "2": 77.9,
11
+ "4": 86.9,
12
+ "8": 93.1,
13
+ "16": 96.7,
14
+ "32": 97.5
15
+ },
16
+ "time_use_in_second": 3077.686858654022,
17
+ "time_use_in_minite": "51:17"
18
+ }
eval_results_avg32/global_step_50/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 24.2,
7
+ "pass_acc": 56.7,
8
+ "pass@k": {
9
+ "1": 24.2,
10
+ "2": 31.4,
11
+ "4": 38.6,
12
+ "8": 45.5,
13
+ "16": 51.8,
14
+ "32": 56.7
15
+ },
16
+ "time_use_in_second": 6076.494002819061,
17
+ "time_use_in_minite": "101:16"
18
+ }
eval_results_avg32/global_step_50/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 20.1,
7
+ "pass_acc": 43.3,
8
+ "pass@k": {
9
+ "1": 20.1,
10
+ "2": 25.1,
11
+ "4": 30.6,
12
+ "8": 36.0,
13
+ "16": 40.3,
14
+ "32": 43.3
15
+ },
16
+ "time_use_in_second": 4568.545344591141,
17
+ "time_use_in_minite": "76:08"
18
+ }
eval_results_avg32/global_step_50/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 1280,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 67.4,
7
+ "pass_acc": 95.0,
8
+ "pass@k": {
9
+ "1": 67.4,
10
+ "2": 79.0,
11
+ "4": 86.9,
12
+ "8": 91.3,
13
+ "16": 93.4,
14
+ "32": 95.0
15
+ },
16
+ "time_use_in_second": 3109.3695197105408,
17
+ "time_use_in_minite": "51:49"
18
+ }
eval_results_avg32/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 25.3,
7
+ "pass_acc": 66.7,
8
+ "pass@k": {
9
+ "1": 25.3,
10
+ "2": 32.5,
11
+ "4": 40.2,
12
+ "8": 48.8,
13
+ "16": 58.0,
14
+ "32": 66.7
15
+ },
16
+ "time_use_in_second": 5106.328058481216,
17
+ "time_use_in_minite": "85:06"
18
+ }
eval_results_avg32/global_step_60/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 10,
5
+ "empty_samples": 0,
6
+ "acc": 18.6,
7
+ "pass_acc": 46.7,
8
+ "pass@k": {
9
+ "1": 18.6,
10
+ "2": 23.7,
11
+ "4": 29.3,
12
+ "8": 35.2,
13
+ "16": 41.0,
14
+ "32": 46.7
15
+ },
16
+ "time_use_in_second": 4359.83723783493,
17
+ "time_use_in_minite": "72:39"
18
+ }
eval_results_avg32/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 1280,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 68.7,
7
+ "pass_acc": 100.0,
8
+ "pass@k": {
9
+ "1": 68.7,
10
+ "2": 81.0,
11
+ "4": 89.2,
12
+ "8": 94.1,
13
+ "16": 97.2,
14
+ "32": 100.0
15
+ },
16
+ "time_use_in_second": 2730.307808637619,
17
+ "time_use_in_minite": "45:30"
18
+ }
eval_results_avg32/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 23.8,
7
+ "pass_acc": 66.7,
8
+ "pass@k": {
9
+ "1": 23.7,
10
+ "2": 30.2,
11
+ "4": 37.5,
12
+ "8": 46.2,
13
+ "16": 56.3,
14
+ "32": 66.7
15
+ },
16
+ "time_use_in_second": 4905.1872408390045,
17
+ "time_use_in_minite": "81:45"
18
+ }
eval_results_avg32/global_step_70/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 19.8,
7
+ "pass_acc": 53.3,
8
+ "pass@k": {
9
+ "1": 19.8,
10
+ "2": 25.2,
11
+ "4": 30.8,
12
+ "8": 37.1,
13
+ "16": 44.2,
14
+ "32": 53.3
15
+ },
16
+ "time_use_in_second": 3888.2481014728546,
17
+ "time_use_in_minite": "64:48"
18
+ }
eval_results_avg32/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 1280,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 66.5,
7
+ "pass_acc": 97.5,
8
+ "pass@k": {
9
+ "1": 66.5,
10
+ "2": 77.9,
11
+ "4": 86.1,
12
+ "8": 91.4,
13
+ "16": 95.0,
14
+ "32": 97.5
15
+ },
16
+ "time_use_in_second": 2869.08872961998,
17
+ "time_use_in_minite": "47:49"
18
+ }
eval_results_avg32/global_step_80/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 23.9,
7
+ "pass_acc": 63.3,
8
+ "pass@k": {
9
+ "1": 23.9,
10
+ "2": 30.4,
11
+ "4": 37.6,
12
+ "8": 45.9,
13
+ "16": 54.9,
14
+ "32": 63.3
15
+ },
16
+ "time_use_in_second": 4782.373523235321,
17
+ "time_use_in_minite": "79:42"
18
+ }
eval_results_avg32/global_step_80/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 20.1,
7
+ "pass_acc": 40.0,
8
+ "pass@k": {
9
+ "1": 20.1,
10
+ "2": 25.5,
11
+ "4": 30.7,
12
+ "8": 35.4,
13
+ "16": 38.7,
14
+ "32": 40.0
15
+ },
16
+ "time_use_in_second": 3829.8007233142853,
17
+ "time_use_in_minite": "63:49"
18
+ }
eval_results_avg32/global_step_80/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 1280,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 69.2,
7
+ "pass_acc": 95.0,
8
+ "pass@k": {
9
+ "1": 69.2,
10
+ "2": 80.1,
11
+ "4": 87.7,
12
+ "8": 91.9,
13
+ "16": 94.2,
14
+ "32": 95.0
15
+ },
16
+ "time_use_in_second": 2605.6274497509003,
17
+ "time_use_in_minite": "43:25"
18
+ }
eval_results_avg32/global_step_90/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 25.0,
7
+ "pass_acc": 60.0,
8
+ "pass@k": {
9
+ "1": 25.0,
10
+ "2": 31.3,
11
+ "4": 37.5,
12
+ "8": 44.1,
13
+ "16": 51.7,
14
+ "32": 60.0
15
+ },
16
+ "time_use_in_second": 4906.192798376083,
17
+ "time_use_in_minite": "81:46"
18
+ }
eval_results_avg32/global_step_90/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 960,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 21.2,
7
+ "pass_acc": 46.7,
8
+ "pass@k": {
9
+ "1": 21.2,
10
+ "2": 26.6,
11
+ "4": 32.1,
12
+ "8": 37.3,
13
+ "16": 42.1,
14
+ "32": 46.7
15
+ },
16
+ "time_use_in_second": 3431.4238605499268,
17
+ "time_use_in_minite": "57:11"
18
+ }
eval_results_avg32/global_step_90/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 1280,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 69.5,
7
+ "pass_acc": 92.5,
8
+ "pass@k": {
9
+ "1": 69.5,
10
+ "2": 80.1,
11
+ "4": 87.4,
12
+ "8": 91.1,
13
+ "16": 92.3,
14
+ "32": 92.5
15
+ },
16
+ "time_use_in_second": 2578.663235425949,
17
+ "time_use_in_minite": "42:58"
18
+ }
global_step_0/actor/huggingface/.cp_done ADDED
File without changes
global_step_0/actor/huggingface/README.md ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: transformers
4
+ ---
5
+ # Qwen3-8B-Base
6
+
7
+ ## Qwen3 Highlights
8
+
9
+ Qwen3 is the latest generation of large language models in the Qwen series, offering a comprehensive suite of dense and mixture-of-experts (MoE) models.
10
+ Building upon extensive advancements in training data, model architecture, and optimization techniques, Qwen3 delivers the following key improvements over the previously released Qwen2.5:
11
+
12
+ - **Expanded Higher-Quality Pre-training Corpus:** Qwen3 is pre-trained on 36 trillion tokens across 119 languages — tripling the language coverage of Qwen2.5 — with a much richer mix of high-quality data, including coding, STEM, reasoning, book, multilingual, and synthetic data.
13
+ - **Training Techniques and Model Architecture:** Qwen3 incorporates a series of training techniques and architectural refinements, including global-batch load balancing loss for MoE models and qk layernorm for all models, leading to improved stability and overall performance.
14
+ - **Three-stage Pre-training:** Stage 1 focuses on broad language modeling and general knowledge acquisition, Stage 2 improves reasoning skills like STEM, coding, and logical reasoning, and Stage 3 enhances long-context comprehension by extending training sequence lengths up to 32k tokens.
15
+ - **Scaling Law Guided Hyperparameter Tuning:** Through comprehensive scaling law studies across the three-stage pre-training pipeline, Qwen3 systematically tunes critical hyperparameters — such as learning rate scheduler and batch size — separately for dense and MoE models, resulting in better training dynamics and final performance across different model scales.
16
+
17
+ ## Model Overview
18
+
19
+ **Qwen3-8B-Base** has the following features:
20
+ - Type: Causal Language Models
21
+ - Training Stage: Pretraining
22
+ - Number of Parameters: 8.2B
23
+ - Number of Parameters (Non-Embedding): 6.95B
24
+ - Number of Layers: 36
25
+ - Number of Attention Heads (GQA): 32 for Q and 8 for KV
26
+ - Context Length: 32,768
27
+
28
+ For more details, including benchmark evaluation, hardware requirements, and inference performance, please refer to our [blog](https://qwenlm.github.io/blog/qwen3/), [GitHub](https://github.com/QwenLM/Qwen3), and [Documentation](https://qwen.readthedocs.io/en/latest/).
29
+
30
+ ## Requirements
31
+
32
+ The code of Qwen3 is included in the latest Hugging Face `transformers`, and we advise you to use the latest version of `transformers`.
33
+
34
+ With `transformers<4.51.0`, you will encounter the following error:
35
+ ```
36
+ KeyError: 'qwen3'
37
+ ```
38
+
39
+ ## Evaluation & Performance
40
+
41
+ Detailed evaluation results are reported in this [📑 blog](https://qwenlm.github.io/blog/qwen3/).
42
+
43
+ ### Citation
44
+
45
+ If you find our work helpful, feel free to cite us.
46
+
47
+ ```
48
+ @misc{qwen3technicalreport,
49
+ title={Qwen3 Technical Report},
50
+ author={Qwen Team},
51
+ year={2025},
52
+ eprint={2505.09388},
53
+ archivePrefix={arXiv},
54
+ primaryClass={cs.CL},
55
+ url={https://arxiv.org/abs/2505.09388},
56
+ }
57
+ ```
global_step_0/actor/huggingface/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3ForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151643,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 4096,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 12288,
14
+ "max_position_embeddings": 32768,
15
+ "max_window_layers": 36,
16
+ "model_type": "qwen3",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 36,
19
+ "num_key_value_heads": 8,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": null,
22
+ "rope_theta": 1000000,
23
+ "sliding_window": null,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "bfloat16",
26
+ "transformers_version": "4.51.0",
27
+ "use_cache": true,
28
+ "use_sliding_window": false,
29
+ "vocab_size": 151936
30
+ }
global_step_0/actor/huggingface/generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": false,
4
+ "eos_token_id": 151643,
5
+ "max_new_tokens": 2048,
6
+ "transformers_version": "4.37.0"
7
+ }
global_step_0/actor/huggingface/merges.txt ADDED
The diff for this file is too large to render. See raw diff