johndoe123345 commited on
Commit
c9ba06f
·
verified ·
1 Parent(s): feb0990

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. eval_results/eval_results.csv +23 -0
  3. eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  4. eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  5. eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  6. eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  7. eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  8. eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  9. eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  10. eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  11. eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  12. eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  13. eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  14. eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  15. eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  16. eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  17. eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  18. eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  19. eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  20. eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  21. eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  22. eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  23. eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  24. eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  25. eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  26. eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  27. eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  28. eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  29. eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  30. eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  31. eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  32. eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  33. eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  34. eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  35. eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  36. eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  37. eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  38. eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  39. eval_results/global_step_45/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  40. eval_results/global_step_45/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  41. eval_results/global_step_45/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  42. eval_results/global_step_45/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  43. eval_results/global_step_45/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  44. eval_results/global_step_45/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  45. eval_results/global_step_45/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  46. eval_results/global_step_45/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  47. eval_results/global_step_45/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  48. eval_results/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  49. eval_results/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  50. eval_results/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
.gitattributes CHANGED
@@ -48,3 +48,5 @@ global_step_55/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -t
48
  global_step_15/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
  global_step_30/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
  global_step_95/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
48
  global_step_15/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
  global_step_30/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
  global_step_95/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
51
+ global_step_100/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
52
+ global_step_40/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
eval_results/eval_results.csv ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,minerva_math_acc,minerva_math_pass_acc,minerva_math_tokens,minerva_math_keywords,minerva_math_correct_tokens,minerva_math_wrong_tokens,minerva_math_clip_ratio,minerva_math_stop_tokens,minerva_math_stop_ratio,minerva_math_box_ratio,minerva_math_repeat_ratio,amc23_acc,amc23_pass_acc,amc23_tokens,amc23_keywords,amc23_correct_tokens,amc23_wrong_tokens,amc23_clip_ratio,amc23_stop_tokens,amc23_stop_ratio,amc23_box_ratio,amc23_repeat_ratio,aime24_acc,aime24_pass_acc,aime24_tokens,aime24_keywords,aime24_correct_tokens,aime24_wrong_tokens,aime24_clip_ratio,aime24_stop_tokens,aime24_stop_ratio,aime24_box_ratio,aime24_repeat_ratio,gsm8k_acc,gsm8k_pass_acc,gsm8k_tokens,gsm8k_keywords,gsm8k_correct_tokens,gsm8k_wrong_tokens,gsm8k_clip_ratio,gsm8k_stop_tokens,gsm8k_stop_ratio,gsm8k_box_ratio,gsm8k_repeat_ratio,math500_acc,math500_pass_acc,math500_tokens,math500_keywords,math500_correct_tokens,math500_wrong_tokens,math500_clip_ratio,math500_stop_tokens,math500_stop_ratio,math500_box_ratio,math500_repeat_ratio,olympiadbench_acc,olympiadbench_pass_acc,olympiadbench_tokens,olympiadbench_keywords,olympiadbench_correct_tokens,olympiadbench_wrong_tokens,olympiadbench_clip_ratio,olympiadbench_stop_tokens,olympiadbench_stop_ratio,olympiadbench_box_ratio,olympiadbench_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
2
+ eval_results-global_step_0,19.1,19.1,634.0625,0.15441176470588236,514.6538461538462,662.2863636363636,0.0,634.0625,1.0,0.7610294117647058,0.44485294117647056,35.0,35.0,867.125,0.25,1288.857142857143,640.0384615384615,0.0,867.125,1.0,0.85,0.625,3.3,3.3,1002.0,0.13333333333333333,713.0,1011.9655172413793,0.0,1002.0,1.0,0.7333333333333333,0.7666666666666667,74.8,74.8,341.27445034116755,0.0356330553449583,286.6764705882353,502.93693693693695,0.0,341.27445034116755,1.0,0.8377558756633814,0.2623199393479909,52.0,52.0,631.896,0.166,483.5692307692308,792.5833333333334,0.0,631.896,1.0,0.828,0.514,18.2,18.2,822.5377777777778,0.23851851851851852,652.9918699186992,860.3170289855072,0.0,822.5377777777778,1.0,0.7822222222222223,0.6251851851851852,33.73333333333333,33.73333333333333,716.4826213531575,0.16298277865044875,656.624760047859,745.021273611997,0.0,716.4826213531575,1.0,0.798723473830607,0.5396707887293856
3
+ eval_results-global_step_5,28.7,28.7,646.5661764705883,0.13970588235294118,465.1923076923077,719.4896907216495,0.0,646.5661764705883,1.0,0.9852941176470589,0.44485294117647056,42.5,42.5,819.075,0.375,674.3529411764706,926.0434782608696,0.0,819.075,1.0,1.0,0.7,6.7,6.7,1234.2666666666667,0.3333333333333333,706.5,1271.9642857142858,0.0,1234.2666666666667,1.0,0.9333333333333333,0.6666666666666666,87.9,87.9,297.9673995451099,0.024260803639120546,281.0353753235548,420.61875,0.0,297.9673995451099,1.0,0.9977255496588324,0.2092494313874147,67.4,67.4,621.562,0.174,490.16617210682494,893.2208588957055,0.0,621.562,1.0,0.998,0.472,34.4,34.4,896.794074074074,0.37037037037037035,699.7327586206897,999.9954853273138,0.0,896.794074074074,1.0,0.9807407407407407,0.6681481481481482,44.6,44.6,752.7052194594065,0.23611173161596088,552.8299258199746,871.8887581533041,0.0,752.7052194594065,1.0,0.9825156235633276,0.5268195312297833
4
+ eval_results-global_step_10,29.8,29.8,667.3272058823529,0.11029411764705882,529.5185185185185,725.7696335078534,0.0,667.3272058823529,1.0,0.9963235294117647,0.45955882352941174,42.5,42.5,857.625,0.175,666.1764705882352,999.1304347826087,0.0,857.625,1.0,1.0,0.5,10.0,10.0,1121.2,0.23333333333333334,1034.6666666666667,1130.8148148148148,0.0,1121.2,1.0,0.9666666666666667,0.6333333333333333,90.3,90.3,317.0538286580743,0.02122820318423048,305.9731318219983,420.15625,0.0,317.0538286580743,1.0,0.9992418498862775,0.17589082638362397,72.4,72.4,626.22,0.158,536.0165745856353,862.8405797101449,0.0,626.22,1.0,1.0,0.47,33.9,33.9,945.2385185185185,0.2311111111111111,725.5240174672489,1058.0515695067265,0.0,945.2385185185185,1.0,0.9807407407407407,0.6888888888888889,46.48333333333333,46.48333333333333,755.7774255098243,0.15482779421262227,632.9792299413838,866.1272137203581,0.0,755.7774255098243,1.0,0.9904954644509082,0.48794531202254304
5
+ eval_results-global_step_15,33.8,33.8,694.3566176470588,0.125,559.9891304347826,763.0333333333333,0.0,694.3566176470588,1.0,0.9963235294117647,0.46691176470588236,50.0,50.0,906.825,0.15,898.4,915.25,0.0,906.825,1.0,1.0,0.55,13.3,13.3,1512.0666666666666,0.16666666666666666,927.0,1602.076923076923,0.0,1512.0666666666666,1.0,0.9333333333333333,0.7666666666666667,89.6,89.6,337.1008339651251,0.022744503411675512,325.97800338409473,433.06569343065695,0.0,337.1008339651251,1.0,0.9977255496588324,0.14783927217589082,73.8,73.8,650.856,0.144,551.1653116531165,931.6641221374045,0.0,650.856,1.0,0.998,0.466,35.4,35.4,950.1955555555555,0.2325925925925926,736.3430962343097,1067.4220183486239,0.0,950.1955555555555,1.0,0.9718518518518519,0.6637037037037037,49.31666666666666,49.31666666666666,841.9001123057345,0.1401672937784891,666.4792569510506,952.0853483878237,0.0,841.9001123057345,1.0,0.9828723773759638,0.5101869012086906
6
+ eval_results-global_step_20,33.8,33.8,721.0441176470588,0.125,572.2934782608696,797.0722222222222,0.0,721.0441176470588,1.0,0.9963235294117647,0.48161764705882354,52.5,52.5,938.95,0.225,791.3333333333334,1102.1052631578948,0.0,938.95,1.0,0.975,0.75,13.3,13.3,1399.3666666666666,0.16666666666666666,935.0,1470.8076923076924,0.0,1399.3666666666666,1.0,0.9333333333333333,0.8666666666666667,89.7,89.7,332.289613343442,0.028051554207733132,323.70076077768385,407.0,0.0,332.289613343442,1.0,0.9992418498862775,0.16224412433661864,74.6,74.6,652.914,0.146,547.0187667560322,963.9291338582677,0.0,652.914,1.0,1.0,0.478,37.8,37.8,936.9748148148149,0.28296296296296297,763.435294117647,1042.3380952380953,0.0,936.9748148148149,1.0,0.9837037037037037,0.6488888888888888,50.28333333333333,50.28333333333333,830.256535411997,0.16228019730622711,655.4636055409277,963.8754011306955,0.0,830.256535411997,1.0,0.9812670693891797,0.564569554491833
7
+ eval_results-global_step_25,32.4,32.4,688.5588235294117,0.125,554.0681818181819,752.8804347826087,0.0,688.5588235294117,1.0,0.9963235294117647,0.4227941176470588,50.0,50.0,996.1,0.25,799.9,1192.3,0.0,996.1,1.0,1.0,0.725,10.0,10.0,1130.4666666666667,0.3333333333333333,783.6666666666666,1169.0,0.0,1130.4666666666667,1.0,0.9333333333333333,0.7333333333333333,90.3,90.3,333.89310083396515,0.019711902956785442,321.36104114189754,450.5,0.0,333.89310083396515,1.0,0.9992418498862775,0.15769522365428354,76.6,76.6,644.042,0.134,551.8694516971279,945.7692307692307,0.0,644.042,1.0,0.994,0.478,37.6,37.6,929.6785185185186,0.2874074074074074,723.8779527559055,1053.8432304038004,0.0,929.6785185185186,1.0,0.9807407407407407,0.6444444444444445,49.48333333333333,49.48333333333333,787.1231849247603,0.19157544061625434,622.4572156799633,927.3821493259401,0.0,787.1231849247603,1.0,0.9839399088953527,0.5268778531798534
8
+ eval_results-global_step_30,32.4,32.4,689.9632352941177,0.09191176470588236,542.0340909090909,760.7119565217391,0.0,689.9632352941177,1.0,0.9963235294117647,0.47794117647058826,60.0,60.0,936.1,0.3,838.125,1083.0625,0.0,936.1,1.0,1.0,0.725,16.7,16.7,1308.8666666666666,0.4666666666666667,988.2,1373.0,0.0,1308.8666666666666,1.0,0.9333333333333333,0.7333333333333333,91.3,91.3,327.7414708112206,0.022744503411675512,320.4892026578073,403.6695652173913,0.0,327.7414708112206,1.0,0.9992418498862775,0.1645185746777862,76.0,76.0,646.47,0.156,542.8552631578947,974.5833333333334,0.0,646.47,1.0,0.994,0.478,36.7,36.7,985.7733333333333,0.3348148148148148,756.2540322580645,1119.0772833723654,0.0,985.7733333333333,1.0,0.9733333333333334,0.6503703703703704,52.18333333333333,52.18333333333333,815.819117684223,0.2286896249331732,664.6595981638095,952.3507730741381,0.0,815.819117684223,1.0,0.9827053409941181,0.5381939091420129
9
+ eval_results-global_step_35,37.1,37.1,690.1507352941177,0.125,575.9108910891089,757.625730994152,0.0,690.1507352941177,1.0,1.0,0.5036764705882353,57.5,57.5,961.375,0.175,738.1304347826087,1263.4117647058824,0.0,961.375,1.0,1.0,0.65,13.3,13.3,1410.7,0.7,892.0,1490.5,0.0,1410.7,1.0,0.9333333333333333,0.8666666666666667,90.4,90.4,326.88703563305535,0.016679302501895376,317.86839899413246,412.27777777777777,0.0,326.88703563305535,1.0,0.9992418498862775,0.15238817285822592,75.2,75.2,642.932,0.172,537.8803191489362,961.4758064516129,0.0,642.932,1.0,0.998,0.512,37.9,37.9,972.2474074074074,0.2785185185185185,773.57421875,1093.6324582338902,0.0,972.2474074074074,1.0,0.9733333333333334,0.6592592592592592,51.9,51.9,834.0486963890967,0.244532970170069,639.2273771274644,996.4872563605526,0.0,834.0486963890967,1.0,0.9839847527588241,0.5573317615620645
10
+ eval_results-global_step_40,34.6,34.6,730.5073529411765,0.13602941176470587,573.7234042553191,813.3033707865169,0.0,730.5073529411765,1.0,0.9963235294117647,0.4889705882352941,57.5,57.5,926.85,0.35,766.0,1144.4705882352941,0.0,926.85,1.0,0.975,0.7,13.3,13.3,1355.9,0.43333333333333335,809.75,1439.923076923077,0.0,1355.9,1.0,0.9,0.8333333333333334,91.4,91.4,329.8673237300986,0.028051554207733132,322.12603648424545,412.4867256637168,0.0,329.8673237300986,1.0,0.9992418498862775,0.15769522365428354,75.8,75.8,636.426,0.156,546.9182058047494,916.7851239669421,0.0,636.426,1.0,1.0,0.458,39.6,39.6,979.6103703703703,0.3274074074074074,759.4531835205993,1123.6838235294117,0.0,979.6103703703703,1.0,0.9674074074074074,0.6562962962962963,52.03333333333334,52.03333333333334,826.5268411736075,0.23847028445219662,629.661805010819,975.1087848508264,0.0,826.5268411736075,1.0,0.9729954644509081,0.5490492402532012
11
+ eval_results-global_step_45,34.9,34.9,700.3272058823529,0.13970588235294118,574.6947368421053,767.7570621468926,0.0,700.3272058823529,1.0,0.9963235294117647,0.5367647058823529,52.5,52.5,1008.3,0.25,791.5238095238095,1247.8947368421052,0.0,1008.3,1.0,1.0,0.75,16.7,16.7,1122.3333333333333,0.5333333333333333,870.6,1172.68,0.0,1122.3333333333333,1.0,0.9666666666666667,0.7,91.1,91.1,329.82714177407126,0.01819560272934041,320.93588676103246,420.3220338983051,0.0,329.82714177407126,1.0,0.9992418498862775,0.16148597422289612,76.6,76.6,663.268,0.16,551.1436031331592,1030.3076923076924,0.0,663.268,1.0,0.99,0.47,39.1,39.1,937.2607407407407,0.35555555555555557,768.2045454545455,1045.851581508516,0.0,937.2607407407407,1.0,0.9866666666666667,0.6666666666666666,51.81666666666666,51.81666666666666,793.5527369550831,0.24279839566186176,646.1837636191086,947.4688511172518,0.0,793.5527369550831,1.0,0.9898164521052292,0.5474862244619859
12
+ eval_results-global_step_50,35.3,35.3,721.4117647058823,0.15808823529411764,573.0833333333334,802.3181818181819,0.0,721.4117647058823,1.0,1.0,0.47794117647058826,47.5,47.5,1100.625,0.5,730.578947368421,1435.4285714285713,0.0,1100.625,1.0,0.95,0.7,13.3,13.3,1167.8,0.8,829.5,1219.8461538461538,0.0,1167.8,1.0,0.9666666666666667,0.9,91.1,91.1,335.8779378316907,0.019711902956785442,323.6447587354409,461.55555555555554,0.0,335.8779378316907,1.0,0.9984836997725549,0.1516300227445034,76.0,76.0,673.072,0.22,567.1473684210526,1008.5,0.0,673.072,1.0,0.988,0.454,40.9,40.9,973.7348148148149,0.3718518518518519,752.9710144927536,1126.4436090225563,0.0,973.7348148148149,1.0,0.9777777777777777,0.6681481481481482,50.68333333333333,50.68333333333333,828.7535862253981,0.34494199835045913,629.4875703918336,1009.0153452785031,0.0,828.7535862253981,1.0,0.9801546907028332,0.5586198912272068
13
+ eval_results-global_step_55,34.2,34.2,711.6691176470588,0.1323529411764706,538.7849462365591,801.4916201117319,0.0,711.6691176470588,1.0,1.0,0.4522058823529412,57.5,57.5,1152.75,0.325,800.695652173913,1629.0588235294117,0.0,1152.75,1.0,0.95,0.8,16.7,16.7,1314.5666666666666,1.8666666666666667,865.6,1404.36,0.0,1314.5666666666666,1.0,0.9333333333333333,0.8,91.5,91.5,348.7475360121304,0.022744503411675512,330.13587406793704,549.3214285714286,0.0,348.7475360121304,1.0,0.9984836997725549,0.15238817285822592,75.8,75.8,677.508,0.198,539.8496042216359,1108.685950413223,0.0,677.508,1.0,0.992,0.45,38.5,38.5,1022.8844444444444,0.30074074074074075,713.55,1216.6843373493975,0.0,1022.8844444444444,1.0,0.965925925925926,0.6681481481481482,52.36666666666667,52.36666666666667,871.3542941283835,0.47425080866592556,631.436012783341,1118.267026662532,0.0,871.3542941283835,1.0,0.973290493171969,0.5537903672265526
14
+ eval_results-global_step_60,37.1,37.1,733.7352941176471,0.1875,563.3069306930693,834.3976608187135,0.0,733.7352941176471,1.0,0.9963235294117647,0.4889705882352941,55.0,55.0,991.675,0.275,827.8636363636364,1191.888888888889,0.0,991.675,1.0,1.0,0.725,10.0,10.0,1125.8666666666666,0.4666666666666667,845.6666666666666,1157.0,0.0,1125.8666666666666,1.0,0.9333333333333333,0.7,90.7,90.7,373.92115238817286,0.04397270659590599,340.8035117056856,695.9430894308944,0.0,373.92115238817286,1.0,0.9984836997725549,0.1645185746777862,76.8,76.8,678.958,0.146,548.7994791666666,1109.8275862068965,0.0,678.958,1.0,0.992,0.51,40.6,40.6,1049.9422222222222,0.3288888888888889,779.4890510948906,1234.7406483790523,0.0,1049.9422222222222,1.0,0.9703703703703703,0.6785185185185185,51.70000000000001,51.70000000000001,825.6830558991181,0.24133804369191028,650.9882126151025,1037.2996456207409,0.0,825.6830558991181,1.0,0.9817518221480039,0.5445012802385998
15
+ eval_results-global_step_65,37.1,37.1,739.7904411764706,0.15808823529411764,571.4158415841584,839.2397660818714,0.0,739.7904411764706,1.0,0.9963235294117647,0.5110294117647058,52.5,52.5,1005.925,0.275,744.0,1295.421052631579,0.0,1005.925,1.0,0.975,0.725,13.3,13.3,1491.0333333333333,0.36666666666666664,902.5,1581.576923076923,0.0,1491.0333333333333,1.0,0.9,0.8,90.6,90.6,352.14859742228964,0.0356330553449583,335.63430962343097,511.2983870967742,0.0,352.14859742228964,1.0,0.9992418498862775,0.17664897649734648,77.0,77.0,676.31,0.182,560.987012987013,1062.391304347826,0.0,676.31,1.0,0.992,0.484,41.5,41.5,979.8533333333334,0.3422222222222222,772.4607142857143,1126.8658227848102,0.0,979.8533333333334,1.0,0.9837037037037037,0.6814814814814815,52.0,52.0,874.1767842109044,0.22660169658799414,647.8329797467194,1069.465542669964,0.0,874.1767842109044,1.0,0.9743781805002909,0.5630266449572557
16
+ eval_results-global_step_70,35.7,35.7,749.5367647058823,0.16911764705882354,589.0515463917526,838.4914285714286,0.0,749.5367647058823,1.0,0.9963235294117647,0.5441176470588235,40.0,40.0,1033.2,0.25,672.25,1273.8333333333333,0.0,1033.2,1.0,0.975,0.675,13.3,13.3,1227.4,0.4666666666666667,892.75,1278.8846153846155,0.0,1227.4,1.0,0.9666666666666667,0.8666666666666667,91.4,91.4,398.0068233510235,0.030326004548900682,360.90464344941955,793.9823008849557,0.000758150113722517,386.1752655538695,0.9992418498862775,0.9992418498862775,0.19257012888551933,76.6,76.6,701.854,0.186,550.3185378590078,1197.905982905983,0.0,701.854,1.0,0.992,0.462,41.8,41.8,1046.6577777777777,0.33925925925925926,790.7056737588653,1230.3180661577608,0.0,1046.6577777777777,1.0,0.9733333333333334,0.6962962962962963,49.800000000000004,49.800000000000004,859.4425609724473,0.24022826292227503,642.6634002431742,1102.2359545396794,0.00012635835228708617,857.4706346729216,0.9998736416477129,0.9837608965496737,0.5727751231512177
17
+ eval_results-global_step_75,34.9,34.9,787.4227941176471,0.17279411764705882,575.0105263157894,901.4293785310734,0.0,787.4227941176471,1.0,0.9926470588235294,0.5110294117647058,55.0,55.0,1115.85,0.25,882.5454545454545,1401.0,0.0,1115.85,1.0,0.95,0.775,20.0,20.0,1656.8,4.166666666666667,949.6666666666666,1833.5833333333333,0.0,1656.8,1.0,0.9,0.8666666666666667,90.4,90.4,417.6664139499621,0.04094010614101592,385.6501677852349,718.1653543307086,0.000758150113722517,405.86191198786037,0.9992418498862775,0.9977255496588324,0.20318423047763456,76.0,76.0,681.972,0.18,549.6473684210526,1101.0,0.0,681.972,1.0,0.992,0.488,42.2,42.2,1065.4459259259258,0.37925925925925924,799.9578947368421,1259.4564102564102,0.0,1065.4459259259258,1.0,0.9748148148148148,0.72,53.083333333333336,53.083333333333336,954.192855665589,0.8649433582856668,690.4130130785067,1202.4390794085875,0.00012635835228708617,952.2254386719055,0.9998736416477129,0.9678645705495295,0.5939800514848345
18
+ eval_results-global_step_80,34.9,34.9,787.0,0.14338235294117646,590.1052631578947,892.6779661016949,0.0,787.0,1.0,0.9926470588235294,0.5220588235294118,50.0,50.0,1035.375,0.3,859.1,1211.65,0.0,1035.375,1.0,1.0,0.7,13.3,13.3,1263.3333333333333,0.8,840.5,1328.3846153846155,0.0,1263.3333333333333,1.0,0.9333333333333333,0.7666666666666667,89.2,89.2,565.0629264594389,0.06595905989385899,492.38690476190476,1162.7342657342658,0.000758150113722517,553.3528072837632,0.9992418498862775,0.9992418498862775,0.221379833206975,74.2,74.2,695.19,0.174,578.7816711590297,1029.9767441860465,0.0,695.19,1.0,0.994,0.488,39.7,39.7,1121.6918518518519,0.36,809.1529850746268,1327.4914004914006,0.0,1121.6918518518519,1.0,0.96,0.674074074074074,50.21666666666667,50.21666666666667,911.2755186074373,0.3072235688058393,695.0044706922425,1158.8191653163374,0.00012635835228708617,909.323832078158,0.9998736416477129,0.9798703736738567,0.5620298995795213
19
+ eval_results-global_step_85,34.2,34.2,771.1323529411765,0.16176470588235295,565.0967741935484,878.1787709497206,0.0,771.1323529411765,1.0,0.9963235294117647,0.5,52.5,52.5,1032.075,0.325,770.7142857142857,1320.9473684210527,0.0,1032.075,1.0,1.0,0.75,10.0,10.0,1220.4,0.7333333333333333,1064.0,1237.7777777777778,0.0,1220.4,1.0,0.9666666666666667,0.7,89.8,89.8,508.84685367702804,0.04169825625473844,442.0287162162162,1094.8666666666666,0.0,508.84685367702804,1.0,0.9992418498862775,0.22441243366186506,77.2,77.2,663.266,0.192,563.9974093264249,999.3859649122807,0.0,663.266,1.0,0.994,0.508,38.2,38.2,1132.1303703703704,0.3422222222222222,798.3798449612403,1338.6235011990407,0.0,1132.1303703703704,1.0,0.9644444444444444,0.6918518518518518,50.31666666666666,50.31666666666666,887.9750961647625,0.29933641961544116,700.7028384019526,1144.9633416544232,0.0,887.9750961647625,1.0,0.9867794150681921,0.5623773809189528
20
+ eval_results-global_step_90,37.1,37.1,740.9632352941177,0.18382352941176472,577.1386138613861,837.7251461988304,0.0,740.9632352941177,1.0,0.9926470588235294,0.5220588235294118,55.0,55.0,1142.25,0.15,845.1818181818181,1505.3333333333333,0.0,1142.25,1.0,0.975,0.6,6.7,6.7,1225.1666666666667,0.9,811.5,1254.7142857142858,0.0,1225.1666666666667,1.0,0.9666666666666667,0.6666666666666666,91.5,91.5,353.385140257771,0.026535253980288095,336.3927091963546,536.5089285714286,0.0,353.385140257771,1.0,0.9977255496588324,0.19181197877179681,76.0,76.0,702.334,0.212,541.4421052631579,1211.825,0.0,702.334,1.0,0.99,0.488,39.6,39.6,1097.1392592592592,0.5733333333333334,788.9812734082396,1298.8014705882354,0.0,1097.1392592592592,1.0,0.9674074074074074,0.6814814814814815,50.98333333333334,50.98333333333334,876.8730502463023,0.3409486861208977,650.1060866518261,1107.4846940676855,0.0,876.8730502463023,1.0,0.9815744470927393,0.5250031584082261
21
+ eval_results-global_step_95,36.0,36.0,718.7720588235294,0.19852941176470587,567.6020408163265,803.9137931034483,0.0,718.7720588235294,1.0,1.0,0.5257352941176471,47.5,47.5,1109.0,0.375,806.0,1383.142857142857,0.0,1109.0,1.0,1.0,0.725,26.7,26.7,1233.7666666666667,0.6333333333333333,912.5,1350.590909090909,0.0,1233.7666666666667,1.0,0.9666666666666667,0.8,92.6,92.6,338.8718726307809,0.022744503411675512,322.027846027846,548.734693877551,0.0,338.8718726307809,1.0,0.9977255496588324,0.17664897649734648,75.8,75.8,700.232,0.206,554.7282321899736,1155.98347107438,0.0,700.232,1.0,0.988,0.482,41.6,41.6,1011.9318518518519,0.38222222222222224,783.9715302491103,1174.51269035533,0.0,1011.9318518518519,1.0,0.9807407407407407,0.6844444444444444,53.366666666666674,53.366666666666674,852.0957416621382,0.3029715784553228,657.8049415472095,1069.4797357740792,0.0,852.0957416621382,1.0,0.9888554928443734,0.565638119176573
22
+ eval_results-global_step_100,33.5,33.5,754.2058823529412,0.17279411764705882,564.1318681318681,849.767955801105,0.0,754.2058823529412,1.0,0.9963235294117647,0.49264705882352944,42.5,42.5,1364.225,0.225,774.1176470588235,1800.391304347826,0.0,1364.225,1.0,0.95,0.725,16.7,16.7,1428.3666666666666,0.9,976.0,1518.84,0.0,1428.3666666666666,1.0,0.9333333333333333,0.8,92.5,92.5,359.01743745261564,0.026535253980288095,338.1155737704918,616.5959595959596,0.000758150113722517,347.1502276176024,0.9992418498862775,0.9977255496588324,0.1819560272934041,75.4,75.4,714.972,0.222,555.710875331565,1203.1138211382113,0.0,714.972,1.0,0.988,0.512,41.2,41.2,1173.245925925926,0.36444444444444446,796.0539568345324,1437.375314861461,0.0,1173.245925925926,1.0,0.9614814814814815,0.674074074074074,50.300000000000004,50.300000000000004,965.6721520663582,0.3184623026786319,667.3549868545468,1237.680725957427,0.00012635835228708617,963.6942837605226,0.9998736416477129,0.9711439823142353,0.5642795266985012
23
+ eval_results-global_step_105,37.9,37.9,777.3455882352941,0.13970588235294118,609.5145631067961,879.6331360946746,0.0,777.3455882352941,1.0,0.9926470588235294,0.5036764705882353,52.5,52.5,1020.075,0.175,838.0952380952381,1221.2105263157894,0.0,1020.075,1.0,0.975,0.7,16.7,16.7,1761.2666666666667,0.8,1185.4,1876.44,0.0,1761.2666666666667,1.0,0.8666666666666667,0.8333333333333334,91.6,91.6,339.6785443517816,0.026535253980288095,318.83029801324506,566.5675675675676,0.0,339.6785443517816,1.0,0.9984836997725549,0.20621683093252463,77.4,77.4,786.982,0.164,564.8630490956073,1547.6902654867256,0.0,786.982,1.0,0.978,0.52,39.0,39.0,1170.4977777777779,0.36148148148148146,758.9733840304183,1433.1941747572816,0.0,1170.4977777777779,1.0,0.96,0.6992592592592592,52.51666666666667,52.51666666666667,975.9742628385867,0.27778710296911846,712.6127553902174,1254.122611703673,0.0,975.9742628385867,1.0,0.9617995708771252,0.5770809823522254
eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 3.3,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 3.3
10
+ },
11
+ "time_use_in_second": 40.46293258666992,
12
+ "time_use_in_minite": "0:40"
13
+ }
eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 35.0,
7
+ "pass_acc": 35.0,
8
+ "pass@k": {
9
+ "1": 35.0
10
+ },
11
+ "time_use_in_second": 41.554773569107056,
12
+ "time_use_in_minite": "0:41"
13
+ }
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 18,
6
+ "acc": 74.8,
7
+ "pass_acc": 74.8,
8
+ "pass@k": {
9
+ "1": 74.8
10
+ },
11
+ "time_use_in_second": 96.72446346282959,
12
+ "time_use_in_minite": "1:36"
13
+ }
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 9,
6
+ "acc": 52.0,
7
+ "pass_acc": 52.0,
8
+ "pass@k": {
9
+ "1": 52.0
10
+ },
11
+ "time_use_in_second": 72.89108872413635,
12
+ "time_use_in_minite": "1:12"
13
+ }
eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 21,
6
+ "acc": 19.1,
7
+ "pass_acc": 19.1,
8
+ "pass@k": {
9
+ "1": 19.1
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 37.5,
13
+ "Dynamics and Control (2.003 Spring 2005)": 23.1,
14
+ "Ecology I (1.018J Fall 2009)": 20.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 11.3,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 12.4,
18
+ "Physical Chemistry (5.61 Fall 2017)": 0.0,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 33.3,
20
+ "Relativity (8.033 Fall 2006)": 18.2
21
+ },
22
+ "time_use_in_second": 51.470000982284546,
23
+ "time_use_in_minite": "0:51"
24
+ }
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 20,
6
+ "acc": 18.2,
7
+ "pass_acc": 18.2,
8
+ "pass@k": {
9
+ "1": 18.2
10
+ },
11
+ "time_use_in_second": 101.84683203697205,
12
+ "time_use_in_minite": "1:41"
13
+ }
eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 10.0,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 10.0
10
+ },
11
+ "time_use_in_second": 22.103896141052246,
12
+ "time_use_in_minite": "0:22"
13
+ }
eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 42.5,
7
+ "pass_acc": 42.5,
8
+ "pass@k": {
9
+ "1": 42.5
10
+ },
11
+ "time_use_in_second": 11.616541385650635,
12
+ "time_use_in_minite": "0:11"
13
+ }
eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 90.3,
7
+ "pass_acc": 90.3,
8
+ "pass@k": {
9
+ "1": 90.3
10
+ },
11
+ "time_use_in_second": 75.52257251739502,
12
+ "time_use_in_minite": "1:15"
13
+ }
eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 72.4,
7
+ "pass_acc": 72.4,
8
+ "pass@k": {
9
+ "1": 72.4
10
+ },
11
+ "time_use_in_second": 45.3357515335083,
12
+ "time_use_in_minite": "0:45"
13
+ }
eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 29.8,
7
+ "pass_acc": 29.8,
8
+ "pass@k": {
9
+ "1": 29.8
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 56.2,
13
+ "Dynamics and Control (2.003 Spring 2005)": 46.2,
14
+ "Ecology I (1.018J Fall 2009)": 20.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 0.0,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 17.0,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 17.5,
18
+ "Physical Chemistry (5.61 Fall 2017)": 18.2,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 55.6,
20
+ "Relativity (8.033 Fall 2006)": 27.3
21
+ },
22
+ "time_use_in_second": 31.15471076965332,
23
+ "time_use_in_minite": "0:31"
24
+ }
eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 33.9,
7
+ "pass_acc": 33.9,
8
+ "pass@k": {
9
+ "1": 33.9
10
+ },
11
+ "time_use_in_second": 110.84913039207458,
12
+ "time_use_in_minite": "1:50"
13
+ }
eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 13.3,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 13.3
10
+ },
11
+ "time_use_in_second": 41.12174916267395,
12
+ "time_use_in_minite": "0:41"
13
+ }
eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 52.5,
7
+ "pass_acc": 52.5,
8
+ "pass@k": {
9
+ "1": 52.5
10
+ },
11
+ "time_use_in_second": 19.10493278503418,
12
+ "time_use_in_minite": "0:19"
13
+ }
eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 89.7,
7
+ "pass_acc": 89.7,
8
+ "pass@k": {
9
+ "1": 89.7
10
+ },
11
+ "time_use_in_second": 60.75820541381836,
12
+ "time_use_in_minite": "1:00"
13
+ }
eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 74.6,
7
+ "pass_acc": 74.6,
8
+ "pass@k": {
9
+ "1": 74.6
10
+ },
11
+ "time_use_in_second": 50.139283895492554,
12
+ "time_use_in_minite": "0:50"
13
+ }
eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 33.8,
7
+ "pass_acc": 33.8,
8
+ "pass@k": {
9
+ "1": 33.8
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 64.6,
13
+ "Dynamics and Control (2.003 Spring 2005)": 46.2,
14
+ "Ecology I (1.018J Fall 2009)": 60.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 18.9,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 20.6,
18
+ "Physical Chemistry (5.61 Fall 2017)": 0.0,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 66.7,
20
+ "Relativity (8.033 Fall 2006)": 27.3
21
+ },
22
+ "time_use_in_second": 55.90699505805969,
23
+ "time_use_in_minite": "0:55"
24
+ }
eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 37.8,
7
+ "pass_acc": 37.8,
8
+ "pass@k": {
9
+ "1": 37.8
10
+ },
11
+ "time_use_in_second": 102.87086319923401,
12
+ "time_use_in_minite": "1:42"
13
+ }
eval_results/global_step_45/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_45/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 16.7,
7
+ "pass_acc": 16.7,
8
+ "pass@k": {
9
+ "1": 16.7
10
+ },
11
+ "time_use_in_second": 16.225586891174316,
12
+ "time_use_in_minite": "0:16"
13
+ }
eval_results/global_step_45/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_45/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 52.5,
7
+ "pass_acc": 52.5,
8
+ "pass@k": {
9
+ "1": 52.5
10
+ },
11
+ "time_use_in_second": 16.344417095184326,
12
+ "time_use_in_minite": "0:16"
13
+ }
eval_results/global_step_45/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_45/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 91.1,
7
+ "pass_acc": 91.1,
8
+ "pass@k": {
9
+ "1": 91.1
10
+ },
11
+ "time_use_in_second": 60.893043756484985,
12
+ "time_use_in_minite": "1:00"
13
+ }
eval_results/global_step_45/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 76.6,
7
+ "pass_acc": 76.6,
8
+ "pass@k": {
9
+ "1": 76.6
10
+ },
11
+ "time_use_in_second": 71.2480161190033,
12
+ "time_use_in_minite": "1:11"
13
+ }
eval_results/global_step_45/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_45/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 34.9,
7
+ "pass_acc": 34.9,
8
+ "pass@k": {
9
+ "1": 34.9
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 62.5,
13
+ "Dynamics and Control (2.003 Spring 2005)": 50.0,
14
+ "Ecology I (1.018J Fall 2009)": 60.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 26.4,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 20.6,
18
+ "Physical Chemistry (5.61 Fall 2017)": 9.1,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 55.6,
20
+ "Relativity (8.033 Fall 2006)": 27.3
21
+ },
22
+ "time_use_in_second": 30.01806926727295,
23
+ "time_use_in_minite": "0:30"
24
+ }
eval_results/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 10.0,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 10.0
10
+ },
11
+ "time_use_in_second": 19.720505237579346,
12
+ "time_use_in_minite": "0:19"
13
+ }
eval_results/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff