johndoe123345 commited on
Commit
110855f
·
verified ·
1 Parent(s): e60aa28

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. eval_results_0622/eval_results.csv +23 -0
  3. eval_results_0622/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  4. eval_results_0622/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  5. eval_results_0622/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  6. eval_results_0622/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  7. eval_results_0622/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  8. eval_results_0622/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  9. eval_results_0622/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  10. eval_results_0622/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  11. eval_results_0622/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  12. eval_results_0622/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  13. eval_results_0622/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  14. eval_results_0622/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  15. eval_results_0622/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  16. eval_results_0622/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  17. eval_results_0622/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  18. eval_results_0622/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  19. eval_results_0622/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  20. eval_results_0622/global_step_60/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  21. eval_results_0622/global_step_60/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  22. eval_results_0622/global_step_60/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  23. eval_results_0622/global_step_60/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  24. eval_results_0622/global_step_60/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  25. eval_results_0622/global_step_60/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  26. eval_results_0622/global_step_60/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  27. eval_results_0622/global_step_60/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  28. eval_results_0622/global_step_65/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  29. eval_results_0622/global_step_65/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  30. eval_results_0622/global_step_65/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  31. eval_results_0622/global_step_65/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  32. eval_results_0622/global_step_65/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  33. eval_results_0622/global_step_65/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  34. eval_results_0622/global_step_65/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  35. eval_results_0622/global_step_65/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  36. eval_results_0622/global_step_65/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  37. eval_results_0622/global_step_65/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  38. eval_results_0622/global_step_65/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  39. eval_results_0622/global_step_65/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  40. eval_results_0622/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  41. eval_results_0622/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  42. eval_results_0622/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  43. eval_results_0622/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  44. eval_results_0622/global_step_70/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  45. eval_results_0622/global_step_70/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  46. eval_results_0622/global_step_70/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  47. eval_results_0622/global_step_70/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  48. eval_results_0622/global_step_70/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  49. eval_results_0622/global_step_70/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  50. eval_results_0622/global_step_70/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
.gitattributes CHANGED
@@ -45,3 +45,6 @@ global_step_105/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -
45
  global_step_5/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
  global_step_75/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
  global_step_55/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
45
  global_step_5/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
  global_step_75/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
  global_step_55/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
+ global_step_15/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
+ global_step_30/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
+ global_step_95/actor/huggingface/tokenizer.json filter=lfs diff=lfs merge=lfs -text
eval_results_0622/eval_results.csv ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,minerva_math_acc,minerva_math_pass_acc,minerva_math_tokens,minerva_math_keywords,minerva_math_correct_tokens,minerva_math_wrong_tokens,minerva_math_clip_ratio,minerva_math_stop_tokens,minerva_math_stop_ratio,minerva_math_box_ratio,minerva_math_repeat_ratio,amc23_acc,amc23_pass_acc,amc23_tokens,amc23_keywords,amc23_correct_tokens,amc23_wrong_tokens,amc23_clip_ratio,amc23_stop_tokens,amc23_stop_ratio,amc23_box_ratio,amc23_repeat_ratio,aime24_acc,aime24_pass_acc,aime24_tokens,aime24_keywords,aime24_correct_tokens,aime24_wrong_tokens,aime24_clip_ratio,aime24_stop_tokens,aime24_stop_ratio,aime24_box_ratio,aime24_repeat_ratio,gsm8k_acc,gsm8k_pass_acc,gsm8k_tokens,gsm8k_keywords,gsm8k_correct_tokens,gsm8k_wrong_tokens,gsm8k_clip_ratio,gsm8k_stop_tokens,gsm8k_stop_ratio,gsm8k_box_ratio,gsm8k_repeat_ratio,math500_acc,math500_pass_acc,math500_tokens,math500_keywords,math500_correct_tokens,math500_wrong_tokens,math500_clip_ratio,math500_stop_tokens,math500_stop_ratio,math500_box_ratio,math500_repeat_ratio,olympiadbench_acc,olympiadbench_pass_acc,olympiadbench_tokens,olympiadbench_keywords,olympiadbench_correct_tokens,olympiadbench_wrong_tokens,olympiadbench_clip_ratio,olympiadbench_stop_tokens,olympiadbench_stop_ratio,olympiadbench_box_ratio,olympiadbench_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
2
+ eval_results-global_step_0,19.9,19.9,634.0625,0.15441176470588236,507.7037037037037,665.3623853211009,0.0,634.0625,1.0,0.7610294117647058,0.44485294117647056,35.0,35.0,867.125,0.25,1288.857142857143,640.0384615384615,0.0,867.125,1.0,0.85,0.625,3.3,3.3,1002.0,0.13333333333333333,713.0,1011.9655172413793,0.0,1002.0,1.0,0.7333333333333333,0.7666666666666667,74.8,74.8,341.27445034116755,0.0356330553449583,286.6764705882353,502.93693693693695,0.0,341.27445034116755,1.0,0.8377558756633814,0.2623199393479909,52.0,52.0,631.896,0.166,482.8076923076923,793.4083333333333,0.0,631.896,1.0,0.828,0.514,17.8,17.8,822.5377777777778,0.23851851851851852,655.625,858.627027027027,0.0,822.5377777777778,1.0,0.7822222222222223,0.6251851851851852,33.800000000000004,33.800000000000004,716.4826213531575,0.16298277865044875,655.7783349094624,745.3897768997064,0.0,716.4826213531575,1.0,0.798723473830607,0.5396707887293856
3
+ eval_results-global_step_5,33.1,33.1,646.5661764705883,0.13970588235294118,476.9888888888889,730.4230769230769,0.0,646.5661764705883,1.0,0.9852941176470589,0.44485294117647056,42.5,42.5,819.075,0.375,674.3529411764706,926.0434782608696,0.0,819.075,1.0,1.0,0.7,6.7,6.7,1234.2666666666667,0.3333333333333333,706.5,1271.9642857142858,0.0,1234.2666666666667,1.0,0.9333333333333333,0.6666666666666666,87.9,87.9,297.9673995451099,0.024260803639120546,281.0353753235548,420.61875,0.0,297.9673995451099,1.0,0.9977255496588324,0.2092494313874147,67.0,67.0,621.562,0.174,487.2656716417911,894.2242424242424,0.0,621.562,1.0,0.998,0.472,33.3,33.3,896.794074074074,0.37037037037037035,699.4,995.4911111111111,0.0,896.794074074074,1.0,0.9807407407407407,0.6681481481481482,45.083333333333336,45.083333333333336,752.7052194594065,0.23611173161596088,554.2571461717843,873.1274907389311,0.0,752.7052194594065,1.0,0.9825156235633276,0.5268195312297833
4
+ eval_results-global_step_10,33.8,33.8,667.3272058823529,0.11029411764705882,536.2391304347826,734.3277777777778,0.0,667.3272058823529,1.0,0.9963235294117647,0.45955882352941174,42.5,42.5,857.625,0.175,666.1764705882352,999.1304347826087,0.0,857.625,1.0,1.0,0.5,10.0,10.0,1121.2,0.23333333333333334,1034.6666666666667,1130.8148148148148,0.0,1121.2,1.0,0.9666666666666667,0.6333333333333333,90.3,90.3,317.0538286580743,0.02122820318423048,305.9731318219983,420.15625,0.0,317.0538286580743,1.0,0.9992418498862775,0.17589082638362397,72.4,72.4,626.22,0.158,536.0165745856353,862.8405797101449,0.0,626.22,1.0,1.0,0.47,33.2,33.2,945.2385185185185,0.2311111111111111,721.9017857142857,1056.1640798226165,0.0,945.2385185185185,1.0,0.9807407407407407,0.6888888888888889,47.03333333333333,47.03333333333333,755.7774255098243,0.15482779421262227,633.4956266352673,867.2389894846605,0.0,755.7774255098243,1.0,0.9904954644509082,0.48794531202254304
5
+ eval_results-global_step_15,39.3,39.3,694.3566176470588,0.125,564.4672897196261,778.5878787878788,0.0,694.3566176470588,1.0,0.9963235294117647,0.46691176470588236,50.0,50.0,906.825,0.15,898.4,915.25,0.0,906.825,1.0,1.0,0.55,13.3,13.3,1512.0666666666666,0.16666666666666666,927.0,1602.076923076923,0.0,1512.0666666666666,1.0,0.9333333333333333,0.7666666666666667,89.6,89.6,337.1008339651251,0.022744503411675512,325.97800338409473,433.06569343065695,0.0,337.1008339651251,1.0,0.9977255496588324,0.14783927217589082,73.8,73.8,650.856,0.144,551.1653116531165,931.6641221374045,0.0,650.856,1.0,0.998,0.466,34.5,34.5,950.1955555555555,0.2325925925925926,733.6008583690987,1064.3733031674208,0.0,950.1955555555555,1.0,0.9718518518518519,0.6637037037037037,50.083333333333336,50.083333333333336,841.9001123057345,0.1401672937784891,666.768577187656,954.1696534333809,0.0,841.9001123057345,1.0,0.9828723773759638,0.5101869012086906
6
+ eval_results-global_step_20,38.2,38.2,721.0441176470588,0.125,569.5576923076923,814.8214285714286,0.0,721.0441176470588,1.0,0.9963235294117647,0.48161764705882354,52.5,52.5,938.95,0.225,791.3333333333334,1102.1052631578948,0.0,938.95,1.0,0.975,0.75,13.3,13.3,1399.3666666666666,0.16666666666666666,935.0,1470.8076923076924,0.0,1399.3666666666666,1.0,0.9333333333333333,0.8666666666666667,89.7,89.7,332.289613343442,0.028051554207733132,323.70076077768385,407.0,0.0,332.289613343442,1.0,0.9992418498862775,0.16224412433661864,74.4,74.4,652.914,0.146,545.5188172043011,965.03125,0.0,652.914,1.0,1.0,0.478,36.6,36.6,936.9748148148149,0.28296296296296297,759.5303643724696,1039.3785046728972,0.0,936.9748148148149,1.0,0.9837037037037037,0.6488888888888888,50.78333333333334,50.78333333333334,830.256535411997,0.16228019730622711,654.1068279992467,966.5240231183188,0.0,830.256535411997,1.0,0.9812670693891797,0.564569554491833
7
+ eval_results-global_step_25,38.6,38.6,688.5588235294117,0.125,571.3047619047619,762.2814371257485,0.0,688.5588235294117,1.0,0.9963235294117647,0.4227941176470588,50.0,50.0,996.1,0.25,799.9,1192.3,0.0,996.1,1.0,1.0,0.725,10.0,10.0,1130.4666666666667,0.3333333333333333,783.6666666666666,1169.0,0.0,1130.4666666666667,1.0,0.9333333333333333,0.7333333333333333,90.3,90.3,333.89310083396515,0.019711902956785442,321.36104114189754,450.5,0.0,333.89310083396515,1.0,0.9992418498862775,0.15769522365428354,76.6,76.6,644.042,0.134,551.8694516971279,945.7692307692307,0.0,644.042,1.0,0.994,0.478,36.9,36.9,929.6785185185186,0.2874074074074074,718.0321285140562,1053.387323943662,0.0,929.6785185185186,1.0,0.9807407407407407,0.6444444444444445,50.4,50.4,787.1231849247603,0.19157544061625434,624.3556749874184,928.8729986397735,0.0,787.1231849247603,1.0,0.9839399088953527,0.5268778531798534
8
+ eval_results-global_step_30,37.5,37.5,689.9632352941177,0.09191176470588236,546.1372549019608,776.2588235294118,0.0,689.9632352941177,1.0,0.9963235294117647,0.47794117647058826,60.0,60.0,936.1,0.3,838.125,1083.0625,0.0,936.1,1.0,1.0,0.725,16.7,16.7,1308.8666666666666,0.4666666666666667,988.2,1373.0,0.0,1308.8666666666666,1.0,0.9333333333333333,0.7333333333333333,91.3,91.3,327.7414708112206,0.022744503411675512,320.4892026578073,403.6695652173913,0.0,327.7414708112206,1.0,0.9992418498862775,0.1645185746777862,75.6,75.6,646.47,0.156,540.8756613756614,973.639344262295,0.0,646.47,1.0,0.994,0.478,35.9,35.9,985.7733333333333,0.3348148148148148,751.9338842975206,1116.4642032332563,0.0,985.7733333333333,1.0,0.9733333333333334,0.6503703703703704,52.833333333333336,52.833333333333336,815.819117684223,0.2286896249331732,664.293500538825,954.349072707059,0.0,815.819117684223,1.0,0.9827053409941181,0.5381939091420129
9
+ eval_results-global_step_35,41.9,41.9,690.1507352941177,0.125,582.7280701754386,767.6582278481013,0.0,690.1507352941177,1.0,1.0,0.5036764705882353,57.5,57.5,961.375,0.175,738.1304347826087,1263.4117647058824,0.0,961.375,1.0,1.0,0.65,13.3,13.3,1410.7,0.7,892.0,1490.5,0.0,1410.7,1.0,0.9333333333333333,0.8666666666666667,90.4,90.4,326.88703563305535,0.016679302501895376,317.86839899413246,412.27777777777777,0.0,326.88703563305535,1.0,0.9992418498862775,0.15238817285822592,75.2,75.2,642.932,0.172,537.8803191489362,961.4758064516129,0.0,642.932,1.0,0.998,0.512,37.0,37.0,972.2474074074074,0.2785185185185185,768.512,1092.0917647058823,0.0,972.2474074074074,1.0,0.9733333333333334,0.6592592592592592,52.550000000000004,52.550000000000004,834.0486963890967,0.244532970170069,639.5198705168526,997.9025569148762,0.0,834.0486963890967,1.0,0.9839847527588241,0.5573317615620645
10
+ eval_results-global_step_40,41.2,41.2,730.5073529411765,0.13602941176470587,586.8928571428571,831.0375,0.0,730.5073529411765,1.0,0.9963235294117647,0.4889705882352941,57.5,57.5,926.85,0.35,766.0,1144.4705882352941,0.0,926.85,1.0,0.975,0.7,13.3,13.3,1355.9,0.43333333333333335,809.75,1439.923076923077,0.0,1355.9,1.0,0.9,0.8333333333333334,91.4,91.4,329.8673237300986,0.028051554207733132,322.12603648424545,412.4867256637168,0.0,329.8673237300986,1.0,0.9992418498862775,0.15769522365428354,75.6,75.6,636.426,0.156,545.1058201058202,919.3688524590164,0.0,636.426,1.0,1.0,0.458,39.0,39.0,979.6103703703703,0.3274074074074074,754.0950570342205,1123.5679611650485,0.0,979.6103703703703,1.0,0.9674074074074074,0.6562962962962963,53.0,53.0,826.5268411736075,0.23847028445219662,630.6616284611906,978.4757840743588,0.0,826.5268411736075,1.0,0.9729954644509081,0.5490492402532012
11
+ eval_results-global_step_45,41.2,41.2,700.3272058823529,0.13970588235294118,584.8482142857143,781.1625,0.0,700.3272058823529,1.0,0.9963235294117647,0.5367647058823529,52.5,52.5,1008.3,0.25,791.5238095238095,1247.8947368421052,0.0,1008.3,1.0,1.0,0.75,16.7,16.7,1122.3333333333333,0.5333333333333333,870.6,1172.68,0.0,1122.3333333333333,1.0,0.9666666666666667,0.7,91.1,91.1,329.82714177407126,0.01819560272934041,320.93588676103246,420.3220338983051,0.0,329.82714177407126,1.0,0.9992418498862775,0.16148597422289612,76.6,76.6,663.268,0.16,551.1436031331592,1030.3076923076924,0.0,663.268,1.0,0.99,0.47,38.1,38.1,937.2607407407407,0.35555555555555557,764.4824902723735,1043.4904306220096,0.0,937.2607407407407,1.0,0.9866666666666667,0.6666666666666666,52.70000000000001,52.70000000000001,793.5527369550831,0.24279839566186176,647.2556673293482,949.3095656116856,0.0,793.5527369550831,1.0,0.9898164521052292,0.5474862244619859
12
+ eval_results-global_step_50,39.0,39.0,721.4117647058823,0.15808823529411764,577.9150943396227,813.0421686746988,0.0,721.4117647058823,1.0,1.0,0.47794117647058826,47.5,47.5,1100.625,0.5,730.578947368421,1435.4285714285713,0.0,1100.625,1.0,0.95,0.7,13.3,13.3,1167.8,0.8,829.5,1219.8461538461538,0.0,1167.8,1.0,0.9666666666666667,0.9,91.1,91.1,335.8779378316907,0.019711902956785442,323.6447587354409,461.55555555555554,0.0,335.8779378316907,1.0,0.9984836997725549,0.1516300227445034,76.0,76.0,673.072,0.22,567.1473684210526,1008.5,0.0,673.072,1.0,0.988,0.454,40.3,40.3,973.7348148148149,0.3718518518518519,749.6764705882352,1124.9602977667494,0.0,973.7348148148149,1.0,0.9777777777777777,0.6681481481481482,51.199999999999996,51.199999999999996,828.7535862253981,0.34494199835045913,629.7437732421288,1010.5554578786214,0.0,828.7535862253981,1.0,0.9801546907028332,0.5586198912272068
13
+ eval_results-global_step_55,38.6,38.6,711.6691176470588,0.1323529411764706,559.3904761904762,807.4131736526946,0.0,711.6691176470588,1.0,1.0,0.4522058823529412,57.5,57.5,1152.75,0.325,800.695652173913,1629.0588235294117,0.0,1152.75,1.0,0.95,0.8,16.7,16.7,1314.5666666666666,1.8666666666666667,865.6,1404.36,0.0,1314.5666666666666,1.0,0.9333333333333333,0.8,91.5,91.5,348.7475360121304,0.022744503411675512,330.13587406793704,549.3214285714286,0.0,348.7475360121304,1.0,0.9984836997725549,0.15238817285822592,75.8,75.8,677.508,0.198,539.8496042216359,1108.685950413223,0.0,677.508,1.0,0.992,0.45,37.8,37.8,1022.8844444444444,0.30074074074074075,708.2627450980392,1213.904761904762,0.0,1022.8844444444444,1.0,0.965925925925926,0.6681481481481482,52.98333333333334,52.98333333333334,871.3542941283835,0.47425080866592556,633.9890586253335,1118.7906896785864,0.0,871.3542941283835,1.0,0.973290493171969,0.5537903672265526
14
+ eval_results-global_step_60,43.8,43.8,733.7352941176471,0.1875,582.9411764705883,851.0196078431372,0.0,733.7352941176471,1.0,0.9963235294117647,0.4889705882352941,55.0,55.0,991.675,0.275,827.8636363636364,1191.888888888889,0.0,991.675,1.0,1.0,0.725,10.0,10.0,1125.8666666666666,0.4666666666666667,845.6666666666666,1157.0,0.0,1125.8666666666666,1.0,0.9333333333333333,0.7,90.7,90.7,373.92115238817286,0.04397270659590599,340.8035117056856,695.9430894308944,0.0,373.92115238817286,1.0,0.9984836997725549,0.1645185746777862,76.8,76.8,678.958,0.146,548.7994791666666,1109.8275862068965,0.0,678.958,1.0,0.992,0.51,39.7,39.7,1049.9422222222222,0.3288888888888889,773.1417910447761,1232.208845208845,0.0,1049.9422222222222,1.0,0.9703703703703703,0.6785185185185185,52.666666666666664,52.666666666666664,825.6830558991181,0.24133804369191028,653.2027102363367,1039.648002929777,0.0,825.6830558991181,1.0,0.9817518221480039,0.5445012802385998
15
+ eval_results-global_step_65,41.2,41.2,739.7904411764706,0.15808823529411764,568.7232142857143,859.5375,0.0,739.7904411764706,1.0,0.9963235294117647,0.5110294117647058,52.5,52.5,1005.925,0.275,744.0,1295.421052631579,0.0,1005.925,1.0,0.975,0.725,13.3,13.3,1491.0333333333333,0.36666666666666664,902.5,1581.576923076923,0.0,1491.0333333333333,1.0,0.9,0.8,90.6,90.6,352.14859742228964,0.0356330553449583,335.63430962343097,511.2983870967742,0.0,352.14859742228964,1.0,0.9992418498862775,0.17664897649734648,77.0,77.0,676.31,0.182,560.987012987013,1062.391304347826,0.0,676.31,1.0,0.992,0.484,40.7,40.7,979.8533333333334,0.3422222222222222,764.7745454545454,1127.72,0.0,979.8533333333334,1.0,0.9837037037037037,0.6814814814814815,52.550000000000004,52.550000000000004,874.1767842109044,0.22660169658799414,646.103180391784,1072.990861192184,0.0,874.1767842109044,1.0,0.9743781805002909,0.5630266449572557
16
+ eval_results-global_step_70,41.2,41.2,749.5367647058823,0.16911764705882354,593.3125,858.89375,0.0,749.5367647058823,1.0,0.9963235294117647,0.5441176470588235,40.0,40.0,1033.2,0.25,672.25,1273.8333333333333,0.0,1033.2,1.0,0.975,0.675,13.3,13.3,1227.4,0.4666666666666667,892.75,1278.8846153846155,0.0,1227.4,1.0,0.9666666666666667,0.8666666666666667,91.4,91.4,398.0068233510235,0.030326004548900682,360.90464344941955,793.9823008849557,0.000758150113722517,386.1752655538695,0.9992418498862775,0.9992418498862775,0.19257012888551933,76.6,76.6,701.854,0.186,550.3185378590078,1197.905982905983,0.0,701.854,1.0,0.992,0.462,41.2,41.2,1046.6577777777777,0.33925925925925926,789.2517985611511,1226.9068010075566,0.0,1046.6577777777777,1.0,0.9733333333333334,0.6962962962962963,50.61666666666667,50.61666666666667,859.4425609724473,0.24022826292227503,643.1312466449298,1105.0677972527408,0.00012635835228708617,857.4706346729216,0.9998736416477129,0.9837608965496737,0.5727751231512177
17
+ eval_results-global_step_75,40.4,40.4,787.4227941176471,0.17279411764705882,583.7727272727273,925.7037037037037,0.0,787.4227941176471,1.0,0.9926470588235294,0.5110294117647058,55.0,55.0,1115.85,0.25,882.5454545454545,1401.0,0.0,1115.85,1.0,0.95,0.775,20.0,20.0,1656.8,4.166666666666667,949.6666666666666,1833.5833333333333,0.0,1656.8,1.0,0.9,0.8666666666666667,90.4,90.4,417.6664139499621,0.04094010614101592,385.6501677852349,718.1653543307086,0.000758150113722517,405.86191198786037,0.9992418498862775,0.9977255496588324,0.20318423047763456,76.0,76.0,681.972,0.18,549.6473684210526,1101.0,0.0,681.972,1.0,0.992,0.488,41.3,41.3,1065.4459259259258,0.37925925925925924,793.5089605734767,1257.0378787878788,0.0,1065.4459259259258,1.0,0.9748148148148148,0.72,53.85,53.85,954.192855665589,0.8649433582856668,690.7985575441021,1206.081711692604,0.00012635835228708617,952.2254386719055,0.9998736416477129,0.9678645705495295,0.5939800514848345
18
+ eval_results-global_step_80,40.8,40.8,787.0,0.14338235294117646,626.2342342342342,897.8385093167702,0.0,787.0,1.0,0.9926470588235294,0.5220588235294118,50.0,50.0,1035.375,0.3,859.1,1211.65,0.0,1035.375,1.0,1.0,0.7,13.3,13.3,1263.3333333333333,0.8,840.5,1328.3846153846155,0.0,1263.3333333333333,1.0,0.9333333333333333,0.7666666666666667,89.2,89.2,565.0629264594389,0.06595905989385899,492.38690476190476,1162.7342657342658,0.000758150113722517,553.3528072837632,0.9992418498862775,0.9992418498862775,0.221379833206975,74.2,74.2,695.19,0.174,578.7816711590297,1029.9767441860465,0.0,695.19,1.0,0.994,0.488,39.1,39.1,1121.6918518518519,0.36,803.75,1325.9172749391728,0.0,1121.6918518518519,1.0,0.96,0.674074074074074,51.1,51.1,911.2755186074373,0.3072235688058393,700.1254683591948,1159.4169015934785,0.00012635835228708617,909.323832078158,0.9998736416477129,0.9798703736738567,0.5620298995795213
19
+ eval_results-global_step_85,39.0,39.0,820.9926470588235,0.15073529411764705,606.7547169811321,957.7951807228916,0.0,820.9926470588235,1.0,0.9963235294117647,0.5036764705882353,57.5,57.5,1056.975,0.375,887.0,1286.9411764705883,0.0,1056.975,1.0,0.975,0.675,16.7,16.7,1470.6333333333334,0.6666666666666666,1010.4,1562.68,0.0,1470.6333333333334,1.0,0.9333333333333333,0.7333333333333333,88.8,88.8,537.8081880212281,0.08339651250947688,462.1870196413322,1136.1351351351352,0.000758150113722517,526.2488619119879,0.9992418498862775,0.9984836997725549,0.21304018195602728,75.6,75.6,692.224,0.188,554.8306878306878,1117.9180327868853,0.0,692.224,1.0,0.992,0.486,37.0,37.0,1108.2977777777778,0.38074074074074077,770.588,1306.9505882352942,0.0,1108.2977777777778,1.0,0.9629629629629629,0.6814814814814815,52.43333333333334,52.43333333333334,947.8218243651937,0.30742320233908854,715.2934040755252,1228.070018891799,0.00012635835228708617,945.8952700136538,0.9998736416477129,0.9763505875801025,0.5487552445598463
20
+ eval_results-global_step_90,38.2,38.2,734.4007352941177,0.19852941176470587,579.5192307692307,830.2797619047619,0.0,734.4007352941177,1.0,1.0,0.5330882352941176,45.0,45.0,1091.0,0.475,764.1111111111111,1358.4545454545455,0.0,1091.0,1.0,0.975,0.7,10.0,10.0,1423.4333333333334,3.7333333333333334,963.0,1474.5925925925926,0.0,1423.4333333333334,1.0,0.9666666666666667,0.6666666666666666,92.7,92.7,352.0235026535254,0.0356330553449583,331.28291087489777,616.25,0.0,352.0235026535254,1.0,0.9992418498862775,0.19484457922668688,73.2,73.2,680.796,0.198,534.9016393442623,1079.2835820895523,0.0,680.796,1.0,0.988,0.498,39.0,39.0,1151.0355555555554,0.4622222222222222,795.0684410646388,1378.2669902912621,0.0,1151.0355555555554,1.0,0.9585185185185185,0.6681481481481482,49.68333333333334,49.68333333333334,905.4481878060886,0.8504530037775369,661.3138888606901,1122.8545787221192,0.0,905.4481878060886,1.0,0.9812378391785771,0.5434579382226032
21
+ eval_results-global_step_95,41.9,41.9,718.4154411764706,0.19117647058823528,562.1228070175439,831.1835443037975,0.0,718.4154411764706,1.0,1.0,0.5257352941176471,57.5,57.5,1013.0,0.325,839.2173913043479,1248.1176470588234,0.0,1013.0,1.0,1.0,0.775,23.3,23.3,1412.0333333333333,0.6333333333333333,861.7142857142857,1579.5217391304348,0.0,1412.0333333333333,1.0,0.9333333333333333,0.8,91.7,91.7,338.63078089461715,0.034874905231235785,327.7617866004963,458.09090909090907,0.0,338.63078089461715,1.0,0.9992418498862775,0.19029567854435178,76.6,76.6,710.514,0.218,565.8668407310705,1184.017094017094,0.0,710.514,1.0,0.99,0.502,37.9,37.9,1115.7925925925927,0.3688888888888889,725.57421875,1354.2076372315037,0.0,1115.7925925925927,1.0,0.957037037037037,0.6844444444444444,54.81666666666666,54.81666666666666,884.731024666169,0.2952122663402822,647.0428883529574,1109.189761805427,0.0,884.731024666169,1.0,0.9799353700427748,0.5795792361844072
22
+ eval_results-global_step_100,37.5,37.5,840.9007352941177,0.1875,574.4313725490196,1000.7823529411764,0.0,840.9007352941177,1.0,0.9816176470588235,0.5294117647058824,52.5,52.5,1399.2,1.35,957.3333333333334,1887.578947368421,0.0,1399.2,1.0,0.95,0.775,13.3,13.3,1361.2333333333333,0.8333333333333334,865.5,1437.5,0.0,1361.2333333333333,1.0,0.9666666666666667,0.8,91.7,91.7,348.95451099317665,0.027293404094010616,333.8132231404959,517.0366972477065,0.0,348.95451099317665,1.0,0.9977255496588324,0.19029567854435178,75.0,75.0,757.522,0.218,548.016,1386.04,0.0,757.522,1.0,0.986,0.512,39.7,39.7,1103.3155555555556,0.4266666666666667,781.294776119403,1315.3587223587224,0.0,1103.3155555555556,1.0,0.9733333333333334,0.7214814814814815,51.61666666666667,51.61666666666667,968.521022529364,0.5071322340156684,676.7314508570421,1257.382786652671,0.0,968.521022529364,1.0,0.975890532786276,0.588031487455286
23
+ eval_results-global_step_105,41.5,41.5,733.1580882352941,0.15808823529411764,600.0265486725664,827.7735849056604,0.0,733.1580882352941,1.0,0.9963235294117647,0.5036764705882353,45.0,45.0,1019.825,0.325,736.1666666666666,1251.909090909091,0.0,1019.825,1.0,1.0,0.725,16.7,16.7,1720.5333333333333,0.5666666666666667,1200.6,1824.52,0.0,1720.5333333333333,1.0,0.8666666666666667,0.8333333333333334,91.8,91.8,332.62168309325244,0.02350265352539803,320.2923203963666,470.8703703703704,0.0,332.62168309325244,1.0,0.9992418498862775,0.20545868081880211,77.0,77.0,710.106,0.172,555.6623376623377,1227.1565217391305,0.0,710.106,1.0,0.99,0.486,40.6,40.6,1131.1348148148147,0.37777777777777777,787.5875912408759,1365.8778054862844,0.0,1131.1348148148147,1.0,0.96,0.6933333333333334,52.1,52.1,941.2298199127823,0.2705058888773267,700.0559107731355,1161.351228901756,0.0,941.2298199127823,1.0,0.9687053409941182,0.5744669696789506
eval_results_0622/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 10.0,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 10.0
10
+ },
11
+ "time_use_in_second": 22.24517822265625,
12
+ "time_use_in_minite": "0:22"
13
+ }
eval_results_0622/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 42.5,
7
+ "pass_acc": 42.5,
8
+ "pass@k": {
9
+ "1": 42.5
10
+ },
11
+ "time_use_in_second": 11.488719463348389,
12
+ "time_use_in_minite": "0:11"
13
+ }
eval_results_0622/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 90.3,
7
+ "pass_acc": 90.3,
8
+ "pass@k": {
9
+ "1": 90.3
10
+ },
11
+ "time_use_in_second": 75.81562781333923,
12
+ "time_use_in_minite": "1:15"
13
+ }
eval_results_0622/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 72.4,
7
+ "pass_acc": 72.4,
8
+ "pass@k": {
9
+ "1": 72.4
10
+ },
11
+ "time_use_in_second": 44.2845983505249,
12
+ "time_use_in_minite": "0:44"
13
+ }
eval_results_0622/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 3,
5
+ "empty_samples": 0,
6
+ "acc": 33.8,
7
+ "pass_acc": 33.8,
8
+ "pass@k": {
9
+ "1": 33.8
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 58.3,
13
+ "Dynamics and Control (2.003 Spring 2005)": 46.2,
14
+ "Ecology I (1.018J Fall 2009)": 20.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 0.0,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 22.6,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 22.7,
18
+ "Physical Chemistry (5.61 Fall 2017)": 36.4,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 44.4,
20
+ "Relativity (8.033 Fall 2006)": 45.5
21
+ },
22
+ "time_use_in_second": 31.698296308517456,
23
+ "time_use_in_minite": "0:31"
24
+ }
eval_results_0622/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 33.2,
7
+ "pass_acc": 33.2,
8
+ "pass@k": {
9
+ "1": 33.2
10
+ },
11
+ "time_use_in_second": 109.69667172431946,
12
+ "time_use_in_minite": "1:49"
13
+ }
eval_results_0622/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 2,
5
+ "empty_samples": 0,
6
+ "acc": 38.2,
7
+ "pass_acc": 38.2,
8
+ "pass@k": {
9
+ "1": 38.2
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 66.7,
13
+ "Dynamics and Control (2.003 Spring 2005)": 53.8,
14
+ "Ecology I (1.018J Fall 2009)": 60.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 24.5,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 26.8,
18
+ "Physical Chemistry (5.61 Fall 2017)": 18.2,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 55.6,
20
+ "Relativity (8.033 Fall 2006)": 27.3
21
+ },
22
+ "time_use_in_second": 55.51261758804321,
23
+ "time_use_in_minite": "0:55"
24
+ }
eval_results_0622/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_60/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 10.0,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 10.0
10
+ },
11
+ "time_use_in_second": 19.802618265151978,
12
+ "time_use_in_minite": "0:19"
13
+ }
eval_results_0622/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_60/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 55.0,
7
+ "pass_acc": 55.0,
8
+ "pass@k": {
9
+ "1": 55.0
10
+ },
11
+ "time_use_in_second": 13.86696457862854,
12
+ "time_use_in_minite": "0:13"
13
+ }
eval_results_0622/global_step_60/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_60/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 90.7,
7
+ "pass_acc": 90.7,
8
+ "pass@k": {
9
+ "1": 90.7
10
+ },
11
+ "time_use_in_second": 101.03804540634155,
12
+ "time_use_in_minite": "1:41"
13
+ }
eval_results_0622/global_step_60/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_60/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 76.8,
7
+ "pass_acc": 76.8,
8
+ "pass@k": {
9
+ "1": 76.8
10
+ },
11
+ "time_use_in_second": 75.19138765335083,
12
+ "time_use_in_minite": "1:15"
13
+ }
eval_results_0622/global_step_60/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_60/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 3,
5
+ "empty_samples": 0,
6
+ "acc": 43.8,
7
+ "pass_acc": 43.8,
8
+ "pass@k": {
9
+ "1": 43.8
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 64.6,
13
+ "Dynamics and Control (2.003 Spring 2005)": 50.0,
14
+ "Ecology I (1.018J Fall 2009)": 60.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 34.0,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 36.1,
18
+ "Physical Chemistry (5.61 Fall 2017)": 27.3,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 55.6,
20
+ "Relativity (8.033 Fall 2006)": 45.5
21
+ },
22
+ "time_use_in_second": 33.310840368270874,
23
+ "time_use_in_minite": "0:33"
24
+ }
eval_results_0622/global_step_60/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_60/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 39.7,
7
+ "pass_acc": 39.7,
8
+ "pass@k": {
9
+ "1": 39.7
10
+ },
11
+ "time_use_in_second": 119.3384759426117,
12
+ "time_use_in_minite": "1:59"
13
+ }
eval_results_0622/global_step_65/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_65/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 13.3,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 13.3
10
+ },
11
+ "time_use_in_second": 41.94261574745178,
12
+ "time_use_in_minite": "0:41"
13
+ }
eval_results_0622/global_step_65/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_65/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 52.5,
7
+ "pass_acc": 52.5,
8
+ "pass@k": {
9
+ "1": 52.5
10
+ },
11
+ "time_use_in_second": 22.45035743713379,
12
+ "time_use_in_minite": "0:22"
13
+ }
eval_results_0622/global_step_65/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_65/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 90.6,
7
+ "pass_acc": 90.6,
8
+ "pass@k": {
9
+ "1": 90.6
10
+ },
11
+ "time_use_in_second": 89.95159482955933,
12
+ "time_use_in_minite": "1:29"
13
+ }
eval_results_0622/global_step_65/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_65/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 77.0,
7
+ "pass_acc": 77.0,
8
+ "pass@k": {
9
+ "1": 77.0
10
+ },
11
+ "time_use_in_second": 57.73195791244507,
12
+ "time_use_in_minite": "0:57"
13
+ }
eval_results_0622/global_step_65/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_65/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 3,
5
+ "empty_samples": 1,
6
+ "acc": 41.2,
7
+ "pass_acc": 41.2,
8
+ "pass@k": {
9
+ "1": 41.2
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 64.6,
13
+ "Dynamics and Control (2.003 Spring 2005)": 53.8,
14
+ "Ecology I (1.018J Fall 2009)": 60.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 35.8,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 27.8,
18
+ "Physical Chemistry (5.61 Fall 2017)": 27.3,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 55.6,
20
+ "Relativity (8.033 Fall 2006)": 36.4
21
+ },
22
+ "time_use_in_second": 56.464439392089844,
23
+ "time_use_in_minite": "0:56"
24
+ }
eval_results_0622/global_step_65/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_65/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 40.7,
7
+ "pass_acc": 40.7,
8
+ "pass@k": {
9
+ "1": 40.7
10
+ },
11
+ "time_use_in_second": 108.06502532958984,
12
+ "time_use_in_minite": "1:48"
13
+ }
eval_results_0622/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_70/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 13.3,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 13.3
10
+ },
11
+ "time_use_in_second": 16.721750259399414,
12
+ "time_use_in_minite": "0:16"
13
+ }
eval_results_0622/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_70/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 40.0,
7
+ "pass_acc": 40.0,
8
+ "pass@k": {
9
+ "1": 40.0
10
+ },
11
+ "time_use_in_second": 18.163444995880127,
12
+ "time_use_in_minite": "0:18"
13
+ }
eval_results_0622/global_step_70/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_70/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 91.4,
7
+ "pass_acc": 91.4,
8
+ "pass@k": {
9
+ "1": 91.4
10
+ },
11
+ "time_use_in_second": 136.2193775177002,
12
+ "time_use_in_minite": "2:16"
13
+ }
eval_results_0622/global_step_70/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_70/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 76.6,
7
+ "pass_acc": 76.6,
8
+ "pass@k": {
9
+ "1": 76.6
10
+ },
11
+ "time_use_in_second": 75.49087476730347,
12
+ "time_use_in_minite": "1:15"
13
+ }
eval_results_0622/global_step_70/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_0622/global_step_70/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 2,
5
+ "empty_samples": 0,
6
+ "acc": 41.2,
7
+ "pass_acc": 41.2,
8
+ "pass@k": {
9
+ "1": 41.2
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 62.5,
13
+ "Dynamics and Control (2.003 Spring 2005)": 57.7,
14
+ "Ecology I (1.018J Fall 2009)": 40.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 30.2,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 29.9,
18
+ "Physical Chemistry (5.61 Fall 2017)": 36.4,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 50.0,
20
+ "Relativity (8.033 Fall 2006)": 54.5
21
+ },
22
+ "time_use_in_second": 57.781558990478516,
23
+ "time_use_in_minite": "0:57"
24
+ }
eval_results_0622/global_step_70/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff