bensondccnqwc commited on
Commit
1692758
·
verified ·
1 Parent(s): 05d7423

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. eval_results/eval_results.csv +12 -0
  2. eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  3. eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  4. eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  5. eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  6. eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  7. eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  8. eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  9. eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  10. eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  11. eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  12. eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  13. eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  14. eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  15. eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  16. eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  17. eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  18. eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  19. eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  20. eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  21. eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  22. eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  23. eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  24. eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  25. eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  26. eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  27. eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  28. eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  29. eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  30. eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  31. eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  32. eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  33. eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  34. eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  35. eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  36. eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  37. eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  38. eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  39. eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  40. eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  41. eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  42. eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  43. eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  44. eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  45. eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  46. eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  47. eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  48. eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  49. eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  50. latest_checkpointed_iteration.txt +1 -0
eval_results/eval_results.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,aime24_acc,aime24_pass_acc,aime24_tokens,aime24_keywords,aime24_correct_tokens,aime24_wrong_tokens,aime24_clip_ratio,aime24_stop_tokens,aime24_stop_ratio,aime24_box_ratio,aime24_repeat_ratio,aime25_acc,aime25_pass_acc,aime25_tokens,aime25_keywords,aime25_correct_tokens,aime25_wrong_tokens,aime25_clip_ratio,aime25_stop_tokens,aime25_stop_ratio,aime25_box_ratio,aime25_repeat_ratio,amc23_acc,amc23_pass_acc,amc23_tokens,amc23_keywords,amc23_correct_tokens,amc23_wrong_tokens,amc23_clip_ratio,amc23_stop_tokens,amc23_stop_ratio,amc23_box_ratio,amc23_repeat_ratio,gsm8k_acc,gsm8k_pass_acc,gsm8k_tokens,gsm8k_keywords,gsm8k_correct_tokens,gsm8k_wrong_tokens,gsm8k_clip_ratio,gsm8k_stop_tokens,gsm8k_stop_ratio,gsm8k_box_ratio,gsm8k_repeat_ratio,math500_acc,math500_pass_acc,math500_tokens,math500_keywords,math500_correct_tokens,math500_wrong_tokens,math500_clip_ratio,math500_stop_tokens,math500_stop_ratio,math500_box_ratio,math500_repeat_ratio,minerva_math_acc,minerva_math_pass_acc,minerva_math_tokens,minerva_math_keywords,minerva_math_correct_tokens,minerva_math_wrong_tokens,minerva_math_clip_ratio,minerva_math_stop_tokens,minerva_math_stop_ratio,minerva_math_box_ratio,minerva_math_repeat_ratio,mmlu_stem_acc,mmlu_stem_pass_acc,mmlu_stem_tokens,mmlu_stem_keywords,mmlu_stem_correct_tokens,mmlu_stem_wrong_tokens,mmlu_stem_clip_ratio,mmlu_stem_stop_tokens,mmlu_stem_stop_ratio,mmlu_stem_box_ratio,mmlu_stem_repeat_ratio,olympiadbench_acc,olympiadbench_pass_acc,olympiadbench_tokens,olympiadbench_keywords,olympiadbench_correct_tokens,olympiadbench_wrong_tokens,olympiadbench_clip_ratio,olympiadbench_stop_tokens,olympiadbench_stop_ratio,olympiadbench_box_ratio,olympiadbench_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
2
+ eval_results-global_step_0,6.7,6.7,3019.5,0.3333333333333333,735.0,3182.6785714285716,0.13333333333333333,1029.8076923076924,0.8666666666666667,0.8,0.7666666666666667,0.0,0.0,1676.9333333333334,0.6666666666666666,0.0,1676.9333333333334,0.03333333333333333,1183.1379310344828,0.9666666666666667,0.8333333333333334,0.7,20.0,20.0,1145.575,0.525,629.0,1274.71875,0.025,764.8461538461538,0.975,0.75,0.625,60.1,60.1,1106.6868840030327,0.1281273692191054,583.8991172761664,1894.8441064638782,0.04245640636846096,300.1900237529691,0.9575435936315391,0.7604245640636846,0.2805155420773313,44.2,44.2,1192.764,0.174,558.1538461538462,1695.448028673835,0.034,568.2008281573499,0.966,0.832,0.498,10.3,10.3,1055.7757352941176,0.1875,476.75,1122.22131147541,0.025735294117647058,570.4867924528302,0.9742647058823529,0.8014705882352942,0.4485294117647059,40.7,40.7,803.4662027833002,0.5062955599734924,470.83224755700326,1031.664245810056,0.027833001988071572,310.7338104976142,0.9721669980119284,0.6093439363817097,0.47746852220013253,16.0,16.0,1588.5762962962963,0.6014814814814815,697.3425925925926,1758.3350970017636,0.04,920.7376543209876,0.96,0.8444444444444444,0.6488888888888888,24.75,24.75,1448.65968146376,0.39030055133425984,518.872225447451,1704.605430523356,0.04521142114260578,706.01761079626,0.9547885788573943,0.7788771083073083,0.5556336289497157
3
+ eval_results-global_step_10,10.0,10.0,2279.0666666666666,0.6666666666666666,695.0,2455.074074074074,0.06666666666666667,1299.0714285714287,0.9333333333333333,0.9,0.7666666666666667,0.0,0.0,1796.1333333333334,0.7333333333333333,0.0,1796.1333333333334,0.06666666666666667,787.1428571428571,0.9333333333333333,0.9333333333333333,0.7333333333333333,32.5,32.5,757.125,0.075,631.0,817.8518518518518,0.0,757.125,1.0,0.975,0.65,73.2,73.2,399.20697498104624,0.3169067475360121,301.72256728778467,665.9773371104816,0.006823351023502654,291.775572519084,0.9931766489764974,0.9651250947687642,0.22062168309325247,55.6,55.6,866.892,0.292,569.2805755395683,1239.5765765765766,0.014,651.2251521298174,0.986,0.972,0.498,19.1,19.1,849.4227941176471,0.16176470588235295,460.71153846153845,941.3,0.014705882352941176,619.0559701492538,0.9852941176470589,0.9522058823529411,0.5036764705882353,42.4,42.4,447.33233929754806,0.5738899933730948,342.63956215793587,524.3317998849914,0.005632869449966865,353.65511496167943,0.9943671305500331,0.8137839628893306,0.538104705102717,22.1,22.1,1585.842962962963,1.7155555555555555,901.751677852349,1779.6254752851712,0.044444444444444446,915.7240310077519,0.9555555555555556,0.9348148148148148,0.6488888888888888,31.8625,31.8625,1122.6277589199005,0.5668896252933768,487.76324016239704,1277.4838060145598,0.02736748507552356,709.3468908102342,0.9726325149244766,0.930782886019898,0.5699114684591368
4
+ eval_results-global_step_20,13.3,13.3,1808.5,0.5,937.5,1942.5,0.06666666666666667,794.8928571428571,0.9333333333333333,0.9333333333333333,0.6,0.0,0.0,1484.0666666666666,1.2666666666666666,0.0,1484.0666666666666,0.03333333333333333,983.5172413793103,0.9666666666666667,0.9333333333333333,0.5666666666666667,40.0,40.0,1964.375,3.875,716.5625,2796.25,0.05,1225.5526315789473,0.95,0.9,0.65,78.8,78.8,328.92115238817286,0.0932524639878696,289.390760346487,475.60714285714283,0.002274450341167551,293.1367781155015,0.9977255496588324,0.9954510993176648,0.2304776345716452,60.6,60.6,852.284,0.154,456.6237623762376,1460.8375634517768,0.018,575.7942973523421,0.982,0.978,0.494,26.5,26.5,1103.0367647058824,0.5220588235294118,677.6527777777778,1256.175,0.029411764705882353,649.344696969697,0.9705882352941176,0.9448529411764706,0.47058823529411764,46.2,46.2,449.41451292246524,0.388336646785951,320.4989231873654,559.924923076923,0.004970178926441352,370.5181485181485,0.9950298210735586,0.9367130550033135,0.572233267064281,25.5,25.5,1283.6192592592593,0.3348148148148148,668.7790697674419,1493.8628230616303,0.02962962962962963,834.2213740458016,0.9703703703703703,0.957037037037037,0.6607407407407407,36.3625,36.3625,1159.2771694928058,0.8917661769730892,508.3759741819137,1433.6530148892675,0.029285752950390112,715.8722531378257,0.9707142470496097,0.9473400999001441,0.5305883180421814
5
+ eval_results-global_step_30,3.3,3.3,1949.4333333333334,7.566666666666666,497.0,1999.5172413793102,0.03333333333333333,1464.896551724138,0.9666666666666667,0.9333333333333333,0.7,3.3,3.3,1339.3333333333333,0.3,796.0,1358.0689655172414,0.03333333333333333,833.7931034482758,0.9666666666666667,0.9666666666666667,0.7666666666666667,32.5,32.5,1179.95,0.2,681.5384615384615,1419.9259259259259,0.025,799.9487179487179,0.975,0.975,0.625,79.6,79.6,312.6406368460955,0.04473085670962851,289.2419047619048,403.97397769516726,0.000758150113722517,300.73672230652505,0.9992418498862775,0.9984836997725549,0.21531463229719486,60.8,60.8,738.374,0.152,444.5986842105263,1194.0255102040817,0.01,584.2161616161616,0.99,0.988,0.458,26.8,26.8,713.0183823529412,0.1323529411764706,433.8082191780822,815.4422110552764,0.007352941176470588,599.7777777777778,0.9926470588235294,0.9889705882352942,0.45955882352941174,49.5,49.5,472.2107355864811,0.6500994035785288,370.5344943067649,571.7534426229508,0.006295559973492379,373.754918306102,0.9937044400265076,0.9678595096090126,0.5868124585818423,27.0,27.0,1342.037037037037,1.2385185185185186,701.0824175824176,1578.657200811359,0.03259259259259259,848.5742725880551,0.9674074074074074,0.957037037037037,0.64,35.35,35.35,1005.8746823111526,1.2855460483312267,526.7255226972696,1167.6705594014143,0.018583238815368094,725.7122782144692,0.981416761184632,0.9719188543317373,0.5564190726343895
6
+ eval_results-global_step_40,13.3,13.3,1408.0,0.43333333333333335,995.5,1471.4615384615386,0.03333333333333333,904.9655172413793,0.9666666666666667,0.9666666666666667,0.7666666666666667,6.7,6.7,1300.8,0.5333333333333333,723.5,1342.0357142857142,0.03333333333333333,793.9310344827586,0.9666666666666667,0.9666666666666667,0.6666666666666666,32.5,32.5,1382.45,0.175,625.4615384615385,1746.9259259259259,0.025,1007.6666666666666,0.975,0.95,0.75,80.3,80.3,329.40333586050036,0.032600454890068235,298.47119924457036,455.39230769230767,0.002274450341167551,293.6193009118541,0.9977255496588324,0.9962092494313874,0.23654283548142532,61.8,61.8,747.13,0.166,477.15210355987057,1183.9005235602094,0.012,561.8846153846154,0.988,0.986,0.516,28.3,28.3,589.8860294117648,0.09558823529411764,479.3766233766234,633.5230769230769,0.0,589.8860294117648,1.0,0.9926470588235294,0.4485294117647059,51.8,51.8,405.1265738899934,0.30947647448641485,341.3282149712092,473.66048109965635,0.0016567263088137839,379.07036176568204,0.9983432736911863,0.9744864148442677,0.583167660702452,26.4,26.4,1314.237037037037,0.5081481481481481,649.0786516853933,1552.4627766599597,0.028148148148148148,888.9100609756098,0.9718518518518519,0.957037037037037,0.6592592592592592,37.6375,37.6375,934.629122024912,0.28168499743567693,573.7335414124007,1107.4202930760484,0.01696824893309952,677.4916983550413,0.9830317510669005,0.9737141366836944,0.578354062567647
7
+ eval_results-global_step_50,6.7,6.7,1430.3333333333333,0.5,585.5,1490.6785714285713,0.0,1430.3333333333333,1.0,0.9333333333333333,0.6333333333333333,0.0,0.0,1364.9666666666667,0.4,0.0,1364.9666666666667,0.03333333333333333,860.3448275862069,0.9666666666666667,0.9666666666666667,0.8,37.5,37.5,1020.95,0.175,1359.5333333333333,817.8,0.0,1020.95,1.0,1.0,0.725,80.4,80.4,298.8726307808946,0.04245640636846096,285.87264150943395,352.0772200772201,0.0,298.8726307808946,1.0,0.9992418498862775,0.23881728582259287,65.2,65.2,584.506,0.136,461.34355828220856,815.2586206896551,0.002,553.6172344689379,0.998,0.994,0.472,27.6,27.6,711.5808823529412,0.08088235294117647,489.24,796.2284263959391,0.003676470588235294,656.2361623616237,0.9963235294117647,0.9816176470588235,0.48161764705882354,54.4,54.4,432.64546056991384,0.35785288270377735,364.20876445526477,514.4210909090909,0.0023194168323392977,396.60146130853536,0.9976805831676607,0.9797879390324719,0.6050364479787939,25.9,25.9,1327.7288888888888,0.8311111111111111,623.6114285714285,1574.17,0.03259259259259259,833.4211332312404,0.9674074074074074,0.96,0.6844444444444444,37.2125,37.2125,896.4479828240799,0.3154128441405657,521.1637157689586,965.7000745208928,0.009240226668312566,756.2970978838465,0.9907597733316874,0.9768309294971966,0.5800311448297484
8
+ eval_results-global_step_60,10.0,10.0,2341.7,5.233333333333333,706.6666666666666,2523.3703703703704,0.1,824.2222222222222,0.9,0.9,0.7666666666666667,0.0,0.0,2022.7,8.1,0.0,2022.7,0.06666666666666667,1024.5357142857142,0.9333333333333333,0.9,0.7333333333333333,37.5,37.5,799.15,0.3,637.4,896.2,0.0,799.15,1.0,1.0,0.7,81.7,81.7,326.41167551175135,0.04397270659590599,288.63450834879404,495.3900414937759,0.001516300227445034,302.6993166287016,0.9984836997725549,0.9977255496588324,0.2486732373009856,63.6,63.6,838.988,1.032,517.559748427673,1400.6043956043957,0.016,592.469512195122,0.984,0.978,0.446,29.4,29.4,742.5955882352941,0.15441176470588236,507.2375,840.6614583333334,0.007352941176470588,629.574074074074,0.9926470588235294,0.9816176470588235,0.5404411764705882,55.0,55.0,411.8542080848244,0.3260437375745527,363.8740204942737,470.4260485651214,0.0016567263088137839,385.39163624294724,0.9983432736911863,0.9840954274353877,0.6080185553346588,25.5,25.5,1371.0844444444444,0.8429629629629629,653.5581395348837,1616.441351888668,0.028148148148148148,947.9817073170732,0.9718518518518519,0.9614814814814815,0.6577777777777778,37.8375,37.8375,1106.810489534539,2.0040905631465797,459.3663229340364,1283.2242082819582,0.02766759781594303,688.2530228707318,0.972332402184057,0.9628650132043155,0.5876138433605014
9
+ eval_results-global_step_70,13.3,13.3,1322.0,0.3,798.5,1402.5384615384614,0.03333333333333333,815.8275862068965,0.9666666666666667,0.9666666666666667,0.8,3.3,3.3,1323.9666666666667,0.3,577.0,1349.7241379310344,0.03333333333333333,817.9310344827586,0.9666666666666667,0.9666666666666667,0.7333333333333333,42.5,42.5,975.925,0.275,758.8235294117648,1136.391304347826,0.0,975.925,1.0,0.975,0.65,80.4,80.4,329.9598180439727,0.10993176648976498,288.94910461828465,498.61240310077517,0.001516300227445034,305.8435839028094,0.9984836997725549,0.9969673995451099,0.2767247915087187,64.6,64.6,667.506,0.126,469.3003095975232,1029.2033898305085,0.006,574.9597585513078,0.994,0.992,0.51,27.2,27.2,669.1801470588235,0.09191176470588236,473.4189189189189,742.3434343434343,0.003676470588235294,612.5756457564576,0.9963235294117647,0.9963235294117647,0.4632352941176471,58.4,58.4,413.4642147117296,0.30649436713055006,348.0533182076007,505.35219123505976,0.0016567263088137839,387.6435446398938,0.9983432736911863,0.9804506295559974,0.6060304837640822,29.5,29.5,1452.2948148148148,0.6014814814814815,747.4572864321608,1746.9642857142858,0.037037037037037035,892.896923076923,0.9629629629629629,0.9481481481481482,0.674074074074074,39.9,39.9,894.2870826620008,0.26385242247595986,557.6878083982816,1051.3912010051731,0.014569150103524726,672.9503845771308,0.9854308498964752,0.9777778799992942,0.5891747470997319
10
+ eval_results-global_step_80,6.7,6.7,2953.866666666667,1.1333333333333333,615.0,3120.9285714285716,0.1,1504.2592592592594,0.9,0.8333333333333334,0.8,3.3,3.3,1305.2666666666667,0.5666666666666667,1169.0,1309.9655172413793,0.03333333333333333,798.551724137931,0.9666666666666667,0.9666666666666667,0.6,32.5,32.5,798.125,0.15,645.6153846153846,871.5555555555555,0.0,798.125,1.0,1.0,0.575,81.8,81.8,325.4192570128886,0.18726307808946172,290.23911028730305,483.5833333333333,0.001516300227445034,301.7524677296887,0.9984836997725549,0.9969673995451099,0.2721758908263836,63.4,63.4,633.504,0.194,466.9589905362776,922.0,0.004,571.7911646586346,0.996,0.994,0.514,29.4,29.4,824.1213235294117,0.1323529411764706,478.95,967.9427083333334,0.014705882352941176,598.044776119403,0.9852941176470589,0.9816176470588235,0.5257352941176471,57.9,57.9,431.8681245858184,0.3601722995361166,364.87686139747996,523.8231132075472,0.0023194168323392977,395.67519096645634,0.9976805831676607,0.9840954274353877,0.6156394963552021,29.9,29.9,1530.1659259259259,0.2325925925925926,675.1732673267327,1895.3002114164906,0.04296296296296296,880.6222910216718,0.957037037037037,0.9451851851851852,0.7051851851851851,38.1125,38.1125,1100.2921205484222,0.36954761392433016,588.2267017703972,1261.8873763145264,0.024854736963627725,731.1027342366306,0.9751452630363723,0.9627332074030632,0.5759669833105523
11
+ eval_results-global_step_90,3.3,3.3,1737.5333333333333,0.3,494.0,1780.4137931034484,0.03333333333333333,1245.7241379310344,0.9666666666666667,0.9333333333333333,0.8333333333333334,0.0,0.0,1362.0666666666666,0.7666666666666667,0.0,1362.0666666666666,0.03333333333333333,857.3103448275862,0.9666666666666667,0.9666666666666667,0.6666666666666666,37.5,37.5,1282.825,0.225,699.2,1633.0,0.0,1282.825,1.0,0.975,0.75,83.3,83.3,301.37831690674756,0.0310841546626232,291.7424931756142,349.51363636363635,0.0,301.37831690674756,1.0,0.9984836997725549,0.2608036391205459,66.4,66.4,700.87,0.19,464.9909638554217,1167.0119047619048,0.008,577.4798387096774,0.992,0.988,0.502,29.8,29.8,688.6691176470588,0.15073529411764705,496.037037037037,770.3612565445026,0.003676470588235294,632.1771217712177,0.9963235294117647,0.9889705882352942,0.5183823529411765,58.5,58.5,446.71736249171636,0.34956925115970844,373.07809847198644,550.7306155075939,0.0016567263088137839,421.0016594756057,0.9983432736911863,0.9831013916500994,0.6222664015904572,29.6,29.6,1326.8562962962963,0.25333333333333335,697.675,1591.7747368421053,0.025185185185185185,947.7507598784194,0.9748148148148148,0.9629629629629629,0.6755555555555556,38.550000000000004,38.550000000000004,980.8645116677272,0.28329858749249737,439.5904490675074,1150.6090762237322,0.013148131093612615,783.2058974375359,0.9868518689063874,0.9745648303276139,0.6036259936509669
12
+ eval_results-global_step_100,3.3,3.3,2221.0333333333333,0.3,438.0,2282.5172413793102,0.06666666666666667,1236.8214285714287,0.9333333333333333,0.9,0.7,3.3,3.3,1094.7,0.43333333333333335,666.0,1109.4827586206898,0.0,1094.7,1.0,0.9666666666666667,0.7333333333333333,42.5,42.5,1184.55,0.175,707.8235294117648,1536.9130434782608,0.025,804.6410256410256,0.975,0.975,0.775,82.7,82.7,319.9529946929492,0.026535253980288095,285.84509624197983,483.1622807017544,0.001516300227445034,296.14047076689445,0.9984836997725549,0.9977255496588324,0.25928733889310085,65.2,65.2,781.244,0.202,463.23312883435585,1377.057471264368,0.01,627.5232323232324,0.99,0.982,0.496,30.1,30.1,739.9816176470588,0.13970588235294118,564.4268292682926,815.7473684210527,0.007352941176470588,626.9666666666667,0.9926470588235294,0.9852941176470589,0.5367647058823529,59.1,59.1,444.0457256461233,0.4665341285619616,382.1390914189568,533.4218623481781,0.002982107355864811,397.82718511133265,0.9970178926441352,0.9827700463883366,0.6219350563286945,30.4,30.4,1329.7333333333333,0.2607407407407407,693.0926829268293,1607.4170212765957,0.02666666666666667,927.9147640791476,0.9733333333333334,0.96,0.6977777777777778,39.574999999999996,39.574999999999996,1014.4051255815997,0.2504811673711581,525.0700447627725,1218.2148809362764,0.01752308526163922,751.5668466449661,0.9824769147383607,0.9686820475451118,0.6025122765269073
eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 6.7,
7
+ "pass_acc": 6.7,
8
+ "pass@k": {
9
+ "1": 6.7
10
+ },
11
+ "time_use_in_second": 158.15379571914673,
12
+ "time_use_in_minite": "2:38"
13
+ }
eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 2,
6
+ "acc": 0.0,
7
+ "pass_acc": 0.0,
8
+ "pass@k": {
9
+ "1": 0.0
10
+ },
11
+ "time_use_in_second": 139.49037170410156,
12
+ "time_use_in_minite": "2:19"
13
+ }
eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 4,
6
+ "acc": 20.0,
7
+ "pass_acc": 20.0,
8
+ "pass@k": {
9
+ "1": 20.0
10
+ },
11
+ "time_use_in_second": 136.22078156471252,
12
+ "time_use_in_minite": "2:16"
13
+ }
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 86,
6
+ "acc": 60.1,
7
+ "pass_acc": 60.1,
8
+ "pass@k": {
9
+ "1": 60.1
10
+ },
11
+ "time_use_in_second": 667.6512560844421,
12
+ "time_use_in_minite": "11:07"
13
+ }
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 27,
6
+ "acc": 44.2,
7
+ "pass_acc": 44.2,
8
+ "pass@k": {
9
+ "1": 44.2
10
+ },
11
+ "time_use_in_second": 268.09369802474976,
12
+ "time_use_in_minite": "4:28"
13
+ }
eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 23,
6
+ "acc": 10.3,
7
+ "pass_acc": 10.3,
8
+ "pass@k": {
9
+ "1": 10.3
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 27.1,
13
+ "Dynamics and Control (2.003 Spring 2005)": 23.1,
14
+ "Ecology I (1.018J Fall 2009)": 0.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 3.8,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 3.1,
18
+ "Physical Chemistry (5.61 Fall 2017)": 0.0,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 16.7,
20
+ "Relativity (8.033 Fall 2006)": 0.0
21
+ },
22
+ "time_use_in_second": 192.75555324554443,
23
+ "time_use_in_minite": "3:12"
24
+ }
eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 3,
5
+ "empty_samples": 6,
6
+ "acc": 40.7,
7
+ "pass_acc": 40.7,
8
+ "pass@k": {
9
+ "1": 40.7
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 41.0,
13
+ "astronomy": 55.3,
14
+ "college_biology": 47.9,
15
+ "college_chemistry": 29.0,
16
+ "college_computer_science": 35.0,
17
+ "college_mathematics": 22.0,
18
+ "college_physics": 33.3,
19
+ "computer_security": 41.0,
20
+ "conceptual_physics": 52.3,
21
+ "electrical_engineering": 44.1,
22
+ "elementary_mathematics": 38.9,
23
+ "high_school_biology": 49.7,
24
+ "high_school_chemistry": 45.8,
25
+ "high_school_computer_science": 47.0,
26
+ "high_school_mathematics": 20.7,
27
+ "high_school_physics": 31.8,
28
+ "high_school_statistics": 45.4,
29
+ "machine_learning": 38.4
30
+ },
31
+ "time_use_in_second": 1042.9115931987762,
32
+ "time_use_in_minite": "17:22"
33
+ }
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 27,
6
+ "acc": 16.0,
7
+ "pass_acc": 16.0,
8
+ "pass@k": {
9
+ "1": 16.0
10
+ },
11
+ "time_use_in_second": 441.98529720306396,
12
+ "time_use_in_minite": "7:21"
13
+ }
eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 10.0,
7
+ "pass_acc": 10.0,
8
+ "pass@k": {
9
+ "1": 10.0
10
+ },
11
+ "time_use_in_second": 152.91111731529236,
12
+ "time_use_in_minite": "2:32"
13
+ }
eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 0.0,
7
+ "pass_acc": 0.0,
8
+ "pass@k": {
9
+ "1": 0.0
10
+ },
11
+ "time_use_in_second": 149.9012632369995,
12
+ "time_use_in_minite": "2:29"
13
+ }
eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 32.5,
7
+ "pass_acc": 32.5,
8
+ "pass@k": {
9
+ "1": 32.5
10
+ },
11
+ "time_use_in_second": 11.931201934814453,
12
+ "time_use_in_minite": "0:11"
13
+ }
eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 7,
6
+ "acc": 73.2,
7
+ "pass_acc": 73.2,
8
+ "pass@k": {
9
+ "1": 73.2
10
+ },
11
+ "time_use_in_second": 257.0179879665375,
12
+ "time_use_in_minite": "4:17"
13
+ }
eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 3,
6
+ "acc": 55.6,
7
+ "pass_acc": 55.6,
8
+ "pass@k": {
9
+ "1": 55.6
10
+ },
11
+ "time_use_in_second": 223.20129466056824,
12
+ "time_use_in_minite": "3:43"
13
+ }
eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 3,
5
+ "empty_samples": 8,
6
+ "acc": 19.1,
7
+ "pass_acc": 19.1,
8
+ "pass@k": {
9
+ "1": 19.1
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 43.8,
13
+ "Dynamics and Control (2.003 Spring 2005)": 23.1,
14
+ "Ecology I (1.018J Fall 2009)": 0.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 11.3,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 9.3,
18
+ "Physical Chemistry (5.61 Fall 2017)": 9.1,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 44.4,
20
+ "Relativity (8.033 Fall 2006)": 0.0
21
+ },
22
+ "time_use_in_second": 176.7901575565338,
23
+ "time_use_in_minite": "2:56"
24
+ }
eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 5,
5
+ "empty_samples": 5,
6
+ "acc": 42.4,
7
+ "pass_acc": 42.4,
8
+ "pass@k": {
9
+ "1": 42.4
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 31.0,
13
+ "astronomy": 50.7,
14
+ "college_biology": 55.6,
15
+ "college_chemistry": 30.0,
16
+ "college_computer_science": 39.0,
17
+ "college_mathematics": 24.0,
18
+ "college_physics": 34.3,
19
+ "computer_security": 53.0,
20
+ "conceptual_physics": 57.0,
21
+ "electrical_engineering": 41.4,
22
+ "elementary_mathematics": 38.9,
23
+ "high_school_biology": 58.4,
24
+ "high_school_chemistry": 48.8,
25
+ "high_school_computer_science": 44.0,
26
+ "high_school_mathematics": 19.6,
27
+ "high_school_physics": 35.1,
28
+ "high_school_statistics": 43.5,
29
+ "machine_learning": 40.2
30
+ },
31
+ "time_use_in_second": 448.41801929473877,
32
+ "time_use_in_minite": "7:28"
33
+ }
eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 2,
6
+ "acc": 22.1,
7
+ "pass_acc": 22.1,
8
+ "pass@k": {
9
+ "1": 22.1
10
+ },
11
+ "time_use_in_second": 463.8451635837555,
12
+ "time_use_in_minite": "7:43"
13
+ }
eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 3.3
10
+ },
11
+ "time_use_in_second": 150.60272455215454,
12
+ "time_use_in_minite": "2:30"
13
+ }
eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 3.3
10
+ },
11
+ "time_use_in_second": 40.9404182434082,
12
+ "time_use_in_minite": "0:40"
13
+ }
eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 42.5,
7
+ "pass_acc": 42.5,
8
+ "pass@k": {
9
+ "1": 42.5
10
+ },
11
+ "time_use_in_second": 135.6712465286255,
12
+ "time_use_in_minite": "2:15"
13
+ }
eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 82.7,
7
+ "pass_acc": 82.7,
8
+ "pass@k": {
9
+ "1": 82.7
10
+ },
11
+ "time_use_in_second": 213.1951560974121,
12
+ "time_use_in_minite": "3:33"
13
+ }
eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 65.2,
7
+ "pass_acc": 65.2,
8
+ "pass@k": {
9
+ "1": 65.2
10
+ },
11
+ "time_use_in_second": 212.29153633117676,
12
+ "time_use_in_minite": "3:32"
13
+ }
eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 30.1,
7
+ "pass_acc": 30.1,
8
+ "pass@k": {
9
+ "1": 30.1
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 60.4,
13
+ "Dynamics and Control (2.003 Spring 2005)": 42.3,
14
+ "Ecology I (1.018J Fall 2009)": 40.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 20.8,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 15.5,
18
+ "Physical Chemistry (5.61 Fall 2017)": 18.2,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 44.4,
20
+ "Relativity (8.033 Fall 2006)": 27.3
21
+ },
22
+ "time_use_in_second": 168.03026056289673,
23
+ "time_use_in_minite": "2:48"
24
+ }
eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 1,
6
+ "acc": 59.1,
7
+ "pass_acc": 59.1,
8
+ "pass@k": {
9
+ "1": 59.1
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 46.0,
13
+ "astronomy": 63.8,
14
+ "college_biology": 61.8,
15
+ "college_chemistry": 42.0,
16
+ "college_computer_science": 58.0,
17
+ "college_mathematics": 44.0,
18
+ "college_physics": 58.8,
19
+ "computer_security": 54.0,
20
+ "conceptual_physics": 57.0,
21
+ "electrical_engineering": 61.4,
22
+ "elementary_mathematics": 74.9,
23
+ "high_school_biology": 68.7,
24
+ "high_school_chemistry": 64.0,
25
+ "high_school_computer_science": 71.0,
26
+ "high_school_mathematics": 41.1,
27
+ "high_school_physics": 59.6,
28
+ "high_school_statistics": 55.6,
29
+ "machine_learning": 46.4
30
+ },
31
+ "time_use_in_second": 387.84250497817993,
32
+ "time_use_in_minite": "6:27"
33
+ }
eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 30.4,
7
+ "pass_acc": 30.4,
8
+ "pass@k": {
9
+ "1": 30.4
10
+ },
11
+ "time_use_in_second": 350.055180311203,
12
+ "time_use_in_minite": "5:50"
13
+ }
eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 13.3,
7
+ "pass_acc": 13.3,
8
+ "pass@k": {
9
+ "1": 13.3
10
+ },
11
+ "time_use_in_second": 154.05221939086914,
12
+ "time_use_in_minite": "2:34"
13
+ }
eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 0.0,
7
+ "pass_acc": 0.0,
8
+ "pass@k": {
9
+ "1": 0.0
10
+ },
11
+ "time_use_in_second": 142.2926208972931,
12
+ "time_use_in_minite": "2:22"
13
+ }
eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 40.0,
7
+ "pass_acc": 40.0,
8
+ "pass@k": {
9
+ "1": 40.0
10
+ },
11
+ "time_use_in_second": 159.4089334011078,
12
+ "time_use_in_minite": "2:39"
13
+ }
eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 78.8,
7
+ "pass_acc": 78.8,
8
+ "pass@k": {
9
+ "1": 78.8
10
+ },
11
+ "time_use_in_second": 217.77196097373962,
12
+ "time_use_in_minite": "3:37"
13
+ }
latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 100