bensondccnqwc commited on
Commit
5333af9
·
verified ·
1 Parent(s): 5cf1f9a

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. eval_results/eval_results.csv +12 -0
  2. eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  3. eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  4. eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  5. eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  6. eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  7. eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  8. eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  9. eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  10. eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  11. eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  12. eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  13. eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  14. eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  15. eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  16. eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  17. eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  18. eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  19. eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  20. eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  21. eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  22. eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  23. eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  24. eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  25. eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  26. eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  27. eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  28. eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  29. eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  30. eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  31. eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  32. eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  33. eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  34. eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  35. eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  36. eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  37. eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  38. eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  39. eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  40. eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  41. eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  42. eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  43. eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  44. eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  45. eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  46. eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  47. eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  48. eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  49. eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  50. latest_checkpointed_iteration.txt +1 -0
eval_results/eval_results.csv ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,aime24_acc,aime24_pass_acc,aime24_tokens,aime24_keywords,aime24_correct_tokens,aime24_wrong_tokens,aime24_clip_ratio,aime24_stop_tokens,aime24_stop_ratio,aime24_box_ratio,aime24_repeat_ratio,aime25_acc,aime25_pass_acc,aime25_tokens,aime25_keywords,aime25_correct_tokens,aime25_wrong_tokens,aime25_clip_ratio,aime25_stop_tokens,aime25_stop_ratio,aime25_box_ratio,aime25_repeat_ratio,amc23_acc,amc23_pass_acc,amc23_tokens,amc23_keywords,amc23_correct_tokens,amc23_wrong_tokens,amc23_clip_ratio,amc23_stop_tokens,amc23_stop_ratio,amc23_box_ratio,amc23_repeat_ratio,gsm8k_acc,gsm8k_pass_acc,gsm8k_tokens,gsm8k_keywords,gsm8k_correct_tokens,gsm8k_wrong_tokens,gsm8k_clip_ratio,gsm8k_stop_tokens,gsm8k_stop_ratio,gsm8k_box_ratio,gsm8k_repeat_ratio,math500_acc,math500_pass_acc,math500_tokens,math500_keywords,math500_correct_tokens,math500_wrong_tokens,math500_clip_ratio,math500_stop_tokens,math500_stop_ratio,math500_box_ratio,math500_repeat_ratio,minerva_math_acc,minerva_math_pass_acc,minerva_math_tokens,minerva_math_keywords,minerva_math_correct_tokens,minerva_math_wrong_tokens,minerva_math_clip_ratio,minerva_math_stop_tokens,minerva_math_stop_ratio,minerva_math_box_ratio,minerva_math_repeat_ratio,mmlu_stem_acc,mmlu_stem_pass_acc,mmlu_stem_tokens,mmlu_stem_keywords,mmlu_stem_correct_tokens,mmlu_stem_wrong_tokens,mmlu_stem_clip_ratio,mmlu_stem_stop_tokens,mmlu_stem_stop_ratio,mmlu_stem_box_ratio,mmlu_stem_repeat_ratio,olympiadbench_acc,olympiadbench_pass_acc,olympiadbench_tokens,olympiadbench_keywords,olympiadbench_correct_tokens,olympiadbench_wrong_tokens,olympiadbench_clip_ratio,olympiadbench_stop_tokens,olympiadbench_stop_ratio,olympiadbench_box_ratio,olympiadbench_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
2
+ eval_results-global_step_0,6.7,6.7,2572.5,1.7,1592.0,2642.535714285714,0.06666666666666667,1613.392857142857,0.9333333333333333,0.8333333333333334,0.6666666666666666,3.3,3.3,1406.2,0.26666666666666666,806.0,1426.896551724138,0.03333333333333333,902.8965517241379,0.9666666666666667,0.9333333333333333,0.7333333333333333,15.0,15.0,1081.575,0.375,596.5,1167.1764705882354,0.025,699.0512820512821,0.975,0.95,0.575,60.7,60.7,1000.2024260803639,0.1281273692191054,409.7178526841448,1913.2876447876447,0.03866565579984837,307.0118296529968,0.9613343442001516,0.7558756633813495,0.27369219105382864,44.2,44.2,1309.69,0.414,645.4977375565611,1835.8064516129032,0.04,681.3145833333333,0.96,0.868,0.518,13.6,13.6,1340.8419117647059,0.10661764705882353,465.8378378378378,1478.608510638298,0.04779411764705882,581.1891891891892,0.9522058823529411,0.7794117647058824,0.5183823529411765,38.9,38.9,758.0768721007289,2.856527501656726,440.28303495311167,960.1214092140922,0.025182239893969515,301.8956492182189,0.9748177601060305,0.6259111994698476,0.4701789264413519,17.5,17.5,1803.2651851851851,0.7777777777777778,647.6779661016949,2048.0754039497306,0.05333333333333334,940.4287949921753,0.9466666666666667,0.8355555555555556,0.6637037037037037,24.9875,24.9875,1409.043924391373,0.8280896202973874,700.4393036416686,1684.0635196000944,0.04124691833427625,753.3975921630238,0.9587530816657237,0.8226776062224127,0.5523696467675077
3
+ eval_results-global_step_10,3.3,3.3,2954.6666666666665,3.6,957.0,3023.551724137931,0.13333333333333333,956.6153846153846,0.8666666666666667,0.8333333333333334,0.7,0.0,0.0,1490.1666666666667,0.3333333333333333,0.0,1490.1666666666667,0.03333333333333333,989.8275862068965,0.9666666666666667,0.9666666666666667,0.6666666666666666,25.0,25.0,1565.225,0.375,688.7,1857.4,0.05,805.5,0.95,0.95,0.575,75.7,75.7,378.3176648976497,0.060652009097801364,323.64028056112227,548.3115264797508,0.004548900682335102,293.5757806549886,0.9954510993176648,0.9681576952236542,0.21910538286580744,55.0,55.0,907.604,0.362,572.08,1317.6888888888889,0.018,605.6211812627291,0.982,0.974,0.512,17.3,17.3,835.4411764705883,0.11397058823529412,474.1489361702128,910.9111111111112,0.011029411764705883,666.3382899628252,0.9889705882352942,0.9448529411764706,0.5183823529411765,44.2,44.2,494.6805831676607,0.5056328694499669,384.2119850187266,582.3071895424837,0.008614976805831677,348.01403743315507,0.9913850231941683,0.8475811795891319,0.5440689198144466,23.3,23.3,1390.84,0.9392592592592592,647.6496815286624,1616.092664092664,0.03111111111111111,922.6758409785933,0.9688888888888889,0.9496296296296296,0.6903703703703704,30.475,30.475,1252.117719733654,0.786231007421957,505.92886040984047,1418.3037213649372,0.0362463833788313,698.5210126393215,0.9637536166211687,0.9292776807023608,0.5531992115823084
4
+ eval_results-global_step_20,6.7,6.7,1934.4333333333334,0.36666666666666664,899.5,2008.357142857143,0.06666666666666667,929.8928571428571,0.9333333333333333,0.9,0.7,0.0,0.0,1106.6333333333334,0.6666666666666666,0.0,1106.6333333333334,0.0,1106.6333333333334,1.0,0.9666666666666667,0.6666666666666666,25.0,25.0,1133.575,0.225,554.5,1326.6,0.025,752.3846153846154,0.975,0.975,0.7,79.0,79.0,313.94162244124334,0.043214556482183475,290.58349328214973,401.8086642599278,0.001516300227445034,290.0493545937737,0.9984836997725549,0.9984836997725549,0.2168309325246399,60.0,60.0,822.962,0.586,441.4166666666667,1395.28,0.016,576.310975609756,0.984,0.982,0.488,25.0,25.0,663.2573529411765,0.8529411764705882,488.3970588235294,721.5441176470588,0.003676470588235294,606.6531365313654,0.9963235294117647,1.0,0.47794117647058826,46.7,46.7,425.74784625579855,0.5049701789264414,335.4446808510638,504.931592039801,0.0026507620941020544,384.2156146179402,0.9973492379058979,0.974155069582505,0.5931080185553347,22.4,22.4,1278.245925925926,0.7437037037037038,655.1920529801324,1457.7900763358778,0.02962962962962963,828.5267175572519,0.9703703703703703,0.9629629629629629,0.6696296296296296,33.099999999999994,33.099999999999994,959.8495517788513,0.4986453686145313,458.1292440754427,1115.3681158091429,0.01814247865075984,684.3333255963616,0.98185752134924,0.9699085498730862,0.5640220529808574
5
+ eval_results-global_step_30,3.3,3.3,2067.3333333333335,0.5,530.0,2120.344827586207,0.06666666666666667,1072.142857142857,0.9333333333333333,0.9,0.7,6.7,6.7,841.4666666666667,0.36666666666666664,985.0,831.2142857142857,0.0,841.4666666666667,1.0,1.0,0.7,42.5,42.5,729.225,0.15,709.3529411764706,743.9130434782609,0.0,729.225,1.0,1.0,0.55,80.8,80.8,300.79605761940866,0.11675511751326763,277.66322701688557,398.2648221343874,0.000758150113722517,288.97572078907433,0.9992418498862775,0.9977255496588324,0.21228203184230476,64.4,64.4,720.038,0.884,442.1801242236025,1222.6797752808989,0.012,534.8562753036438,0.988,0.988,0.504,25.4,25.4,896.3014705882352,0.14338235294117646,712.695652173913,958.7093596059113,0.01838235294117647,613.5992509363296,0.9816176470588235,0.9816176470588235,0.5110294117647058,49.9,49.9,424.80914512922465,0.4393638170974155,360.3063122923588,488.97091870456046,0.0026507620941020544,383.48305647840533,0.9973492379058979,0.9814446653412856,0.6080185553346588,27.4,27.4,1159.7955555555557,0.5866666666666667,705.2594594594594,1331.4061224489797,0.022222222222222223,822.6757575757575,0.9777777777777777,0.9733333333333334,0.6814814814814815,37.55,37.55,892.4706536115531,0.39835432761064915,590.3072145428363,1011.9378943691864,0.015335019254736243,660.8030731115917,0.9846649807452639,0.9777651494240345,0.5583514350528938
6
+ eval_results-global_step_40,10.0,10.0,1277.7,0.5666666666666667,695.6666666666666,1342.3703703703704,0.03333333333333333,770.0,0.9666666666666667,0.9666666666666667,0.8,3.3,3.3,932.4333333333333,0.8,1286.0,920.2413793103449,0.0,932.4333333333333,1.0,1.0,0.7333333333333333,32.5,32.5,1725.9,0.3,642.0769230769231,2247.740740740741,0.05,974.6578947368421,0.95,0.95,0.6,81.3,81.3,293.80288097043217,0.03866565579984837,283.2041006523765,340.0325203252033,0.0,293.80288097043217,1.0,0.9984836997725549,0.21531463229719486,65.0,65.0,688.378,0.214,453.68,1124.2457142857143,0.008,564.8991935483871,0.992,0.992,0.498,24.6,24.6,824.6507352941177,0.17647058823529413,440.8805970149254,950.0780487804878,0.014705882352941176,599.0485074626865,0.9852941176470589,0.9779411764705882,0.4117647058823529,52.1,52.1,419.0848243870113,0.6517561298873427,359.89560789306177,483.34623358673116,0.0023194168323392977,382.86217203586847,0.9976805831676607,0.9850894632206759,0.6209410205434063,27.7,27.7,1232.8503703703705,0.40444444444444444,666.7112299465241,1449.7930327868853,0.028148148148148148,803.4634146341464,0.9718518518518519,0.9674074074074074,0.6814814814814815,37.0625,37.0625,924.3500180444082,0.3940004356291995,603.5143906563097,1107.2310050233098,0.017063347583345247,665.1459245902122,0.9829366524166548,0.9796985516922365,0.5701043966922211
7
+ eval_results-global_step_50,3.3,3.3,1588.4666666666667,0.6333333333333333,412.0,1629.0344827586207,0.03333333333333333,1091.5172413793102,0.9666666666666667,0.9666666666666667,0.7333333333333333,3.3,3.3,2355.8333333333335,0.8,718.0,2412.310344827586,0.1,839.7407407407408,0.9,0.9,0.6333333333333333,30.0,30.0,1191.15,0.425,567.3333333333334,1458.5,0.025,811.4358974358975,0.975,0.975,0.725,83.2,83.2,320.13343442001513,0.05458680818802123,284.73497267759564,496.00452488687785,0.001516300227445034,296.50189825360667,0.9984836997725549,0.9977255496588324,0.21228203184230476,62.0,62.0,766.458,0.534,447.5483870967742,1286.7842105263157,0.012,581.4534412955466,0.988,0.986,0.496,25.0,25.0,638.1102941176471,0.27205882352941174,480.44117647058823,690.6666666666666,0.0,638.1102941176471,1.0,0.9926470588235294,0.5257352941176471,53.4,53.4,441.2972166998012,0.4363817097415507,368.9628022318661,524.3402135231316,0.0026507620941020544,399.94750830564783,0.9973492379058979,0.9837640821736249,0.6259111994698476,25.9,25.9,1318.0311111111112,0.7066666666666667,603.4,1568.152,0.03111111111111111,850.9449541284404,0.9688888888888889,0.9614814814814815,0.6755555555555556,35.762499999999996,35.762499999999996,1077.4350070435719,0.48275341768237295,485.3025839762697,1258.2240553986499,0.025701438345748943,688.7064969571046,0.974298561654251,0.9704106048505168,0.5783938434565028
8
+ eval_results-global_step_60,6.7,6.7,1348.7,0.7666666666666667,1269.0,1354.392857142857,0.03333333333333333,843.5172413793103,0.9666666666666667,0.9666666666666667,0.7666666666666667,0.0,0.0,1832.7666666666667,1.2333333333333334,0.0,1832.7666666666667,0.06666666666666667,820.8214285714286,0.9333333333333333,0.9333333333333333,0.7333333333333333,32.5,32.5,1494.675,0.5,703.2307692307693,1875.7407407407406,0.025,1122.8974358974358,0.975,0.95,0.675,84.1,84.1,345.07960576194085,0.1068991660348749,292.1352569882777,624.6761904761905,0.002274450341167551,309.38829787234044,0.9977255496588324,0.9969673995451099,0.2175890826383624,62.0,62.0,619.532,0.32,445.9548387096774,902.7368421052631,0.004,557.7630522088353,0.996,0.994,0.482,28.3,28.3,645.2977941176471,0.35661764705882354,463.15584415584414,717.2205128205128,0.0,645.2977941176471,1.0,1.0,0.47794117647058826,52.9,52.9,438.1070245195494,0.5215374420145792,370.58046336881654,513.997185080929,0.0019880715705765406,407.1085657370518,0.9980119284294234,0.9857521537442014,0.6278992710404241,27.1,27.1,1219.8651851851853,0.5822222222222222,645.6994535519126,1433.4268292682927,0.02666666666666667,814.1050228310502,0.9733333333333334,0.9748148148148148,0.6666666666666666,36.7,36.7,993.0029095313737,0.5484095596663124,523.7195782506623,1156.8697280376814,0.019991148572301345,690.1123548268874,0.9800088514276986,0.9751917960130158,0.5808870246020051
9
+ eval_results-global_step_70,10.0,10.0,1384.8,1.1666666666666667,1057.3333333333333,1421.1851851851852,0.03333333333333333,880.8620689655172,0.9666666666666667,0.9666666666666667,0.7333333333333333,3.3,3.3,884.4,1.2333333333333334,809.0,887.0,0.0,884.4,1.0,1.0,0.6333333333333333,35.0,35.0,764.05,0.625,720.8571428571429,787.3076923076923,0.0,764.05,1.0,1.0,0.65,83.0,83.0,339.6194086429113,0.08339651250947688,298.3808219178082,541.2098214285714,0.001516300227445034,315.8344722854973,0.9984836997725549,0.9977255496588324,0.2100075815011372,62.8,62.8,698.086,0.406,464.63057324840764,1092.1989247311828,0.008,574.6875,0.992,0.992,0.47,25.4,25.4,659.3198529411765,0.44485294117647056,500.3478260869565,713.3546798029556,0.0,659.3198529411765,1.0,1.0,0.4852941176470588,52.1,52.1,434.4761431411531,0.6139827700463883,385.64249363867685,487.56500691562934,0.0009940357852882703,418.9363184079602,0.9990059642147118,0.9867461895294898,0.633532140490391,27.0,27.0,1101.131851851852,0.6192592592592593,692.1923076923077,1252.0993914807302,0.01925925925925926,808.583081570997,0.9807407407407407,0.9792592592592593,0.717037037037037,37.325,37.325,783.2354070721367,0.6490614353739494,616.048062346829,897.7400877314934,0.007887866075665738,663.3341617713935,0.9921121339243343,0.990299708139281,0.5665671929177863
10
+ eval_results-global_step_80,6.7,6.7,889.2666666666667,1.2333333333333334,1081.5,875.5357142857143,0.0,889.2666666666667,1.0,1.0,0.6666666666666666,3.3,3.3,2337.366666666667,1.2666666666666666,608.0,2397.0,0.1,819.2962962962963,0.9,0.9,0.8,30.0,30.0,780.775,0.525,595.8333333333334,860.0357142857143,0.0,780.775,1.0,1.0,0.675,82.3,82.3,337.78999241849885,0.1417740712661107,302.170349907919,503.81115879828326,0.000758150113722517,325.90591805766314,0.9992418498862775,0.9984836997725549,0.23199393479909022,63.4,63.4,731.452,0.402,482.8391167192429,1162.1092896174864,0.01,577.2646464646465,0.99,0.99,0.498,26.1,26.1,738.2941176470588,1.3308823529411764,544.8028169014085,806.6417910447761,0.003676470588235294,682.0221402214022,0.9963235294117647,0.9963235294117647,0.5036764705882353,54.0,54.0,479.8141153081511,0.536779324055666,409.48741559238795,562.2922966162707,0.002982107355864811,433.39481555334,0.9970178926441352,0.9837640821736249,0.6520874751491054,27.0,27.0,1071.76,0.6607407407407407,809.9120879120879,1168.4259634888438,0.01925925925925926,778.6208459214502,0.9807407407407407,0.9807407407407407,0.6948148148148148,36.599999999999994,36.599999999999994,920.8148198383803,0.7621470611254617,604.3181400457976,1041.981491017136,0.017084498414635236,660.8182911476831,0.9829155015853647,0.9811640065123356,0.5902799202522391
11
+ eval_results-global_step_90,6.7,6.7,830.0,0.9333333333333333,729.0,837.2142857142857,0.0,830.0,1.0,1.0,0.8,0.0,0.0,1829.7333333333333,1.2,0.0,1829.7333333333333,0.06666666666666667,817.6428571428571,0.9333333333333333,0.9333333333333333,0.7,37.5,37.5,779.475,0.575,650.0666666666667,857.12,0.0,779.475,1.0,1.0,0.675,83.9,83.9,349.2661106899166,0.10462471569370735,307.58084914182473,566.933962264151,0.001516300227445034,325.4783599088838,0.9984836997725549,0.9977255496588324,0.22744503411675512,62.4,62.4,655.41,0.492,504.38461538461536,906.0478723404256,0.004,593.785140562249,0.996,0.996,0.496,25.7,25.7,745.4117647058823,0.5698529411764706,547.9571428571429,813.8366336633663,0.003676470588235294,689.1217712177122,0.9963235294117647,0.9926470588235294,0.5294117647058824,54.8,54.8,469.4456593770709,0.5848243870112657,409.3510574018127,542.4145267791636,0.0016567263088137839,443.9827414537006,0.9983432736911863,0.9840954274353877,0.6630218687872763,24.9,24.9,1079.4503703703704,0.682962962962963,633.2440476190476,1227.3057199211046,0.01925925925925926,786.3912386706949,0.9807407407407407,0.9792592592592593,0.7022222222222222,36.9875,36.9875,842.2740298095717,0.6428247925222175,472.69804738388876,947.5757917519787,0.012096927881302504,658.2346386195122,0.9879030721186975,0.9853825785637926,0.5991376112290169
12
+ eval_results-global_step_100,6.7,6.7,857.5333333333333,0.8666666666666667,620.5,874.4642857142857,0.0,857.5333333333333,1.0,1.0,0.8666666666666667,3.3,3.3,1305.9666666666667,1.2,952.0,1318.1724137931035,0.03333333333333333,799.2758620689655,0.9666666666666667,0.9666666666666667,0.8666666666666667,30.0,30.0,1520.1,0.625,640.75,1896.9642857142858,0.05,757.921052631579,0.95,0.95,0.75,83.9,83.9,347.14632297194845,0.08188021228203184,307.03071364046974,556.6179245283018,0.001516300227445034,323.37433561123765,0.9984836997725549,0.9977255496588324,0.25094768764215314,62.8,62.8,637.76,0.39,468.63057324840764,923.2795698924731,0.004,576.0582329317269,0.996,0.994,0.492,27.6,27.6,677.7904411764706,0.47794117647058826,531.5333333333333,733.4720812182741,0.0,677.7904411764706,1.0,0.9963235294117647,0.5294117647058824,57.2,57.2,467.6640159045726,0.5702451954937044,416.536231884058,535.8739365815932,0.0009940357852882703,452.20995024875623,0.9990059642147118,0.9854208084824387,0.6603711066931743,25.5,25.5,1048.8177777777778,0.6562962962962963,647.953488372093,1185.892644135189,0.016296296296296295,801.1415662650602,0.9837037037037037,0.9792592592592593,0.7111111111111111,37.125,37.125,857.8473197288461,0.608503693401161,573.1167925597952,1003.0921426971881,0.013267495705295368,655.6630967833912,0.9867325042947046,0.9836744766848702,0.6408968754357068
eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 6.7,
7
+ "pass_acc": 6.7,
8
+ "pass@k": {
9
+ "1": 6.7
10
+ },
11
+ "time_use_in_second": 154.68396830558777,
12
+ "time_use_in_minite": "2:34"
13
+ }
eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 3.3
10
+ },
11
+ "time_use_in_second": 136.81052923202515,
12
+ "time_use_in_minite": "2:16"
13
+ }
eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 15.0,
7
+ "pass_acc": 15.0,
8
+ "pass@k": {
9
+ "1": 15.0
10
+ },
11
+ "time_use_in_second": 136.4170002937317,
12
+ "time_use_in_minite": "2:16"
13
+ }
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 106,
6
+ "acc": 60.7,
7
+ "pass_acc": 60.7,
8
+ "pass@k": {
9
+ "1": 60.7
10
+ },
11
+ "time_use_in_second": 628.8340845108032,
12
+ "time_use_in_minite": "10:28"
13
+ }
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 12,
6
+ "acc": 44.2,
7
+ "pass_acc": 44.2,
8
+ "pass@k": {
9
+ "1": 44.2
10
+ },
11
+ "time_use_in_second": 306.87160658836365,
12
+ "time_use_in_minite": "5:06"
13
+ }
eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 25,
6
+ "acc": 13.6,
7
+ "pass_acc": 13.6,
8
+ "pass@k": {
9
+ "1": 13.6
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 33.3,
13
+ "Dynamics and Control (2.003 Spring 2005)": 15.4,
14
+ "Ecology I (1.018J Fall 2009)": 0.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 0.0,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 3.8,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 9.3,
18
+ "Physical Chemistry (5.61 Fall 2017)": 9.1,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 27.8,
20
+ "Relativity (8.033 Fall 2006)": 0.0
21
+ },
22
+ "time_use_in_second": 215.69851899147034,
23
+ "time_use_in_minite": "3:35"
24
+ }
eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 4,
5
+ "empty_samples": 5,
6
+ "acc": 38.9,
7
+ "pass_acc": 38.9,
8
+ "pass@k": {
9
+ "1": 38.9
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 27.0,
13
+ "astronomy": 52.0,
14
+ "college_biology": 53.5,
15
+ "college_chemistry": 43.0,
16
+ "college_computer_science": 35.0,
17
+ "college_mathematics": 25.0,
18
+ "college_physics": 37.3,
19
+ "computer_security": 40.0,
20
+ "conceptual_physics": 40.4,
21
+ "electrical_engineering": 35.2,
22
+ "elementary_mathematics": 37.8,
23
+ "high_school_biology": 53.5,
24
+ "high_school_chemistry": 43.3,
25
+ "high_school_computer_science": 37.0,
26
+ "high_school_mathematics": 18.1,
27
+ "high_school_physics": 32.5,
28
+ "high_school_statistics": 41.2,
29
+ "machine_learning": 37.5
30
+ },
31
+ "time_use_in_second": 947.2400703430176,
32
+ "time_use_in_minite": "15:47"
33
+ }
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 4,
5
+ "empty_samples": 26,
6
+ "acc": 17.5,
7
+ "pass_acc": 17.5,
8
+ "pass@k": {
9
+ "1": 17.5
10
+ },
11
+ "time_use_in_second": 526.93394780159,
12
+ "time_use_in_minite": "8:46"
13
+ }
eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 3.3
10
+ },
11
+ "time_use_in_second": 157.95016050338745,
12
+ "time_use_in_minite": "2:37"
13
+ }
eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 0.0,
7
+ "pass_acc": 0.0,
8
+ "pass@k": {
9
+ "1": 0.0
10
+ },
11
+ "time_use_in_second": 138.09202575683594,
12
+ "time_use_in_minite": "2:18"
13
+ }
eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 25.0,
7
+ "pass_acc": 25.0,
8
+ "pass@k": {
9
+ "1": 25.0
10
+ },
11
+ "time_use_in_second": 151.25203156471252,
12
+ "time_use_in_minite": "2:31"
13
+ }
eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 9,
6
+ "acc": 75.7,
7
+ "pass_acc": 75.7,
8
+ "pass@k": {
9
+ "1": 75.7
10
+ },
11
+ "time_use_in_second": 242.95963263511658,
12
+ "time_use_in_minite": "4:02"
13
+ }
eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 2,
6
+ "acc": 55.0,
7
+ "pass_acc": 55.0,
8
+ "pass@k": {
9
+ "1": 55.0
10
+ },
11
+ "time_use_in_second": 233.9824345111847,
12
+ "time_use_in_minite": "3:53"
13
+ }
eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 7,
6
+ "acc": 17.3,
7
+ "pass_acc": 17.3,
8
+ "pass@k": {
9
+ "1": 17.3
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 31.2,
13
+ "Dynamics and Control (2.003 Spring 2005)": 26.9,
14
+ "Ecology I (1.018J Fall 2009)": 0.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 0.0,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 9.4,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 12.4,
18
+ "Physical Chemistry (5.61 Fall 2017)": 9.1,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 38.9,
20
+ "Relativity (8.033 Fall 2006)": 0.0
21
+ },
22
+ "time_use_in_second": 178.46332120895386,
23
+ "time_use_in_minite": "2:58"
24
+ }
eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 2,
5
+ "empty_samples": 4,
6
+ "acc": 44.2,
7
+ "pass_acc": 44.2,
8
+ "pass@k": {
9
+ "1": 44.2
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 26.0,
13
+ "astronomy": 59.9,
14
+ "college_biology": 66.7,
15
+ "college_chemistry": 32.0,
16
+ "college_computer_science": 42.0,
17
+ "college_mathematics": 27.0,
18
+ "college_physics": 34.3,
19
+ "computer_security": 47.0,
20
+ "conceptual_physics": 51.9,
21
+ "electrical_engineering": 48.3,
22
+ "elementary_mathematics": 39.7,
23
+ "high_school_biology": 60.0,
24
+ "high_school_chemistry": 48.3,
25
+ "high_school_computer_science": 55.0,
26
+ "high_school_mathematics": 21.9,
27
+ "high_school_physics": 35.8,
28
+ "high_school_statistics": 49.1,
29
+ "machine_learning": 34.8
30
+ },
31
+ "time_use_in_second": 517.9842488765717,
32
+ "time_use_in_minite": "8:37"
33
+ }
eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 1,
6
+ "acc": 23.3,
7
+ "pass_acc": 23.3,
8
+ "pass@k": {
9
+ "1": 23.3
10
+ },
11
+ "time_use_in_second": 384.54618430137634,
12
+ "time_use_in_minite": "6:24"
13
+ }
eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 6.7,
7
+ "pass_acc": 6.7,
8
+ "pass@k": {
9
+ "1": 6.7
10
+ },
11
+ "time_use_in_second": 12.105410814285278,
12
+ "time_use_in_minite": "0:12"
13
+ }
eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 3.3,
7
+ "pass_acc": 3.3,
8
+ "pass@k": {
9
+ "1": 3.3
10
+ },
11
+ "time_use_in_second": 133.4112937450409,
12
+ "time_use_in_minite": "2:13"
13
+ }
eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 40,
3
+ "num_scores": 40,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 30.0,
7
+ "pass_acc": 30.0,
8
+ "pass@k": {
9
+ "1": 30.0
10
+ },
11
+ "time_use_in_second": 151.70961046218872,
12
+ "time_use_in_minite": "2:31"
13
+ }
eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 83.9,
7
+ "pass_acc": 83.9,
8
+ "pass@k": {
9
+ "1": 83.9
10
+ },
11
+ "time_use_in_second": 210.28998517990112,
12
+ "time_use_in_minite": "3:30"
13
+ }
eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 62.8,
7
+ "pass_acc": 62.8,
8
+ "pass@k": {
9
+ "1": 62.8
10
+ },
11
+ "time_use_in_second": 193.88340139389038,
12
+ "time_use_in_minite": "3:13"
13
+ }
eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 2,
5
+ "empty_samples": 0,
6
+ "acc": 27.6,
7
+ "pass_acc": 27.6,
8
+ "pass@k": {
9
+ "1": 27.6
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 50.0,
13
+ "Dynamics and Control (2.003 Spring 2005)": 50.0,
14
+ "Ecology I (1.018J Fall 2009)": 60.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 33.3,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 11.3,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 16.5,
18
+ "Physical Chemistry (5.61 Fall 2017)": 18.2,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 55.6,
20
+ "Relativity (8.033 Fall 2006)": 0.0
21
+ },
22
+ "time_use_in_second": 34.49744939804077,
23
+ "time_use_in_minite": "0:34"
24
+ }
eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 57.2,
7
+ "pass_acc": 57.2,
8
+ "pass@k": {
9
+ "1": 57.2
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 42.0,
13
+ "astronomy": 61.8,
14
+ "college_biology": 61.1,
15
+ "college_chemistry": 58.0,
16
+ "college_computer_science": 54.0,
17
+ "college_mathematics": 34.0,
18
+ "college_physics": 61.8,
19
+ "computer_security": 57.0,
20
+ "conceptual_physics": 59.6,
21
+ "electrical_engineering": 50.3,
22
+ "elementary_mathematics": 70.6,
23
+ "high_school_biology": 64.8,
24
+ "high_school_chemistry": 57.6,
25
+ "high_school_computer_science": 69.0,
26
+ "high_school_mathematics": 37.4,
27
+ "high_school_physics": 57.0,
28
+ "high_school_statistics": 56.0,
29
+ "machine_learning": 53.6
30
+ },
31
+ "time_use_in_second": 373.4691824913025,
32
+ "time_use_in_minite": "6:13"
33
+ }
eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 25.5,
7
+ "pass_acc": 25.5,
8
+ "pass@k": {
9
+ "1": 25.5
10
+ },
11
+ "time_use_in_second": 274.0989546775818,
12
+ "time_use_in_minite": "4:34"
13
+ }
eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 6.7,
7
+ "pass_acc": 6.7,
8
+ "pass@k": {
9
+ "1": 6.7
10
+ },
11
+ "time_use_in_second": 156.29598665237427,
12
+ "time_use_in_minite": "2:36"
13
+ }
eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 30,
3
+ "num_scores": 30,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 0.0,
7
+ "pass_acc": 0.0,
8
+ "pass@k": {
9
+ "1": 0.0
10
+ },
11
+ "time_use_in_second": 44.255409717559814,
12
+ "time_use_in_minite": "0:44"
13
+ }
eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
latest_checkpointed_iteration.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 100