Add files using upload-large-folder tool
Browse files- eval_results/eval_results.csv +12 -0
- eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
- eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
- eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
- eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
- latest_checkpointed_iteration.txt +1 -0
eval_results/eval_results.csv
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
model,aime24_acc,aime24_pass_acc,aime24_tokens,aime24_keywords,aime24_correct_tokens,aime24_wrong_tokens,aime24_clip_ratio,aime24_stop_tokens,aime24_stop_ratio,aime24_box_ratio,aime24_repeat_ratio,aime25_acc,aime25_pass_acc,aime25_tokens,aime25_keywords,aime25_correct_tokens,aime25_wrong_tokens,aime25_clip_ratio,aime25_stop_tokens,aime25_stop_ratio,aime25_box_ratio,aime25_repeat_ratio,amc23_acc,amc23_pass_acc,amc23_tokens,amc23_keywords,amc23_correct_tokens,amc23_wrong_tokens,amc23_clip_ratio,amc23_stop_tokens,amc23_stop_ratio,amc23_box_ratio,amc23_repeat_ratio,gsm8k_acc,gsm8k_pass_acc,gsm8k_tokens,gsm8k_keywords,gsm8k_correct_tokens,gsm8k_wrong_tokens,gsm8k_clip_ratio,gsm8k_stop_tokens,gsm8k_stop_ratio,gsm8k_box_ratio,gsm8k_repeat_ratio,math500_acc,math500_pass_acc,math500_tokens,math500_keywords,math500_correct_tokens,math500_wrong_tokens,math500_clip_ratio,math500_stop_tokens,math500_stop_ratio,math500_box_ratio,math500_repeat_ratio,minerva_math_acc,minerva_math_pass_acc,minerva_math_tokens,minerva_math_keywords,minerva_math_correct_tokens,minerva_math_wrong_tokens,minerva_math_clip_ratio,minerva_math_stop_tokens,minerva_math_stop_ratio,minerva_math_box_ratio,minerva_math_repeat_ratio,mmlu_stem_acc,mmlu_stem_pass_acc,mmlu_stem_tokens,mmlu_stem_keywords,mmlu_stem_correct_tokens,mmlu_stem_wrong_tokens,mmlu_stem_clip_ratio,mmlu_stem_stop_tokens,mmlu_stem_stop_ratio,mmlu_stem_box_ratio,mmlu_stem_repeat_ratio,olympiadbench_acc,olympiadbench_pass_acc,olympiadbench_tokens,olympiadbench_keywords,olympiadbench_correct_tokens,olympiadbench_wrong_tokens,olympiadbench_clip_ratio,olympiadbench_stop_tokens,olympiadbench_stop_ratio,olympiadbench_box_ratio,olympiadbench_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
|
| 2 |
+
eval_results-global_step_0,3.3,3.3,2365.9666666666667,1.9,771.0,2420.9655172413795,0.06666666666666667,1219.9285714285713,0.9333333333333333,0.8666666666666667,0.7,0.0,0.0,1412.9666666666667,0.3333333333333333,0.0,1412.9666666666667,0.03333333333333333,909.9655172413793,0.9666666666666667,0.7666666666666667,0.7333333333333333,12.5,12.5,2523.025,0.875,674.0,2787.1714285714284,0.075,1429.7027027027027,0.925,0.825,0.7,61.7,61.7,973.6997725549659,0.5519332827899924,493.014742014742,1748.5069306930693,0.0401819560272934,292.5157977883096,0.9598180439727066,0.756633813495072,0.2812736921910538,49.0,49.0,1100.464,0.642,516.2244897959183,1661.792156862745,0.026,680.652977412731,0.974,0.87,0.528,15.4,15.4,1322.389705882353,0.1875,470.04761904761904,1478.0347826086957,0.03676470588235294,611.5305343511451,0.9632352941176471,0.8014705882352942,0.4632352941176471,40.1,40.1,653.6832339297548,0.39231278992710406,541.7877786952931,728.6723851687881,0.020212060967528166,314.221170104836,0.9797879390324719,0.6123260437375746,0.47647448641484424,16.7,16.7,1952.7348148148149,0.5022222222222222,1012.2831858407079,2141.829181494662,0.07111111111111111,862.5885167464115,0.9288888888888889,0.8325925925925926,0.682962962962963,24.8375,24.8375,1538.1162325644027,0.6730377035340815,559.7947269242851,1797.4923811634294,0.0461587292485357,790.1382234720108,0.9538412707514643,0.7914195464242334,0.5706599711274802
|
| 3 |
+
eval_results-global_step_10,13.3,13.3,1956.4,0.43333333333333335,752.5,2141.6153846153848,0.06666666666666667,953.2857142857143,0.9333333333333333,0.9333333333333333,0.8,10.0,10.0,2547.766666666667,0.4666666666666667,831.3333333333334,2738.4814814814813,0.1,1057.9259259259259,0.9,0.8666666666666667,0.7333333333333333,20.0,20.0,1110.875,0.3,637.0,1229.34375,0.025,729.0512820512821,0.975,0.975,0.675,74.5,74.5,319.5526914329037,0.037149355572403335,297.24643584521385,384.5519287833828,0.001516300227445034,295.13211845102506,0.9984836997725549,0.9605761940864291,0.2001516300227445,55.4,55.4,868.534,0.258,439.59205776173286,1401.3452914798206,0.016,622.5162601626016,0.984,0.968,0.51,20.6,20.6,720.3345588235294,0.16544117647058823,500.0,777.4583333333334,0.007352941176470588,607.1851851851852,0.9926470588235294,0.9044117647058824,0.5,43.3,43.3,489.50231941683234,0.4840954274353877,357.09709480122325,590.780701754386,0.00927766732935719,343.48896321070237,0.9907223326706428,0.8204108681245859,0.5268389662027833,24.3,24.3,1446.6888888888889,0.33925925925925926,782.7682926829268,1659.7671232876712,0.03111111111111111,974.0993883792049,0.9688888888888889,0.9392592592592592,0.6755555555555556,32.675,32.675,1182.4567656536028,0.3104931523422048,574.6921518030538,1365.4179993419325,0.032115585813881325,697.8356047064553,0.9678844141861187,0.9209572607720196,0.5776099356393021
|
| 4 |
+
eval_results-global_step_20,0.0,0.0,2715.6666666666665,2.433333333333333,0.0,2715.6666666666665,0.1,1245.3333333333333,0.9,0.8333333333333334,0.5333333333333333,3.3,3.3,1449.5666666666666,0.26666666666666666,1056.0,1463.1379310344828,0.03333333333333333,947.8275862068965,0.9666666666666667,0.9666666666666667,0.6666666666666666,37.5,37.5,1492.475,0.275,650.3333333333334,1997.76,0.05,727.1052631578947,0.95,0.95,0.6,79.5,79.5,335.4450341167551,0.44124336618650495,273.1078244274809,576.5129151291513,0.003032600454890068,287.8015209125475,0.9969673995451099,0.9946929492039424,0.2001516300227445,61.8,61.8,797.58,0.628,458.37540453074433,1346.34554973822,0.014,566.2535496957404,0.986,0.986,0.472,23.2,23.2,738.8786764705883,0.15073529411764705,458.1904761904762,823.4880382775119,0.007352941176470588,626.8851851851852,0.9926470588235294,0.9595588235294118,0.45588235294117646,46.5,46.5,454.8333333333333,0.4711729622266402,379.8851640513552,519.8564356435644,0.005301524188204109,360.0542971352432,0.9946984758117959,0.9310801855533466,0.5672630881378397,22.2,22.2,1331.4651851851852,0.3362962962962963,624.4,1533.4838095238094,0.03111111111111111,858.7217125382263,0.9688888888888889,0.9525925925925925,0.6651851851851852,34.25,34.25,1164.4888203048995,0.6253059898533859,487.53652531667376,1372.0314182516759,0.030516438783001157,702.4978060206333,0.9694835612169987,0.9467405688599118,0.5200602820358683
|
| 5 |
+
eval_results-global_step_30,3.3,3.3,1014.9333333333333,0.3333333333333333,536.0,1031.448275862069,0.0,1014.9333333333333,1.0,1.0,0.6666666666666666,0.0,0.0,1653.0333333333333,0.6,0.0,1653.0333333333333,0.03333333333333333,1158.3793103448277,0.9666666666666667,0.9333333333333333,0.7333333333333333,32.5,32.5,1289.15,0.3,694.2307692307693,1575.5925925925926,0.025,911.8974358974359,0.975,0.95,0.65,79.9,79.9,318.9347990902199,0.10386656557998483,283.0208728652751,461.77735849056603,0.001516300227445034,295.0911161731207,0.9984836997725549,0.9969673995451099,0.2259287338893101,62.2,62.2,703.146,0.134,520.3118971061093,1004.0,0.008,579.8548387096774,0.992,0.986,0.468,22.4,22.4,754.7573529411765,0.11764705882352941,461.9836065573771,839.3981042654028,0.007352941176470588,641.8444444444444,0.9926470588235294,0.9632352941176471,0.4632352941176471,48.5,48.5,484.98376408217365,0.4821073558648111,353.72385509227615,608.4778135048232,0.006295559973492379,380.6685561853951,0.9937044400265076,0.9579191517561298,0.5669317428760768,24.9,24.9,1274.1348148148147,0.2474074074074074,618.2380952380952,1491.473372781065,0.02962962962962963,824.424427480916,0.9703703703703703,0.9629629629629629,0.677037037037037,34.2125,34.2125,936.6341746993813,0.28979521512613327,433.4386370112378,1083.1501063537316,0.01389097054254637,725.8866828211438,0.9861090294574537,0.9688022677143978,0.5563916009900088
|
| 6 |
+
eval_results-global_step_40,6.7,6.7,2067.5333333333333,0.4,882.0,2152.214285714286,0.03333333333333333,1587.2068965517242,0.9666666666666667,0.9,0.8,3.3,3.3,1651.8666666666666,2.6,1054.0,1672.4827586206898,0.03333333333333333,1157.103448275862,0.9666666666666667,0.9333333333333333,0.7333333333333333,27.5,27.5,1791.075,1.875,574.2727272727273,2252.6206896551726,0.025,1428.025641025641,0.975,0.925,0.7,80.2,80.2,313.50113722517057,0.03411675511751327,283.3412098298677,435.7586206896552,0.000758150113722517,301.5955993930197,0.9992418498862775,0.9969673995451099,0.22517058377558757,62.8,62.8,658.292,0.16,508.23248407643314,911.6182795698925,0.006,565.6901408450705,0.994,0.992,0.478,27.9,27.9,812.8125,0.2426470588235294,472.55263157894734,944.75,0.011029411764705883,644.2379182156134,0.9889705882352942,0.9705882352941176,0.45955882352941174,52.0,52.0,441.87707090788604,0.5145791915175613,391.4113520408163,496.44965517241377,0.004307488402915839,374.82695507487523,0.9956925115970842,0.9701789264413518,0.5735586481113321,25.3,25.3,1444.7244444444445,0.5911111111111111,608.8538011695906,1728.3234126984128,0.03851851851851852,862.8813559322034,0.9614814814814815,0.9525925925925925,0.6948148148148148,35.7125,35.7125,1147.7102690721877,0.8021817645712144,596.8330257460478,1324.2772127650653,0.01903502943331618,865.1959944142513,0.9809649705666837,0.9550825609008132,0.5830545254455599
|
| 7 |
+
eval_results-global_step_50,6.7,6.7,1754.0,0.5666666666666667,533.5,1841.1785714285713,0.06666666666666667,736.5,0.9333333333333333,0.9333333333333333,0.8,6.7,6.7,2361.0,0.3333333333333333,900.0,2465.3571428571427,0.1,845.6296296296297,0.9,0.9,0.7666666666666667,37.5,37.5,1135.425,0.275,712.4666666666667,1389.2,0.025,754.2820512820513,0.975,0.975,0.675,80.3,80.3,320.0030326004549,0.026535253980288095,288.4117091595845,448.67692307692306,0.000758150113722517,308.1638846737481,0.9992418498862775,0.9969673995451099,0.244882486732373,65.4,65.4,791.094,0.176,467.29357798165137,1403.1329479768785,0.014,575.1703853955376,0.986,0.982,0.5,27.9,27.9,682.0735294117648,0.11397058823529412,504.86842105263156,750.7857142857143,0.003676470588235294,625.549815498155,0.9963235294117647,0.9852941176470589,0.49264705882352944,53.4,53.4,418.52418820410867,0.3535453943008615,347.52638112973307,499.81592039800995,0.0026507620941020544,377.1129568106312,0.9973492379058979,0.9764744864148442,0.5964214711729622,28.6,28.6,1281.6637037037037,0.3688888888888889,705.7668393782384,1512.2614107883817,0.028148148148148148,855.608231707317,0.9718518518518519,0.96,0.6622222222222223,38.3125,38.3125,1092.972931740004,0.27674251567566654,557.4791994210632,1288.8010788514528,0.030112524701359337,634.7521193746337,0.9698874752986406,0.9636336671175433,0.5922299882022193
|
| 8 |
+
eval_results-global_step_60,6.7,6.7,1762.8333333333333,0.4,840.0,1828.75,0.06666666666666667,745.8928571428571,0.9333333333333333,0.9333333333333333,0.7,0.0,0.0,1023.7666666666667,0.7,0.0,1023.7666666666667,0.0,1023.7666666666667,1.0,0.9666666666666667,0.6666666666666666,30.0,30.0,1099.7,2.425,649.9166666666666,1292.4642857142858,0.0,1099.7,1.0,0.975,0.625,81.1,81.1,302.5587566338135,0.027293404094010616,291.0,352.2289156626506,0.0,302.5587566338135,1.0,0.9984836997725549,0.2486732373009856,65.0,65.0,850.044,0.626,451.94153846153847,1589.3771428571429,0.014,634.94523326572,0.986,0.98,0.496,26.1,26.1,917.8419117647059,0.12867647058823528,493.9577464788732,1067.5721393034826,0.01838235294117647,636.2958801498128,0.9816176470588235,0.9705882352941176,0.5220588235294118,54.2,54.2,437.01457919151756,0.33465871438038436,341.08129584352076,550.5795947901591,0.003644797879390325,380.086797472564,0.9963552021206097,0.9801192842942346,0.5950960901259112,25.0,25.0,1368.0281481481481,0.2814814814814815,651.4437869822485,1607.3616600790515,0.03259259259259259,875.0995405819295,0.9674074074074074,0.9555555555555556,0.674074074074074,36.0125,36.0125,970.2234244672732,0.615388758818014,464.91762930410596,1164.01255063418,0.016910801259978257,712.2932164891705,0.9830891987400217,0.9699683468645579,0.5659461114621311
|
| 9 |
+
eval_results-global_step_70,3.3,3.3,1331.6666666666667,0.4,668.0,1354.551724137931,0.03333333333333333,825.8620689655172,0.9666666666666667,0.9666666666666667,0.7,6.7,6.7,1891.0333333333333,0.7666666666666667,1169.5,1942.5714285714287,0.06666666666666667,883.3214285714286,0.9333333333333333,0.9333333333333333,0.8333333333333334,40.0,40.0,810.375,0.25,653.5625,914.9166666666666,0.0,810.375,1.0,1.0,0.65,81.0,81.0,331.69598180439726,0.18877937831690675,289.9513108614232,509.3187250996016,0.001516300227445034,307.9073652239939,0.9984836997725549,0.9977255496588324,0.2539802880970432,63.8,63.8,754.316,0.18,463.5266457680251,1266.8121546961327,0.012,569.1497975708502,0.988,0.984,0.508,28.7,28.7,642.8235294117648,0.125,499.64102564102564,700.3917525773196,0.0,642.8235294117648,1.0,0.9926470588235294,0.4852941176470588,57.0,57.0,448.6848906560636,0.35619615639496355,402.38394415357766,509.95612009237874,0.003644797879390325,391.50714998337213,0.9963552021206097,0.9797879390324719,0.6030483764082174,28.3,28.3,1370.394074074074,0.27555555555555555,661.8743455497382,1649.995867768595,0.02962962962962963,923.9541984732824,0.9703703703703703,0.9555555555555556,0.6533333333333333,38.6,38.6,947.6236844932873,0.31777471961676157,601.0549714967236,1106.0643049512569,0.018348840967058126,669.3625672750262,0.9816511590329418,0.9762145128837988,0.5858736811023733
|
| 10 |
+
eval_results-global_step_80,6.7,6.7,3350.766666666667,0.4666666666666667,529.0,3552.3214285714284,0.16666666666666666,820.96,0.8333333333333334,0.8333333333333334,0.8333333333333334,0.0,0.0,1016.2,0.6333333333333333,0.0,1016.2,0.0,1016.2,1.0,0.9666666666666667,0.8333333333333334,30.0,30.0,779.075,0.175,747.8333333333334,792.4642857142857,0.0,779.075,1.0,1.0,0.675,81.9,81.9,335.357846853677,0.032600454890068235,293.94351851851854,522.5020920502092,0.001516300227445034,311.15945330296125,0.9984836997725549,0.9962092494313874,0.2486732373009856,64.4,64.4,664.104,0.16,456.2080745341615,1040.185393258427,0.006,571.8470824949699,0.994,0.99,0.496,28.3,28.3,659.1029411764706,0.09926470588235294,468.42857142857144,734.3948717948718,0.0,659.1029411764706,1.0,0.9852941176470589,0.47794117647058826,59.0,59.0,425.97448641484425,0.35520212060967526,354.20482603815935,529.4482200647249,0.0026507620941020544,384.5468438538206,0.9973492379058979,0.9817760106030484,0.5891318754141815,28.9,28.9,1286.7614814814815,0.28296296296296297,745.9230769230769,1506.4770833333334,0.022222222222222223,952.7212121212121,0.9777777777777777,0.957037037037037,0.6681481481481482,37.4,37.4,1064.6678028241427,0.27562878054313245,449.4426750969776,1211.74917184841,0.024881993901304498,686.9515666186794,0.9751180060986955,0.9637895518398164,0.6026951380000714
|
| 11 |
+
eval_results-global_step_90,13.3,13.3,1851.9666666666667,0.6,886.25,2000.5384615384614,0.06666666666666667,841.3571428571429,0.9333333333333333,0.9333333333333333,0.7666666666666667,0.0,0.0,1319.1,0.4666666666666667,0.0,1319.1,0.03333333333333333,812.8620689655172,0.9666666666666667,0.9666666666666667,0.7666666666666667,37.5,37.5,857.75,0.175,641.1333333333333,987.72,0.0,857.75,1.0,1.0,0.725,81.4,81.4,320.3434420015163,0.03866565579984837,297.91620111731845,418.65714285714284,0.000758150113722517,308.44688922610015,0.9992418498862775,0.9984836997725549,0.27065959059893857,66.4,66.4,650.004,0.158,470.1114457831325,1005.5059523809524,0.006,557.3501006036217,0.994,0.994,0.476,28.7,28.7,676.4117647058823,0.10661764705882353,524.8846153846154,737.3350515463917,0.003676470588235294,620.6974169741698,0.9963235294117647,0.9852941176470589,0.4632352941176471,59.8,59.8,392.2418820410868,0.35917826375082834,361.7484764542936,437.6174773289365,0.0,392.2418820410868,1.0,0.9847581179589132,0.610337972166998,28.9,28.9,1267.4933333333333,0.34814814814814815,666.8717948717949,1511.4958333333334,0.023703703703703703,909.8330804248862,0.9762962962962963,0.9629629629629629,0.6888888888888889,39.49999999999999,39.49999999999999,916.9138860935607,0.28153454767803937,481.11448336806103,1052.2462398731523,0.016767290550707692,662.5673226365657,0.9832327094492923,0.9781873622926862,0.5959318848882258
|
| 12 |
+
eval_results-global_step_100,6.7,6.7,2833.2,3.3666666666666667,563.0,2995.3571428571427,0.06666666666666667,1892.8214285714287,0.9333333333333333,0.8666666666666667,0.8,6.7,6.7,1052.1333333333334,0.5,945.0,1059.7857142857142,0.0,1052.1333333333334,1.0,0.9666666666666667,0.7666666666666667,35.0,35.0,803.3,0.25,720.7857142857143,847.7307692307693,0.0,803.3,1.0,1.0,0.675,82.2,82.2,313.4374526156179,0.02577710386656558,297.13837638376384,388.6212765957447,0.0,313.4374526156179,1.0,0.9984836997725549,0.2934040940106141,64.0,64.0,639.61,0.21,474.25625,933.5722222222222,0.002,608.8276553106213,0.998,0.994,0.502,27.9,27.9,670.7757352941177,0.09558823529411764,475.3421052631579,746.5561224489796,0.003676470588235294,614.2250922509226,0.9963235294117647,0.9963235294117647,0.46691176470588236,59.3,59.3,409.48542080848244,0.35056328694499667,367.89155953046395,470.03173311635476,0.0009940357852882703,393.9883913764511,0.9990059642147118,0.9850894632206759,0.6232604373757455,28.4,28.4,1115.805925925926,0.2740740740740741,676.9270833333334,1290.2670807453417,0.02074074074074074,799.8593040847201,0.9792592592592593,0.9748148148148148,0.6814814814814815,38.775,38.775,979.7184834971847,0.6340836708558026,565.0426360995541,1091.4902576877837,0.011759739222616371,809.824082192887,0.9882402607773837,0.972755605069143,0.6010905555300488
|
eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 2,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 3.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 150.91682815551758,
|
| 12 |
+
"time_use_in_minite": "2:30"
|
| 13 |
+
}
|
eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 4,
|
| 6 |
+
"acc": 0.0,
|
| 7 |
+
"pass_acc": 0.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 135.4334716796875,
|
| 12 |
+
"time_use_in_minite": "2:15"
|
| 13 |
+
}
|
eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 2,
|
| 6 |
+
"acc": 12.5,
|
| 7 |
+
"pass_acc": 12.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 12.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 158.87086462974548,
|
| 12 |
+
"time_use_in_minite": "2:38"
|
| 13 |
+
}
|
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 101,
|
| 6 |
+
"acc": 61.7,
|
| 7 |
+
"pass_acc": 61.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 61.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 647.2615280151367,
|
| 12 |
+
"time_use_in_minite": "10:47"
|
| 13 |
+
}
|
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 20,
|
| 6 |
+
"acc": 49.0,
|
| 7 |
+
"pass_acc": 49.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 49.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 261.24151849746704,
|
| 12 |
+
"time_use_in_minite": "4:21"
|
| 13 |
+
}
|
eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_0/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 29,
|
| 6 |
+
"acc": 15.4,
|
| 7 |
+
"pass_acc": 15.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 15.4
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 27.1,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 23.1,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 40.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 33.3,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 13.2,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 7.2,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 0.0,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 33.3,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 0.0
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 208.6862416267395,
|
| 23 |
+
"time_use_in_minite": "3:28"
|
| 24 |
+
}
|
eval_results/global_step_0/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 3,
|
| 5 |
+
"empty_samples": 4,
|
| 6 |
+
"acc": 40.1,
|
| 7 |
+
"pass_acc": 40.1,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 40.1
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 31.0,
|
| 13 |
+
"astronomy": 47.4,
|
| 14 |
+
"college_biology": 51.4,
|
| 15 |
+
"college_chemistry": 33.0,
|
| 16 |
+
"college_computer_science": 41.0,
|
| 17 |
+
"college_mathematics": 31.0,
|
| 18 |
+
"college_physics": 33.3,
|
| 19 |
+
"computer_security": 41.0,
|
| 20 |
+
"conceptual_physics": 51.5,
|
| 21 |
+
"electrical_engineering": 35.9,
|
| 22 |
+
"elementary_mathematics": 38.1,
|
| 23 |
+
"high_school_biology": 51.0,
|
| 24 |
+
"high_school_chemistry": 43.3,
|
| 25 |
+
"high_school_computer_science": 54.0,
|
| 26 |
+
"high_school_mathematics": 21.5,
|
| 27 |
+
"high_school_physics": 33.8,
|
| 28 |
+
"high_school_statistics": 38.4,
|
| 29 |
+
"machine_learning": 40.2
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 831.7992584705353,
|
| 32 |
+
"time_use_in_minite": "13:51"
|
| 33 |
+
}
|
eval_results/global_step_0/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 5,
|
| 5 |
+
"empty_samples": 25,
|
| 6 |
+
"acc": 16.7,
|
| 7 |
+
"pass_acc": 16.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 16.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 630.0073010921478,
|
| 12 |
+
"time_use_in_minite": "10:30"
|
| 13 |
+
}
|
eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 13.3,
|
| 7 |
+
"pass_acc": 13.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 13.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 152.79555130004883,
|
| 12 |
+
"time_use_in_minite": "2:32"
|
| 13 |
+
}
|
eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 10.0,
|
| 7 |
+
"pass_acc": 10.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 10.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 154.3770513534546,
|
| 12 |
+
"time_use_in_minite": "2:34"
|
| 13 |
+
}
|
eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 20.0,
|
| 7 |
+
"pass_acc": 20.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 20.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 139.03463554382324,
|
| 12 |
+
"time_use_in_minite": "2:19"
|
| 13 |
+
}
|
eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 9,
|
| 6 |
+
"acc": 74.5,
|
| 7 |
+
"pass_acc": 74.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 74.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 213.90553379058838,
|
| 12 |
+
"time_use_in_minite": "3:33"
|
| 13 |
+
}
|
eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 55.4,
|
| 7 |
+
"pass_acc": 55.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 55.4
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 229.85546159744263,
|
| 12 |
+
"time_use_in_minite": "3:49"
|
| 13 |
+
}
|
eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 2,
|
| 5 |
+
"empty_samples": 5,
|
| 6 |
+
"acc": 20.6,
|
| 7 |
+
"pass_acc": 20.6,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 20.6
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 39.6,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 38.5,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 0.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 33.3,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 9.4,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 11.3,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 9.1,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 44.4,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 9.1
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 165.44177341461182,
|
| 23 |
+
"time_use_in_minite": "2:45"
|
| 24 |
+
}
|
eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 4,
|
| 5 |
+
"empty_samples": 4,
|
| 6 |
+
"acc": 43.3,
|
| 7 |
+
"pass_acc": 43.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 43.3
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 29.0,
|
| 13 |
+
"astronomy": 59.2,
|
| 14 |
+
"college_biology": 59.0,
|
| 15 |
+
"college_chemistry": 43.0,
|
| 16 |
+
"college_computer_science": 35.0,
|
| 17 |
+
"college_mathematics": 23.0,
|
| 18 |
+
"college_physics": 35.3,
|
| 19 |
+
"computer_security": 47.0,
|
| 20 |
+
"conceptual_physics": 52.8,
|
| 21 |
+
"electrical_engineering": 49.0,
|
| 22 |
+
"elementary_mathematics": 36.8,
|
| 23 |
+
"high_school_biology": 60.6,
|
| 24 |
+
"high_school_chemistry": 43.8,
|
| 25 |
+
"high_school_computer_science": 54.0,
|
| 26 |
+
"high_school_mathematics": 17.4,
|
| 27 |
+
"high_school_physics": 36.4,
|
| 28 |
+
"high_school_statistics": 46.8,
|
| 29 |
+
"machine_learning": 46.4
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 524.8361523151398,
|
| 32 |
+
"time_use_in_minite": "8:44"
|
| 33 |
+
}
|
eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 24.3,
|
| 7 |
+
"pass_acc": 24.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 24.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 410.3986358642578,
|
| 12 |
+
"time_use_in_minite": "6:50"
|
| 13 |
+
}
|
eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.7,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 165.57087087631226,
|
| 12 |
+
"time_use_in_minite": "2:45"
|
| 13 |
+
}
|
eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 6.7,
|
| 7 |
+
"pass_acc": 6.7,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 6.7
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 38.979785680770874,
|
| 12 |
+
"time_use_in_minite": "0:38"
|
| 13 |
+
}
|
eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 35.0,
|
| 7 |
+
"pass_acc": 35.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 35.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 12.683761358261108,
|
| 12 |
+
"time_use_in_minite": "0:12"
|
| 13 |
+
}
|
eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 82.2,
|
| 7 |
+
"pass_acc": 82.2,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 82.2
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 99.66675853729248,
|
| 12 |
+
"time_use_in_minite": "1:39"
|
| 13 |
+
}
|
eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 500,
|
| 3 |
+
"num_scores": 500,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 1,
|
| 6 |
+
"acc": 64.0,
|
| 7 |
+
"pass_acc": 64.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 64.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 180.35561394691467,
|
| 12 |
+
"time_use_in_minite": "3:00"
|
| 13 |
+
}
|
eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 272,
|
| 3 |
+
"num_scores": 272,
|
| 4 |
+
"timeout_samples": 1,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 27.9,
|
| 7 |
+
"pass_acc": 27.9,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 27.9
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"Differential Equations (18.03 Spring 2010)": 52.1,
|
| 13 |
+
"Dynamics and Control (2.003 Spring 2005)": 46.2,
|
| 14 |
+
"Ecology I (1.018J Fall 2009)": 60.0,
|
| 15 |
+
"Information and Entropy (6.050J Spring 2008)": 33.3,
|
| 16 |
+
"Introduction to Astronomy (8.282J Spring 2006)": 15.1,
|
| 17 |
+
"Introduction to Solid State Chemistry (3.091 Fall 2010)": 15.5,
|
| 18 |
+
"Physical Chemistry (5.61 Fall 2017)": 0.0,
|
| 19 |
+
"Principles of Microeconomics (14.01 Fall 2011)": 55.6,
|
| 20 |
+
"Relativity (8.033 Fall 2006)": 18.2
|
| 21 |
+
},
|
| 22 |
+
"time_use_in_second": 155.37784552574158,
|
| 23 |
+
"time_use_in_minite": "2:35"
|
| 24 |
+
}
|
eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 3018,
|
| 3 |
+
"num_scores": 3018,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 59.3,
|
| 7 |
+
"pass_acc": 59.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 59.3
|
| 10 |
+
},
|
| 11 |
+
"type_acc": {
|
| 12 |
+
"abstract_algebra": 44.0,
|
| 13 |
+
"astronomy": 66.4,
|
| 14 |
+
"college_biology": 63.9,
|
| 15 |
+
"college_chemistry": 44.0,
|
| 16 |
+
"college_computer_science": 56.0,
|
| 17 |
+
"college_mathematics": 50.0,
|
| 18 |
+
"college_physics": 62.7,
|
| 19 |
+
"computer_security": 52.0,
|
| 20 |
+
"conceptual_physics": 62.6,
|
| 21 |
+
"electrical_engineering": 62.1,
|
| 22 |
+
"elementary_mathematics": 75.1,
|
| 23 |
+
"high_school_biology": 64.2,
|
| 24 |
+
"high_school_chemistry": 60.6,
|
| 25 |
+
"high_school_computer_science": 66.0,
|
| 26 |
+
"high_school_mathematics": 42.6,
|
| 27 |
+
"high_school_physics": 55.6,
|
| 28 |
+
"high_school_statistics": 58.8,
|
| 29 |
+
"machine_learning": 45.5
|
| 30 |
+
},
|
| 31 |
+
"time_use_in_second": 357.6535186767578,
|
| 32 |
+
"time_use_in_minite": "5:57"
|
| 33 |
+
}
|
eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 675,
|
| 3 |
+
"num_scores": 675,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 28.4,
|
| 7 |
+
"pass_acc": 28.4,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 28.4
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 294.4607768058777,
|
| 12 |
+
"time_use_in_minite": "4:54"
|
| 13 |
+
}
|
eval_results/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 0.0,
|
| 7 |
+
"pass_acc": 0.0,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 0.0
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 159.08689522743225,
|
| 12 |
+
"time_use_in_minite": "2:39"
|
| 13 |
+
}
|
eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
eval_results/global_step_20/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 30,
|
| 3 |
+
"num_scores": 30,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 3.3,
|
| 7 |
+
"pass_acc": 3.3,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 3.3
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 136.16185522079468,
|
| 12 |
+
"time_use_in_minite": "2:16"
|
| 13 |
+
}
|
eval_results/global_step_20/amc23/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 40,
|
| 3 |
+
"num_scores": 40,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 37.5,
|
| 7 |
+
"pass_acc": 37.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 37.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 149.62879085540771,
|
| 12 |
+
"time_use_in_minite": "2:29"
|
| 13 |
+
}
|
eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"num_samples": 1319,
|
| 3 |
+
"num_scores": 1319,
|
| 4 |
+
"timeout_samples": 0,
|
| 5 |
+
"empty_samples": 0,
|
| 6 |
+
"acc": 79.5,
|
| 7 |
+
"pass_acc": 79.5,
|
| 8 |
+
"pass@k": {
|
| 9 |
+
"1": 79.5
|
| 10 |
+
},
|
| 11 |
+
"time_use_in_second": 220.7365050315857,
|
| 12 |
+
"time_use_in_minite": "3:40"
|
| 13 |
+
}
|
latest_checkpointed_iteration.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
100
|