bensondccnqwc commited on
Commit
d3c16c8
·
verified ·
1 Parent(s): 84ba9ce

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. eval_results/eval_results.csv +11 -0
  2. eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  3. eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  4. eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  5. eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  6. eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  7. eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  8. eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  9. eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  10. eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  11. eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  12. eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  13. eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  14. eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  15. eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  16. eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  17. eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  18. eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  19. eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  20. eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  21. eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  22. eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  23. eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  24. eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  25. eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  26. eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  27. eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  28. eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  29. eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  30. eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  31. eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  32. eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  33. eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  34. eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  35. eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  36. eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  37. eval_results/global_step_30/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  38. eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  39. eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  40. eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  41. eval_results/global_step_40/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  42. eval_results/global_step_40/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +24 -0
  43. eval_results/global_step_40/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +33 -0
  44. eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  45. eval_results/global_step_50/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json +13 -0
  46. eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  47. eval_results_avg32/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  48. eval_results_avg32/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  49. eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
  50. eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl +0 -0
eval_results/eval_results.csv ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model,gsm8k_acc,gsm8k_pass_acc,gsm8k_tokens,gsm8k_keywords,gsm8k_correct_tokens,gsm8k_wrong_tokens,gsm8k_clip_ratio,gsm8k_stop_tokens,gsm8k_stop_ratio,gsm8k_box_ratio,gsm8k_repeat_ratio,math500_acc,math500_pass_acc,math500_tokens,math500_keywords,math500_correct_tokens,math500_wrong_tokens,math500_clip_ratio,math500_stop_tokens,math500_stop_ratio,math500_box_ratio,math500_repeat_ratio,minerva_math_acc,minerva_math_pass_acc,minerva_math_tokens,minerva_math_keywords,minerva_math_correct_tokens,minerva_math_wrong_tokens,minerva_math_clip_ratio,minerva_math_stop_tokens,minerva_math_stop_ratio,minerva_math_box_ratio,minerva_math_repeat_ratio,mmlu_stem_acc,mmlu_stem_pass_acc,mmlu_stem_tokens,mmlu_stem_keywords,mmlu_stem_correct_tokens,mmlu_stem_wrong_tokens,mmlu_stem_clip_ratio,mmlu_stem_stop_tokens,mmlu_stem_stop_ratio,mmlu_stem_box_ratio,mmlu_stem_repeat_ratio,olympiadbench_acc,olympiadbench_pass_acc,olympiadbench_tokens,olympiadbench_keywords,olympiadbench_correct_tokens,olympiadbench_wrong_tokens,olympiadbench_clip_ratio,olympiadbench_stop_tokens,olympiadbench_stop_ratio,olympiadbench_box_ratio,olympiadbench_repeat_ratio,avg_acc,avg_pass_acc,avg_tokens,avg_keywords,avg_correct_tokens,avg_wrong_tokens,avg_clip_ratio,avg_stop_tokens,avg_stop_ratio,avg_box_ratio,avg_repeat_ratio
2
+ eval_results-global_step_10,90.0,90.0,297.9658832448825,0.037149355572403335,276.0454928390901,495.0833333333333,0.000758150113722517,284.6039453717754,0.9992418498862775,0.9924184988627748,0.14783927217589082,75.6,75.6,745.9,0.456,506.2275132275132,1488.4918032786886,0.006,652.4164989939637,0.994,0.988,0.468,27.9,27.9,788.7867647058823,0.25,749.171052631579,804.1479591836735,0.003676470588235294,732.6642066420665,0.9963235294117647,0.9558823529411765,0.4742647058823529,60.1,60.1,425.62591119946984,0.34161696487740223,368.8450082735797,511.0564315352697,0.0026507620941020544,384.2305647840532,0.9973492379058979,0.961895294897283,0.5424121935056329,37.2,37.2,1149.3096296296296,0.7333333333333333,705.9681274900398,1411.7594339622642,0.011851851851851851,971.5592203898051,0.9881481481481481,0.9733333333333334,0.6592592592592592,58.160000000000004,58.160000000000004,681.5176377559728,0.3636199307566278,521.2514388923603,942.1077922586459,0.004987446929582343,605.0948872363327,0.9950125530704177,0.9743058960069136,0.45835508616462717
3
+ eval_results-global_step_20,90.4,90.4,329.5557240333586,0.13646702047005307,284.3640939597315,753.7165354330708,0.002274450341167551,293.88829787234044,0.9977255496588324,0.9977255496588324,0.12585291887793784,77.2,77.2,731.878,0.418,517.779792746114,1456.8070175438597,0.002,701.2825651302605,0.998,0.99,0.462,34.2,34.2,674.1580882352941,0.1213235294117647,516.2795698924731,756.1843575418994,0.003676470588235294,617.6273062730627,0.9963235294117647,0.9926470588235294,0.4742647058823529,60.2,60.2,426.8793903247184,0.5974155069582505,371.46919691969197,510.8258333333333,0.0016567263088137839,400.9920345170926,0.9983432736911863,0.9827700463883366,0.5546719681908548,39.7,39.7,1227.0533333333333,0.5614814814814815,792.4253731343283,1513.2457002457002,0.017777777777777778,959.710407239819,0.9822222222222222,0.9718518518518519,0.6370370370370371,60.339999999999996,60.339999999999996,677.9049071853409,0.36693750766430994,496.4636053304677,998.1558888195726,0.005477085003198882,594.7001222065151,0.9945229149968011,0.9869989013445102,0.4507653259976365
4
+ eval_results-global_step_30,92.1,92.1,314.0310841546626,0.0887035633055345,294.7753086419753,538.9903846153846,0.000758150113722517,302.13050075872536,0.9992418498862775,0.9984836997725549,0.12282031842304776,76.8,76.8,740.888,1.118,542.9765625,1396.0431034482758,0.002,710.314629258517,0.998,0.994,0.502,40.8,40.8,701.7389705882352,0.19117647058823528,529.7387387387388,820.3229813664597,0.003676470588235294,645.350553505535,0.9963235294117647,0.9852941176470589,0.49264705882352944,63.4,63.4,412.60205434062294,0.403247183565275,373.4628661087866,480.2640144665461,0.0006626905235255136,402.28149867374003,0.9993373094764745,0.9837640821736249,0.5583167660702452,39.7,39.7,1431.7837037037036,1.32,749.7723880597015,1880.872235872236,0.023703703703703703,1078.019726858877,0.9762962962962963,0.957037037037037,0.6681481481481482,62.55999999999999,62.55999999999999,720.208762557445,0.624225443491809,498.14517280984046,1023.2985439537804,0.006160202985837406,627.6193818110789,0.9938397970141626,0.9837157873260551,0.46878645829299403
5
+ eval_results-global_step_40,92.4,92.4,322.20318423047763,0.1865049279757392,291.233798195242,699.72,0.0,322.20318423047763,1.0,0.9962092494313874,0.12585291887793784,78.0,78.0,896.814,0.604,585.8435897435897,1999.3454545454545,0.01,744.3333333333334,0.99,0.986,0.474,37.9,37.9,669.7720588235294,0.34191176470588236,541.8834951456311,747.7159763313609,0.0,669.7720588235294,1.0,0.9926470588235294,0.45588235294117646,65.0,65.0,435.149436713055,0.4178263750828363,407.7121752419766,486.20094786729857,0.0016567263088137839,409.459010952539,0.9983432736911863,0.9834327369118622,0.5453943008614976,40.0,40.0,1546.2162962962964,1.4637037037037037,828.9444444444445,2024.3975308641975,0.02962962962962963,1105.7053435114503,0.9703703703703703,0.9511111111111111,0.6622222222222223,62.660000000000004,62.660000000000004,774.0309952126715,0.6027893542936323,531.1235005541768,1191.4759819216622,0.008257271187688684,650.2945861702659,0.9917427288123115,0.981880031255578,0.45267035898056684
6
+ eval_results-global_step_50,92.2,92.2,318.1652767247915,0.08794541319181198,291.24342105263156,636.0,0.000758150113722517,306.3285280728376,0.9992418498862775,0.9984836997725549,0.13646702047005307,79.0,79.0,914.92,0.864,556.1873417721519,2264.4380952380952,0.012,732.4271255060729,0.988,0.986,0.474,39.3,39.3,815.1397058823529,0.2536764705882353,534.7570093457944,996.9636363636364,0.007352941176470588,702.8444444444444,0.9926470588235294,0.9852941176470589,0.5073529411764706,66.8,66.8,455.0473823724321,0.6815772034459907,376.1318790282598,614.0609390609391,0.0033134526176275677,403.4644281914894,0.9966865473823724,0.9837640821736249,0.547713717693837,41.6,41.6,1718.1777777777777,2.5407407407407407,830.814946619217,2351.043147208122,0.03851851851851852,1145.6101694915253,0.9614814814814815,0.957037037037037,0.6533333333333333,63.78000000000001,63.78000000000001,844.2900285514709,0.8855879655933556,517.826919563611,1372.5011635741585,0.012388612485267838,658.1349391412739,0.9876113875147322,0.9821157873260551,0.4637734025347388
7
+ eval_results-global_step_60,92.2,92.2,318.92418498862776,0.06823351023502654,305.0493421052632,482.7281553398058,0.0,318.92418498862776,1.0,0.9992418498862775,0.13646702047005307,81.2,81.2,905.92,1.192,591.2142857142857,2265.18085106383,0.006,814.82092555332,0.994,0.988,0.496,39.7,39.7,838.8235294117648,0.5845588235294118,609.7592592592592,989.670731707317,0.007352941176470588,726.5259259259259,0.9926470588235294,0.9852941176470589,0.48161764705882354,71.3,71.3,457.058648111332,0.5006626905235255,403.3559479553903,590.5092378752887,0.0023194168323392977,420.88077050813683,0.9976805831676607,0.9831013916500994,0.5357852882703777,41.9,41.9,1403.7407407407406,1.5866666666666667,856.0212014134275,1799.1607142857142,0.017777777777777778,1138.5686274509803,0.9822222222222222,0.9718518518518519,0.64,65.26,65.26,784.893420650493,0.7864243381909262,553.0800072895252,1225.4499380543912,0.0066900271573175326,683.9440868853982,0.9933099728426825,0.9854978422070575,0.4579739911598509
8
+ eval_results-global_step_70,93.0,93.0,339.1319181197877,0.266868840030326,294.92257538712306,928.75,0.002274450341167551,303.3328267477204,0.9977255496588324,0.9977255496588324,0.12054586808188021,81.0,81.0,837.746,0.636,612.3802469135802,1798.5157894736842,0.0,837.746,1.0,0.998,0.464,41.5,41.5,713.9889705882352,0.24632352941176472,527.9026548672566,846.2389937106918,0.003676470588235294,657.6125461254612,0.9963235294117647,0.9963235294117647,0.4338235294117647,74.1,74.1,467.97282968853546,0.5089463220675944,392.6317673378076,683.0268199233717,0.002982107355864811,421.52210036556994,0.9970178926441352,0.9821073558648111,0.5351225977468522,43.7,43.7,1608.6725925925925,1.288888888888889,887.0881355932204,2168.85,0.02666666666666667,1215.0121765601218,0.9733333333333334,0.9644444444444444,0.6874074074074074,66.66,66.66,793.5024621978303,0.5894055160797148,542.9850760197976,1285.0763206215495,0.007119938990386865,687.0451299597746,0.9928800610096131,0.9877201758759705,0.4481798805295809
9
+ eval_results-global_step_80,92.3,92.3,340.7338893100834,0.14404852160727824,308.898931799507,720.5686274509804,0.002274450341167551,305.0387537993921,0.9977255496588324,0.9977255496588324,0.1326762699014405,80.0,80.0,886.88,0.882,601.3975,2028.81,0.004,826.2389558232932,0.996,0.992,0.474,38.2,38.2,790.1286764705883,1.1580882352941178,607.0480769230769,903.4642857142857,0.007352941176470588,677.4740740740741,0.9926470588235294,0.9926470588235294,0.4742647058823529,76.5,76.5,457.18124585818424,0.69350563286945,406.67071057192373,621.3760563380282,0.0019880715705765406,426.04913678618857,0.9980119284294234,0.9844267726971504,0.5473823724320742,42.4,42.4,1764.7985185185184,2.1214814814814815,888.6643356643357,2408.948586118252,0.03259259259259259,1285.0336906584992,0.9674074074074074,0.9614814814814815,0.6785185185185185,65.88,65.88,847.9444660314748,0.9998247742504656,562.5359109917687,1336.6335111243093,0.009641611136161455,703.9669222282895,0.9903583888638385,0.9856561725321988,0.4613683733468772
10
+ eval_results-global_step_90,93.1,93.1,333.76648976497347,0.155420773313116,299.12296416938113,801.2637362637363,0.000758150113722517,321.88088012139605,0.9992418498862775,0.9984836997725549,0.12206216830932524,82.4,82.4,973.79,1.226,643.2111650485436,2521.5,0.008,853.1612903225806,0.992,0.986,0.494,39.3,39.3,807.9669117647059,1.0661764705882353,756.392523364486,841.4121212121212,0.003676470588235294,751.9114391143911,0.9963235294117647,0.9889705882352942,0.45955882352941174,77.9,77.9,436.93671305500334,0.42909211398277003,411.5027647809443,526.5847076461769,0.0003313452617627568,431.94862446138546,0.9996686547382373,0.9867461895294898,0.5506958250497018,44.7,44.7,1721.3392592592593,2.745185185185185,947.8145695364238,2347.6246648793567,0.02666666666666667,1330.1887366818873,0.9733333333333334,0.9585185185185185,0.6948148148148148,67.48,67.48,854.7598747687883,1.1243749086138615,611.6087973799557,1407.677046000278,0.007886526526077447,737.818194140328,0.9921134734739224,0.9837437992111715,0.4642263263406507
11
+ eval_results-global_step_100,93.3,93.3,327.6360879454132,0.18271417740712662,302.2113821138211,679.0112359550562,0.000758150113722517,315.6631259484067,0.9992418498862775,0.9984836997725549,0.1326762699014405,80.4,80.4,934.582,1.188,589.9253731343283,2348.377551020408,0.004,874.4819277108434,0.996,0.996,0.524,41.2,41.2,797.0330882352941,0.7463235294117647,556.7589285714286,965.225,0.003676470588235294,740.9335793357934,0.9963235294117647,0.9889705882352942,0.4852941176470588,79.9,79.9,441.13021868787274,0.5324718356527501,413.7375621890547,550.1584158415842,0.0006626905235255136,430.8113395225464,0.9993373094764745,0.9857521537442014,0.5646123260437376,44.6,44.6,1886.8133333333333,3.5007407407407407,1097.4451827242524,2522.1069518716577,0.02666666666666667,1500.1217656012177,0.9733333333333334,0.965925925925926,0.7096296296296296,67.88,67.88,877.4389456403827,1.2300500566424764,592.015685746577,1412.9758309377412,0.007152795578429999,772.4023476237614,0.99284720442157,0.9870264735355953,0.48324246864437337
eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 90.0,
7
+ "pass_acc": 90.0,
8
+ "pass@k": {
9
+ "1": 90.0
10
+ },
11
+ "time_use_in_second": 215.5067675113678,
12
+ "time_use_in_minite": "3:35"
13
+ }
eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 4,
6
+ "acc": 75.6,
7
+ "pass_acc": 75.6,
8
+ "pass@k": {
9
+ "1": 75.6
10
+ },
11
+ "time_use_in_second": 245.10675311088562,
12
+ "time_use_in_minite": "4:05"
13
+ }
eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 2,
5
+ "empty_samples": 8,
6
+ "acc": 27.9,
7
+ "pass_acc": 27.9,
8
+ "pass@k": {
9
+ "1": 27.9
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 50.0,
13
+ "Dynamics and Control (2.003 Spring 2005)": 42.3,
14
+ "Ecology I (1.018J Fall 2009)": 0.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 66.7,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 20.8,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 15.5,
18
+ "Physical Chemistry (5.61 Fall 2017)": 27.3,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 50.0,
20
+ "Relativity (8.033 Fall 2006)": 9.1
21
+ },
22
+ "time_use_in_second": 204.03055500984192,
23
+ "time_use_in_minite": "3:24"
24
+ }
eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 2,
6
+ "acc": 60.1,
7
+ "pass_acc": 60.1,
8
+ "pass@k": {
9
+ "1": 60.1
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 50.0,
13
+ "astronomy": 75.7,
14
+ "college_biology": 81.9,
15
+ "college_chemistry": 54.0,
16
+ "college_computer_science": 60.0,
17
+ "college_mathematics": 49.0,
18
+ "college_physics": 47.1,
19
+ "computer_security": 61.0,
20
+ "conceptual_physics": 72.3,
21
+ "electrical_engineering": 65.5,
22
+ "elementary_mathematics": 44.4,
23
+ "high_school_biology": 81.9,
24
+ "high_school_chemistry": 70.0,
25
+ "high_school_computer_science": 78.0,
26
+ "high_school_mathematics": 25.2,
27
+ "high_school_physics": 53.6,
28
+ "high_school_statistics": 60.2,
29
+ "machine_learning": 64.3
30
+ },
31
+ "time_use_in_second": 520.8277628421783,
32
+ "time_use_in_minite": "8:40"
33
+ }
eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_10/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 37.2,
7
+ "pass_acc": 37.2,
8
+ "pass@k": {
9
+ "1": 37.2
10
+ },
11
+ "time_use_in_second": 422.3931884765625,
12
+ "time_use_in_minite": "7:02"
13
+ }
eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 93.3,
7
+ "pass_acc": 93.3,
8
+ "pass@k": {
9
+ "1": 93.3
10
+ },
11
+ "time_use_in_second": 223.6158697605133,
12
+ "time_use_in_minite": "3:43"
13
+ }
eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 80.4,
7
+ "pass_acc": 80.4,
8
+ "pass@k": {
9
+ "1": 80.4
10
+ },
11
+ "time_use_in_second": 269.90813422203064,
12
+ "time_use_in_minite": "4:29"
13
+ }
eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 41.2,
7
+ "pass_acc": 41.2,
8
+ "pass@k": {
9
+ "1": 41.2
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 64.6,
13
+ "Dynamics and Control (2.003 Spring 2005)": 50.0,
14
+ "Ecology I (1.018J Fall 2009)": 20.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 66.7,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 28.3,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 27.8,
18
+ "Physical Chemistry (5.61 Fall 2017)": 36.4,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 77.8,
20
+ "Relativity (8.033 Fall 2006)": 45.5
21
+ },
22
+ "time_use_in_second": 184.09167504310608,
23
+ "time_use_in_minite": "3:04"
24
+ }
eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 79.9,
7
+ "pass_acc": 79.9,
8
+ "pass@k": {
9
+ "1": 79.9
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 75.0,
13
+ "astronomy": 80.9,
14
+ "college_biology": 86.8,
15
+ "college_chemistry": 63.0,
16
+ "college_computer_science": 73.0,
17
+ "college_mathematics": 75.0,
18
+ "college_physics": 84.3,
19
+ "computer_security": 61.0,
20
+ "conceptual_physics": 81.3,
21
+ "electrical_engineering": 75.9,
22
+ "elementary_mathematics": 88.9,
23
+ "high_school_biology": 88.7,
24
+ "high_school_chemistry": 83.3,
25
+ "high_school_computer_science": 82.0,
26
+ "high_school_mathematics": 70.4,
27
+ "high_school_physics": 82.8,
28
+ "high_school_statistics": 80.1,
29
+ "machine_learning": 71.4
30
+ },
31
+ "time_use_in_second": 444.7963571548462,
32
+ "time_use_in_minite": "7:24"
33
+ }
eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_100/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 44.6,
7
+ "pass_acc": 44.6,
8
+ "pass@k": {
9
+ "1": 44.6
10
+ },
11
+ "time_use_in_second": 748.5008101463318,
12
+ "time_use_in_minite": "12:28"
13
+ }
eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 90.4,
7
+ "pass_acc": 90.4,
8
+ "pass@k": {
9
+ "1": 90.4
10
+ },
11
+ "time_use_in_second": 243.8115472793579,
12
+ "time_use_in_minite": "4:03"
13
+ }
eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 2,
6
+ "acc": 77.2,
7
+ "pass_acc": 77.2,
8
+ "pass@k": {
9
+ "1": 77.2
10
+ },
11
+ "time_use_in_second": 229.60104632377625,
12
+ "time_use_in_minite": "3:49"
13
+ }
eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 1,
6
+ "acc": 34.2,
7
+ "pass_acc": 34.2,
8
+ "pass@k": {
9
+ "1": 34.2
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 60.4,
13
+ "Dynamics and Control (2.003 Spring 2005)": 50.0,
14
+ "Ecology I (1.018J Fall 2009)": 20.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 66.7,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 22.6,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 20.6,
18
+ "Physical Chemistry (5.61 Fall 2017)": 27.3,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 50.0,
20
+ "Relativity (8.033 Fall 2006)": 36.4
21
+ },
22
+ "time_use_in_second": 173.70800304412842,
23
+ "time_use_in_minite": "2:53"
24
+ }
eval_results/global_step_20/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 60.2,
7
+ "pass_acc": 60.2,
8
+ "pass@k": {
9
+ "1": 60.2
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 50.0,
13
+ "astronomy": 77.0,
14
+ "college_biology": 81.9,
15
+ "college_chemistry": 46.0,
16
+ "college_computer_science": 57.0,
17
+ "college_mathematics": 46.0,
18
+ "college_physics": 42.2,
19
+ "computer_security": 58.0,
20
+ "conceptual_physics": 76.6,
21
+ "electrical_engineering": 64.8,
22
+ "elementary_mathematics": 47.1,
23
+ "high_school_biology": 81.9,
24
+ "high_school_chemistry": 70.9,
25
+ "high_school_computer_science": 75.0,
26
+ "high_school_mathematics": 21.5,
27
+ "high_school_physics": 58.3,
28
+ "high_school_statistics": 66.2,
29
+ "machine_learning": 61.6
30
+ },
31
+ "time_use_in_second": 450.3080425262451,
32
+ "time_use_in_minite": "7:30"
33
+ }
eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_20/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 39.7,
7
+ "pass_acc": 39.7,
8
+ "pass@k": {
9
+ "1": 39.7
10
+ },
11
+ "time_use_in_second": 486.6519501209259,
12
+ "time_use_in_minite": "8:06"
13
+ }
eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 92.1,
7
+ "pass_acc": 92.1,
8
+ "pass@k": {
9
+ "1": 92.1
10
+ },
11
+ "time_use_in_second": 222.42508149147034,
12
+ "time_use_in_minite": "3:42"
13
+ }
eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 76.8,
7
+ "pass_acc": 76.8,
8
+ "pass@k": {
9
+ "1": 76.8
10
+ },
11
+ "time_use_in_second": 227.60929775238037,
12
+ "time_use_in_minite": "3:47"
13
+ }
eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_30/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 2,
6
+ "acc": 40.8,
7
+ "pass_acc": 40.8,
8
+ "pass@k": {
9
+ "1": 40.8
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 60.4,
13
+ "Dynamics and Control (2.003 Spring 2005)": 61.5,
14
+ "Ecology I (1.018J Fall 2009)": 20.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 66.7,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 22.6,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 29.9,
18
+ "Physical Chemistry (5.61 Fall 2017)": 27.3,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 72.2,
20
+ "Relativity (8.033 Fall 2006)": 54.5
21
+ },
22
+ "time_use_in_second": 182.01493453979492,
23
+ "time_use_in_minite": "3:02"
24
+ }
eval_results/global_step_30/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 63.4,
7
+ "pass_acc": 63.4,
8
+ "pass@k": {
9
+ "1": 63.4
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 55.0,
13
+ "astronomy": 78.9,
14
+ "college_biology": 84.0,
15
+ "college_chemistry": 53.0,
16
+ "college_computer_science": 61.0,
17
+ "college_mathematics": 52.0,
18
+ "college_physics": 47.1,
19
+ "computer_security": 64.0,
20
+ "conceptual_physics": 74.0,
21
+ "electrical_engineering": 70.3,
22
+ "elementary_mathematics": 52.6,
23
+ "high_school_biology": 85.2,
24
+ "high_school_chemistry": 69.0,
25
+ "high_school_computer_science": 75.0,
26
+ "high_school_mathematics": 27.8,
27
+ "high_school_physics": 61.6,
28
+ "high_school_statistics": 68.1,
29
+ "machine_learning": 61.6
30
+ },
31
+ "time_use_in_second": 410.0813932418823,
32
+ "time_use_in_minite": "6:50"
33
+ }
eval_results/global_step_30/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 39.7,
7
+ "pass_acc": 39.7,
8
+ "pass@k": {
9
+ "1": 39.7
10
+ },
11
+ "time_use_in_second": 607.3179030418396,
12
+ "time_use_in_minite": "10:07"
13
+ }
eval_results/global_step_40/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 1,
6
+ "acc": 92.4,
7
+ "pass_acc": 92.4,
8
+ "pass@k": {
9
+ "1": 92.4
10
+ },
11
+ "time_use_in_second": 228.25764894485474,
12
+ "time_use_in_minite": "3:48"
13
+ }
eval_results/global_step_40/math500/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 500,
3
+ "num_scores": 500,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 1,
6
+ "acc": 78.0,
7
+ "pass_acc": 78.0,
8
+ "pass@k": {
9
+ "1": 78.0
10
+ },
11
+ "time_use_in_second": 293.48533272743225,
12
+ "time_use_in_minite": "4:53"
13
+ }
eval_results/global_step_40/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results/global_step_40/minerva_math/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 272,
3
+ "num_scores": 272,
4
+ "timeout_samples": 1,
5
+ "empty_samples": 0,
6
+ "acc": 37.9,
7
+ "pass_acc": 37.9,
8
+ "pass@k": {
9
+ "1": 37.9
10
+ },
11
+ "type_acc": {
12
+ "Differential Equations (18.03 Spring 2010)": 58.3,
13
+ "Dynamics and Control (2.003 Spring 2005)": 53.8,
14
+ "Ecology I (1.018J Fall 2009)": 40.0,
15
+ "Information and Entropy (6.050J Spring 2008)": 66.7,
16
+ "Introduction to Astronomy (8.282J Spring 2006)": 30.2,
17
+ "Introduction to Solid State Chemistry (3.091 Fall 2010)": 24.7,
18
+ "Physical Chemistry (5.61 Fall 2017)": 18.2,
19
+ "Principles of Microeconomics (14.01 Fall 2011)": 61.1,
20
+ "Relativity (8.033 Fall 2006)": 36.4
21
+ },
22
+ "time_use_in_second": 71.62848281860352,
23
+ "time_use_in_minite": "1:11"
24
+ }
eval_results/global_step_40/mmlu_stem/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 3018,
3
+ "num_scores": 3018,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 65.0,
7
+ "pass_acc": 65.0,
8
+ "pass@k": {
9
+ "1": 65.0
10
+ },
11
+ "type_acc": {
12
+ "abstract_algebra": 64.0,
13
+ "astronomy": 82.2,
14
+ "college_biology": 86.8,
15
+ "college_chemistry": 57.0,
16
+ "college_computer_science": 65.0,
17
+ "college_mathematics": 49.0,
18
+ "college_physics": 52.0,
19
+ "computer_security": 58.0,
20
+ "conceptual_physics": 77.0,
21
+ "electrical_engineering": 69.0,
22
+ "elementary_mathematics": 56.1,
23
+ "high_school_biology": 83.2,
24
+ "high_school_chemistry": 68.5,
25
+ "high_school_computer_science": 85.0,
26
+ "high_school_mathematics": 27.8,
27
+ "high_school_physics": 66.2,
28
+ "high_school_statistics": 68.1,
29
+ "machine_learning": 62.5
30
+ },
31
+ "time_use_in_second": 479.89184403419495,
32
+ "time_use_in_minite": "7:59"
33
+ }
eval_results/global_step_40/olympiadbench/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 675,
3
+ "num_scores": 675,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 40.0,
7
+ "pass_acc": 40.0,
8
+ "pass@k": {
9
+ "1": 40.0
10
+ },
11
+ "time_use_in_second": 679.7414863109589,
12
+ "time_use_in_minite": "11:19"
13
+ }
eval_results/global_step_50/gsm8k/test_qwen-boxed_-1_seed0_t1.0_s0_e-1_metrics.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "num_samples": 1319,
3
+ "num_scores": 1319,
4
+ "timeout_samples": 0,
5
+ "empty_samples": 0,
6
+ "acc": 92.2,
7
+ "pass_acc": 92.2,
8
+ "pass@k": {
9
+ "1": 92.2
10
+ },
11
+ "time_use_in_second": 227.82489013671875,
12
+ "time_use_in_minite": "3:47"
13
+ }
eval_results_avg32/global_step_10/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_100/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_100/aime25/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_20/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
eval_results_avg32/global_step_30/aime24/test_qwen-boxed_-1_seed0_t1.0_s0_e-1.jsonl ADDED
The diff for this file is too large to render. See raw diff