{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.2, "eval_steps": 100, "global_step": 2000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.7081545192748309, "epoch": 0.016, "grad_norm": 119.5, "learning_rate": 6.000000000000001e-07, "loss": 30.4211, "mean_token_accuracy": 0.7030085142701864, "num_tokens": 338352.0, "step": 10 }, { "entropy": 0.7043411549180746, "epoch": 0.032, "grad_norm": 111.5, "learning_rate": 1.2666666666666669e-06, "loss": 30.1214, "mean_token_accuracy": 0.7080705553293228, "num_tokens": 671193.0, "step": 20 }, { "entropy": 0.7044402219355106, "epoch": 0.048, "grad_norm": 106.0, "learning_rate": 1.9333333333333336e-06, "loss": 29.7341, "mean_token_accuracy": 0.7064134150743484, "num_tokens": 1001008.0, "step": 30 }, { "entropy": 0.733870861120522, "epoch": 0.064, "grad_norm": 96.0, "learning_rate": 2.6e-06, "loss": 30.1494, "mean_token_accuracy": 0.7005880128592252, "num_tokens": 1331208.0, "step": 40 }, { "entropy": 0.7398215295746923, "epoch": 0.08, "grad_norm": 90.5, "learning_rate": 3.266666666666667e-06, "loss": 28.7202, "mean_token_accuracy": 0.7051243670284748, "num_tokens": 1667532.0, "step": 50 }, { "entropy": 0.738157893717289, "epoch": 0.096, "grad_norm": 76.5, "learning_rate": 3.9333333333333335e-06, "loss": 27.1633, "mean_token_accuracy": 0.71303277797997, "num_tokens": 2007176.0, "step": 60 }, { "entropy": 0.8131938450038433, "epoch": 0.112, "grad_norm": 57.5, "learning_rate": 4.600000000000001e-06, "loss": 26.4428, "mean_token_accuracy": 0.7120848346501589, "num_tokens": 2342259.0, "step": 70 }, { "entropy": 0.9618701986968518, "epoch": 0.128, "grad_norm": 43.75, "learning_rate": 5.2666666666666665e-06, "loss": 25.9908, "mean_token_accuracy": 0.7077770136296749, "num_tokens": 2663987.0, "step": 80 }, { "entropy": 1.0031472396105527, "epoch": 0.144, "grad_norm": 30.125, "learning_rate": 5.933333333333335e-06, "loss": 23.3524, "mean_token_accuracy": 0.7268724206835031, "num_tokens": 2997824.0, "step": 90 }, { "entropy": 1.0786659345030785, "epoch": 0.16, "grad_norm": 20.75, "learning_rate": 6.600000000000001e-06, "loss": 21.8746, "mean_token_accuracy": 0.7358953565359115, "num_tokens": 3330597.0, "step": 100 }, { "epoch": 0.16, "eval_biology_entropy": 3.260982286453247, "eval_biology_loss": 3.7093346118927, "eval_biology_mean_token_accuracy": 0.46263515305519104, "eval_biology_num_tokens": 3330597.0, "eval_biology_runtime": 57.0058, "eval_biology_samples_per_second": 8.771, "eval_biology_steps_per_second": 2.193, "step": 100 }, { "epoch": 0.16, "eval_chemistry_entropy": 1.6407185397148132, "eval_chemistry_loss": 1.7927597761154175, "eval_chemistry_mean_token_accuracy": 0.677183347940445, "eval_chemistry_num_tokens": 3330597.0, "eval_chemistry_runtime": 71.6705, "eval_chemistry_samples_per_second": 6.976, "eval_chemistry_steps_per_second": 1.744, "step": 100 }, { "epoch": 0.16, "eval_math_entropy": 0.9620108480453491, "eval_math_loss": 1.4342334270477295, "eval_math_mean_token_accuracy": 0.7288489332199096, "eval_math_num_tokens": 3330597.0, "eval_math_runtime": 74.2141, "eval_math_samples_per_second": 6.737, "eval_math_steps_per_second": 1.684, "step": 100 }, { "epoch": 0.16, "eval_physics_entropy": 1.1280243759155273, "eval_physics_loss": 1.3446351289749146, "eval_physics_mean_token_accuracy": 0.7332401766777038, "eval_physics_num_tokens": 3330597.0, "eval_physics_runtime": 84.9827, "eval_physics_samples_per_second": 5.884, "eval_physics_steps_per_second": 1.471, "step": 100 }, { "entropy": 1.1533773414790631, "epoch": 0.176, "grad_norm": 19.75, "learning_rate": 7.266666666666668e-06, "loss": 21.3285, "mean_token_accuracy": 0.7356826655566693, "num_tokens": 3658264.0, "step": 110 }, { "entropy": 1.112282781302929, "epoch": 0.192, "grad_norm": 12.625, "learning_rate": 7.933333333333334e-06, "loss": 20.4084, "mean_token_accuracy": 0.7426700782030821, "num_tokens": 3995568.0, "step": 120 }, { "entropy": 1.086438114568591, "epoch": 0.208, "grad_norm": 13.875, "learning_rate": 8.6e-06, "loss": 19.3962, "mean_token_accuracy": 0.7525120593607426, "num_tokens": 4321436.0, "step": 130 }, { "entropy": 1.089601282030344, "epoch": 0.224, "grad_norm": 11.8125, "learning_rate": 9.266666666666667e-06, "loss": 19.0472, "mean_token_accuracy": 0.7530231148004531, "num_tokens": 4648491.0, "step": 140 }, { "entropy": 1.0804125092923642, "epoch": 0.24, "grad_norm": 11.8125, "learning_rate": 9.933333333333334e-06, "loss": 18.7739, "mean_token_accuracy": 0.7554640270769596, "num_tokens": 4986175.0, "step": 150 }, { "entropy": 1.0836318269371987, "epoch": 0.256, "grad_norm": 11.9375, "learning_rate": 1.0600000000000002e-05, "loss": 18.4704, "mean_token_accuracy": 0.7584241010248661, "num_tokens": 5329320.0, "step": 160 }, { "entropy": 1.0882270745933056, "epoch": 0.272, "grad_norm": 13.375, "learning_rate": 1.1266666666666668e-05, "loss": 18.6237, "mean_token_accuracy": 0.754904105886817, "num_tokens": 5658796.0, "step": 170 }, { "entropy": 1.076054659858346, "epoch": 0.288, "grad_norm": 11.0625, "learning_rate": 1.1933333333333335e-05, "loss": 18.4139, "mean_token_accuracy": 0.7581049785017967, "num_tokens": 5980201.0, "step": 180 }, { "entropy": 1.072243456915021, "epoch": 0.304, "grad_norm": 15.5, "learning_rate": 1.2600000000000001e-05, "loss": 18.1532, "mean_token_accuracy": 0.7589787419885397, "num_tokens": 6298858.0, "step": 190 }, { "entropy": 1.0509168311953545, "epoch": 0.32, "grad_norm": 12.0, "learning_rate": 1.3266666666666668e-05, "loss": 17.8979, "mean_token_accuracy": 0.7612847778946161, "num_tokens": 6622798.0, "step": 200 }, { "epoch": 0.32, "eval_biology_entropy": 2.7866374349594114, "eval_biology_loss": 3.2049155235290527, "eval_biology_mean_token_accuracy": 0.502942953824997, "eval_biology_num_tokens": 6622798.0, "eval_biology_runtime": 56.7587, "eval_biology_samples_per_second": 8.809, "eval_biology_steps_per_second": 2.202, "step": 200 }, { "epoch": 0.32, "eval_chemistry_entropy": 1.41602649641037, "eval_chemistry_loss": 1.4916778802871704, "eval_chemistry_mean_token_accuracy": 0.7079435839653015, "eval_chemistry_num_tokens": 6622798.0, "eval_chemistry_runtime": 71.0228, "eval_chemistry_samples_per_second": 7.04, "eval_chemistry_steps_per_second": 1.76, "step": 200 }, { "epoch": 0.32, "eval_math_entropy": 0.9821910600662231, "eval_math_loss": 1.256189227104187, "eval_math_mean_token_accuracy": 0.7425830450057983, "eval_math_num_tokens": 6622798.0, "eval_math_runtime": 75.4477, "eval_math_samples_per_second": 6.627, "eval_math_steps_per_second": 1.657, "step": 200 }, { "epoch": 0.32, "eval_physics_entropy": 1.0466554942131043, "eval_physics_loss": 1.1129223108291626, "eval_physics_mean_token_accuracy": 0.7617223796844482, "eval_physics_num_tokens": 6622798.0, "eval_physics_runtime": 85.3386, "eval_physics_samples_per_second": 5.859, "eval_physics_steps_per_second": 1.465, "step": 200 }, { "entropy": 1.0462193094193935, "epoch": 0.336, "grad_norm": 11.875, "learning_rate": 1.3933333333333334e-05, "loss": 17.7296, "mean_token_accuracy": 0.7623197592794895, "num_tokens": 6953611.0, "step": 210 }, { "entropy": 1.0164767142385245, "epoch": 0.352, "grad_norm": 11.25, "learning_rate": 1.46e-05, "loss": 17.1525, "mean_token_accuracy": 0.7695928748697043, "num_tokens": 7290378.0, "step": 220 }, { "entropy": 0.9987690806388855, "epoch": 0.368, "grad_norm": 11.4375, "learning_rate": 1.5266666666666667e-05, "loss": 16.9228, "mean_token_accuracy": 0.7714470315724611, "num_tokens": 7621082.0, "step": 230 }, { "entropy": 1.0506686847656965, "epoch": 0.384, "grad_norm": 11.3125, "learning_rate": 1.5933333333333336e-05, "loss": 17.6416, "mean_token_accuracy": 0.7599896423518657, "num_tokens": 7955570.0, "step": 240 }, { "entropy": 1.0135670874267817, "epoch": 0.4, "grad_norm": 14.5, "learning_rate": 1.66e-05, "loss": 16.9698, "mean_token_accuracy": 0.7687441952526569, "num_tokens": 8291049.0, "step": 250 }, { "entropy": 0.9876859273761511, "epoch": 0.416, "grad_norm": 13.75, "learning_rate": 1.726666666666667e-05, "loss": 16.6909, "mean_token_accuracy": 0.7725014701485634, "num_tokens": 8627310.0, "step": 260 }, { "entropy": 1.0216444052755833, "epoch": 0.432, "grad_norm": 12.125, "learning_rate": 1.7933333333333333e-05, "loss": 17.011, "mean_token_accuracy": 0.7689431305974722, "num_tokens": 8958371.0, "step": 270 }, { "entropy": 0.9649551097303629, "epoch": 0.448, "grad_norm": 12.1875, "learning_rate": 1.86e-05, "loss": 16.2252, "mean_token_accuracy": 0.7780237648636102, "num_tokens": 9294388.0, "step": 280 }, { "entropy": 0.9653072291985154, "epoch": 0.464, "grad_norm": 12.5, "learning_rate": 1.926666666666667e-05, "loss": 16.1144, "mean_token_accuracy": 0.7772431872785092, "num_tokens": 9620366.0, "step": 290 }, { "entropy": 0.9695413928478956, "epoch": 0.48, "grad_norm": 11.8125, "learning_rate": 1.9933333333333334e-05, "loss": 16.4447, "mean_token_accuracy": 0.7733453687280416, "num_tokens": 9955431.0, "step": 300 }, { "epoch": 0.48, "eval_biology_entropy": 2.7462952222824097, "eval_biology_loss": 3.060053825378418, "eval_biology_mean_token_accuracy": 0.5114872539043427, "eval_biology_num_tokens": 9955431.0, "eval_biology_runtime": 56.9805, "eval_biology_samples_per_second": 8.775, "eval_biology_steps_per_second": 2.194, "step": 300 }, { "epoch": 0.48, "eval_chemistry_entropy": 1.3710836601257324, "eval_chemistry_loss": 1.4066872596740723, "eval_chemistry_mean_token_accuracy": 0.7168692064285278, "eval_chemistry_num_tokens": 9955431.0, "eval_chemistry_runtime": 71.3717, "eval_chemistry_samples_per_second": 7.006, "eval_chemistry_steps_per_second": 1.751, "step": 300 }, { "epoch": 0.48, "eval_math_entropy": 0.9784291224479675, "eval_math_loss": 1.2082456350326538, "eval_math_mean_token_accuracy": 0.7478420066833497, "eval_math_num_tokens": 9955431.0, "eval_math_runtime": 74.3706, "eval_math_samples_per_second": 6.723, "eval_math_steps_per_second": 1.681, "step": 300 }, { "epoch": 0.48, "eval_physics_entropy": 1.007008915424347, "eval_physics_loss": 1.0401662588119507, "eval_physics_mean_token_accuracy": 0.7707204632759094, "eval_physics_num_tokens": 9955431.0, "eval_physics_runtime": 84.6939, "eval_physics_samples_per_second": 5.904, "eval_physics_steps_per_second": 1.476, "step": 300 }, { "entropy": 1.007104156538844, "epoch": 0.496, "grad_norm": 10.5, "learning_rate": 1.9933333333333334e-05, "loss": 16.6639, "mean_token_accuracy": 0.771253039687872, "num_tokens": 10289883.0, "step": 310 }, { "entropy": 0.9723599992692471, "epoch": 0.512, "grad_norm": 12.4375, "learning_rate": 1.985925925925926e-05, "loss": 16.4387, "mean_token_accuracy": 0.7748051699250936, "num_tokens": 10619468.0, "step": 320 }, { "entropy": 0.9979474730789661, "epoch": 0.528, "grad_norm": 10.8125, "learning_rate": 1.9785185185185187e-05, "loss": 16.6398, "mean_token_accuracy": 0.7712166555225849, "num_tokens": 10944904.0, "step": 330 }, { "entropy": 0.9800913408398628, "epoch": 0.544, "grad_norm": 11.625, "learning_rate": 1.971111111111111e-05, "loss": 16.374, "mean_token_accuracy": 0.7737775973975658, "num_tokens": 11266409.0, "step": 340 }, { "entropy": 0.9599094491451978, "epoch": 0.56, "grad_norm": 10.625, "learning_rate": 1.963703703703704e-05, "loss": 16.0194, "mean_token_accuracy": 0.778597004711628, "num_tokens": 11605544.0, "step": 350 }, { "entropy": 0.948445113003254, "epoch": 0.576, "grad_norm": 11.75, "learning_rate": 1.9562962962962964e-05, "loss": 15.9791, "mean_token_accuracy": 0.7805619373917579, "num_tokens": 11931715.0, "step": 360 }, { "entropy": 0.9618662863969802, "epoch": 0.592, "grad_norm": 13.0625, "learning_rate": 1.948888888888889e-05, "loss": 16.0713, "mean_token_accuracy": 0.776887471601367, "num_tokens": 12257243.0, "step": 370 }, { "entropy": 0.9480277199298144, "epoch": 0.608, "grad_norm": 10.8125, "learning_rate": 1.9414814814814817e-05, "loss": 15.8656, "mean_token_accuracy": 0.7797395702451467, "num_tokens": 12583154.0, "step": 380 }, { "entropy": 0.9716805059462785, "epoch": 0.624, "grad_norm": 10.625, "learning_rate": 1.9340740740740743e-05, "loss": 16.1886, "mean_token_accuracy": 0.7762768242508173, "num_tokens": 12905392.0, "step": 390 }, { "entropy": 0.9691856369376183, "epoch": 0.64, "grad_norm": 11.8125, "learning_rate": 1.926666666666667e-05, "loss": 16.2314, "mean_token_accuracy": 0.7761619180440903, "num_tokens": 13232198.0, "step": 400 }, { "epoch": 0.64, "eval_biology_entropy": 2.67273916721344, "eval_biology_loss": 2.96358585357666, "eval_biology_mean_token_accuracy": 0.5216054661273957, "eval_biology_num_tokens": 13232198.0, "eval_biology_runtime": 57.8851, "eval_biology_samples_per_second": 8.638, "eval_biology_steps_per_second": 2.159, "step": 400 }, { "epoch": 0.64, "eval_chemistry_entropy": 1.3173069462776184, "eval_chemistry_loss": 1.36112380027771, "eval_chemistry_mean_token_accuracy": 0.7232093935012818, "eval_chemistry_num_tokens": 13232198.0, "eval_chemistry_runtime": 71.8687, "eval_chemistry_samples_per_second": 6.957, "eval_chemistry_steps_per_second": 1.739, "step": 400 }, { "epoch": 0.64, "eval_math_entropy": 0.9528153438568115, "eval_math_loss": 1.1841319799423218, "eval_math_mean_token_accuracy": 0.7507999067306519, "eval_math_num_tokens": 13232198.0, "eval_math_runtime": 74.8756, "eval_math_samples_per_second": 6.678, "eval_math_steps_per_second": 1.669, "step": 400 }, { "epoch": 0.64, "eval_physics_entropy": 0.9614342203140259, "eval_physics_loss": 1.0016775131225586, "eval_physics_mean_token_accuracy": 0.7765112986564636, "eval_physics_num_tokens": 13232198.0, "eval_physics_runtime": 85.1175, "eval_physics_samples_per_second": 5.874, "eval_physics_steps_per_second": 1.469, "step": 400 }, { "entropy": 0.9530647125095129, "epoch": 0.656, "grad_norm": 11.375, "learning_rate": 1.9192592592592593e-05, "loss": 15.9429, "mean_token_accuracy": 0.7780249908566474, "num_tokens": 13575902.0, "step": 410 }, { "entropy": 0.9399624351412058, "epoch": 0.672, "grad_norm": 12.0625, "learning_rate": 1.911851851851852e-05, "loss": 15.7284, "mean_token_accuracy": 0.7817466016858816, "num_tokens": 13895997.0, "step": 420 }, { "entropy": 0.9540142439305782, "epoch": 0.688, "grad_norm": 10.6875, "learning_rate": 1.9044444444444446e-05, "loss": 15.7763, "mean_token_accuracy": 0.7817316662520171, "num_tokens": 14234888.0, "step": 430 }, { "entropy": 0.9538515221327544, "epoch": 0.704, "grad_norm": 11.625, "learning_rate": 1.8970370370370372e-05, "loss": 16.0838, "mean_token_accuracy": 0.775813952833414, "num_tokens": 14567908.0, "step": 440 }, { "entropy": 0.9464580094441771, "epoch": 0.72, "grad_norm": 12.375, "learning_rate": 1.8896296296296295e-05, "loss": 15.704, "mean_token_accuracy": 0.7821726374328136, "num_tokens": 14882927.0, "step": 450 }, { "entropy": 0.9363573594018817, "epoch": 0.736, "grad_norm": 13.0625, "learning_rate": 1.8822222222222225e-05, "loss": 15.7075, "mean_token_accuracy": 0.7809599131345749, "num_tokens": 15217342.0, "step": 460 }, { "entropy": 0.9193624388426542, "epoch": 0.752, "grad_norm": 10.4375, "learning_rate": 1.874814814814815e-05, "loss": 15.3677, "mean_token_accuracy": 0.7846659943461418, "num_tokens": 15547710.0, "step": 470 }, { "entropy": 0.9613647162914276, "epoch": 0.768, "grad_norm": 11.1875, "learning_rate": 1.8674074074074075e-05, "loss": 16.0934, "mean_token_accuracy": 0.7759233452379704, "num_tokens": 15877177.0, "step": 480 }, { "entropy": 0.9139260027557612, "epoch": 0.784, "grad_norm": 10.1875, "learning_rate": 1.86e-05, "loss": 15.3216, "mean_token_accuracy": 0.786144433543086, "num_tokens": 16219640.0, "step": 490 }, { "entropy": 0.9349881060421467, "epoch": 0.8, "grad_norm": 12.0, "learning_rate": 1.8525925925925928e-05, "loss": 15.6534, "mean_token_accuracy": 0.7819662269204855, "num_tokens": 16548261.0, "step": 500 }, { "epoch": 0.8, "eval_biology_entropy": 2.617759398460388, "eval_biology_loss": 2.9445557594299316, "eval_biology_mean_token_accuracy": 0.5238966343402862, "eval_biology_num_tokens": 16548261.0, "eval_biology_runtime": 57.6905, "eval_biology_samples_per_second": 8.667, "eval_biology_steps_per_second": 2.167, "step": 500 }, { "epoch": 0.8, "eval_chemistry_entropy": 1.2937732858657838, "eval_chemistry_loss": 1.3414490222930908, "eval_chemistry_mean_token_accuracy": 0.7248484263420105, "eval_chemistry_num_tokens": 16548261.0, "eval_chemistry_runtime": 71.1659, "eval_chemistry_samples_per_second": 7.026, "eval_chemistry_steps_per_second": 1.756, "step": 500 }, { "epoch": 0.8, "eval_math_entropy": 0.93889684009552, "eval_math_loss": 1.17385995388031, "eval_math_mean_token_accuracy": 0.7522515163421631, "eval_math_num_tokens": 16548261.0, "eval_math_runtime": 74.7142, "eval_math_samples_per_second": 6.692, "eval_math_steps_per_second": 1.673, "step": 500 }, { "epoch": 0.8, "eval_physics_entropy": 0.9420124840736389, "eval_physics_loss": 0.9809063673019409, "eval_physics_mean_token_accuracy": 0.7797514395713806, "eval_physics_num_tokens": 16548261.0, "eval_physics_runtime": 84.5204, "eval_physics_samples_per_second": 5.916, "eval_physics_steps_per_second": 1.479, "step": 500 }, { "entropy": 0.9288215478882194, "epoch": 0.816, "grad_norm": 11.3125, "learning_rate": 1.8451851851851855e-05, "loss": 15.5725, "mean_token_accuracy": 0.7813547737896442, "num_tokens": 16870533.0, "step": 510 }, { "entropy": 0.9446330968290567, "epoch": 0.832, "grad_norm": 15.875, "learning_rate": 1.8377777777777778e-05, "loss": 15.7952, "mean_token_accuracy": 0.7800830528140068, "num_tokens": 17196371.0, "step": 520 }, { "entropy": 0.9318377874791622, "epoch": 0.848, "grad_norm": 10.625, "learning_rate": 1.8303703703703704e-05, "loss": 15.4484, "mean_token_accuracy": 0.7836398217827082, "num_tokens": 17527066.0, "step": 530 }, { "entropy": 0.9298348639160394, "epoch": 0.864, "grad_norm": 10.625, "learning_rate": 1.822962962962963e-05, "loss": 15.6026, "mean_token_accuracy": 0.7814645994454622, "num_tokens": 17859605.0, "step": 540 }, { "entropy": 0.9549105998128653, "epoch": 0.88, "grad_norm": 12.625, "learning_rate": 1.8155555555555557e-05, "loss": 15.7997, "mean_token_accuracy": 0.7805559232831001, "num_tokens": 18187315.0, "step": 550 }, { "entropy": 0.8820257507264614, "epoch": 0.896, "grad_norm": 10.5625, "learning_rate": 1.8081481481481484e-05, "loss": 14.7783, "mean_token_accuracy": 0.7907331828027964, "num_tokens": 18536790.0, "step": 560 }, { "entropy": 0.9362711973488331, "epoch": 0.912, "grad_norm": 12.5625, "learning_rate": 1.800740740740741e-05, "loss": 15.6226, "mean_token_accuracy": 0.7802051860839129, "num_tokens": 18869344.0, "step": 570 }, { "entropy": 0.9295620009303093, "epoch": 0.928, "grad_norm": 12.75, "learning_rate": 1.7933333333333333e-05, "loss": 15.5339, "mean_token_accuracy": 0.7815026968717576, "num_tokens": 19199834.0, "step": 580 }, { "entropy": 0.9112746389582753, "epoch": 0.944, "grad_norm": 12.5, "learning_rate": 1.785925925925926e-05, "loss": 15.2153, "mean_token_accuracy": 0.7855059750378132, "num_tokens": 19525385.0, "step": 590 }, { "entropy": 0.9181267462670804, "epoch": 0.96, "grad_norm": 12.75, "learning_rate": 1.7785185185185186e-05, "loss": 15.3751, "mean_token_accuracy": 0.7829241104424, "num_tokens": 19856973.0, "step": 600 }, { "epoch": 0.96, "eval_biology_entropy": 2.594178828239441, "eval_biology_loss": 2.9207751750946045, "eval_biology_mean_token_accuracy": 0.5271391036510468, "eval_biology_num_tokens": 19856973.0, "eval_biology_runtime": 57.0207, "eval_biology_samples_per_second": 8.769, "eval_biology_steps_per_second": 2.192, "step": 600 }, { "epoch": 0.96, "eval_chemistry_entropy": 1.270768217086792, "eval_chemistry_loss": 1.3242253065109253, "eval_chemistry_mean_token_accuracy": 0.7280859136581421, "eval_chemistry_num_tokens": 19856973.0, "eval_chemistry_runtime": 72.006, "eval_chemistry_samples_per_second": 6.944, "eval_chemistry_steps_per_second": 1.736, "step": 600 }, { "epoch": 0.96, "eval_math_entropy": 0.9325860266685486, "eval_math_loss": 1.1655522584915161, "eval_math_mean_token_accuracy": 0.7533689908981324, "eval_math_num_tokens": 19856973.0, "eval_math_runtime": 74.0585, "eval_math_samples_per_second": 6.751, "eval_math_steps_per_second": 1.688, "step": 600 }, { "epoch": 0.96, "eval_physics_entropy": 0.9269161529541016, "eval_physics_loss": 0.9667912125587463, "eval_physics_mean_token_accuracy": 0.7814396271705627, "eval_physics_num_tokens": 19856973.0, "eval_physics_runtime": 84.5863, "eval_physics_samples_per_second": 5.911, "eval_physics_steps_per_second": 1.478, "step": 600 }, { "entropy": 0.9162795972079039, "epoch": 0.976, "grad_norm": 12.125, "learning_rate": 1.7711111111111113e-05, "loss": 15.2813, "mean_token_accuracy": 0.7860829800367355, "num_tokens": 20190600.0, "step": 610 }, { "entropy": 0.9235894799232482, "epoch": 0.992, "grad_norm": 11.5, "learning_rate": 1.763703703703704e-05, "loss": 15.4241, "mean_token_accuracy": 0.784028148651123, "num_tokens": 20518014.0, "step": 620 }, { "entropy": 0.9041305113583803, "epoch": 1.008, "grad_norm": 11.0625, "learning_rate": 1.7562962962962962e-05, "loss": 14.9099, "mean_token_accuracy": 0.7884359251707792, "num_tokens": 20859215.0, "step": 630 }, { "entropy": 0.9018714495003224, "epoch": 1.024, "grad_norm": 11.1875, "learning_rate": 1.7488888888888892e-05, "loss": 15.1658, "mean_token_accuracy": 0.7857498530298471, "num_tokens": 21186658.0, "step": 640 }, { "entropy": 0.916073745302856, "epoch": 1.04, "grad_norm": 10.8125, "learning_rate": 1.7414814814814815e-05, "loss": 15.1367, "mean_token_accuracy": 0.7867844242602586, "num_tokens": 21517237.0, "step": 650 }, { "entropy": 0.8802109774202108, "epoch": 1.056, "grad_norm": 11.0, "learning_rate": 1.7340740740740742e-05, "loss": 14.7961, "mean_token_accuracy": 0.7896511305123568, "num_tokens": 21852528.0, "step": 660 }, { "entropy": 0.9102928217500448, "epoch": 1.072, "grad_norm": 10.0, "learning_rate": 1.726666666666667e-05, "loss": 15.1014, "mean_token_accuracy": 0.7862772591412067, "num_tokens": 22192380.0, "step": 670 }, { "entropy": 0.9107371259480714, "epoch": 1.088, "grad_norm": 13.4375, "learning_rate": 1.7192592592592595e-05, "loss": 15.2493, "mean_token_accuracy": 0.7848471954464913, "num_tokens": 22517971.0, "step": 680 }, { "entropy": 0.8938208010047675, "epoch": 1.104, "grad_norm": 10.6875, "learning_rate": 1.711851851851852e-05, "loss": 14.9017, "mean_token_accuracy": 0.7887144055217504, "num_tokens": 22848694.0, "step": 690 }, { "entropy": 0.9129897281527519, "epoch": 1.12, "grad_norm": 11.4375, "learning_rate": 1.7044444444444445e-05, "loss": 15.2722, "mean_token_accuracy": 0.7840389590710402, "num_tokens": 23183889.0, "step": 700 }, { "epoch": 1.12, "eval_biology_entropy": 2.5242818908691405, "eval_biology_loss": 2.9092111587524414, "eval_biology_mean_token_accuracy": 0.5278490686416626, "eval_biology_num_tokens": 23183889.0, "eval_biology_runtime": 57.2117, "eval_biology_samples_per_second": 8.739, "eval_biology_steps_per_second": 2.185, "step": 700 }, { "epoch": 1.12, "eval_chemistry_entropy": 1.2367294158935547, "eval_chemistry_loss": 1.3155030012130737, "eval_chemistry_mean_token_accuracy": 0.7286214451789856, "eval_chemistry_num_tokens": 23183889.0, "eval_chemistry_runtime": 71.8224, "eval_chemistry_samples_per_second": 6.962, "eval_chemistry_steps_per_second": 1.74, "step": 700 }, { "epoch": 1.12, "eval_math_entropy": 0.9127886519432068, "eval_math_loss": 1.1603002548217773, "eval_math_mean_token_accuracy": 0.754603410243988, "eval_math_num_tokens": 23183889.0, "eval_math_runtime": 74.5403, "eval_math_samples_per_second": 6.708, "eval_math_steps_per_second": 1.677, "step": 700 }, { "epoch": 1.12, "eval_physics_entropy": 0.9048804240226745, "eval_physics_loss": 0.9561019539833069, "eval_physics_mean_token_accuracy": 0.7832872924804688, "eval_physics_num_tokens": 23183889.0, "eval_physics_runtime": 84.6107, "eval_physics_samples_per_second": 5.909, "eval_physics_steps_per_second": 1.477, "step": 700 }, { "entropy": 0.8741975981742144, "epoch": 1.1360000000000001, "grad_norm": 10.1875, "learning_rate": 1.697037037037037e-05, "loss": 14.564, "mean_token_accuracy": 0.7930445209145546, "num_tokens": 23523903.0, "step": 710 }, { "entropy": 0.8917050797492265, "epoch": 1.152, "grad_norm": 12.75, "learning_rate": 1.6896296296296298e-05, "loss": 14.9557, "mean_token_accuracy": 0.7896463591605425, "num_tokens": 23848848.0, "step": 720 }, { "entropy": 0.9238279532641173, "epoch": 1.168, "grad_norm": 11.6875, "learning_rate": 1.6822222222222224e-05, "loss": 15.2756, "mean_token_accuracy": 0.7821665916591882, "num_tokens": 24174589.0, "step": 730 }, { "entropy": 0.9052068604156375, "epoch": 1.184, "grad_norm": 13.5625, "learning_rate": 1.6748148148148147e-05, "loss": 15.0924, "mean_token_accuracy": 0.7870742384344339, "num_tokens": 24495981.0, "step": 740 }, { "entropy": 0.877324876561761, "epoch": 1.2, "grad_norm": 11.1875, "learning_rate": 1.6674074074074077e-05, "loss": 14.7224, "mean_token_accuracy": 0.7919886518269778, "num_tokens": 24826245.0, "step": 750 }, { "entropy": 0.9162866469472647, "epoch": 1.216, "grad_norm": 14.1875, "learning_rate": 1.66e-05, "loss": 15.1252, "mean_token_accuracy": 0.7850615303963423, "num_tokens": 25147775.0, "step": 760 }, { "entropy": 0.8838450819253921, "epoch": 1.232, "grad_norm": 12.1875, "learning_rate": 1.6525925925925927e-05, "loss": 14.8829, "mean_token_accuracy": 0.7893829133361578, "num_tokens": 25480033.0, "step": 770 }, { "entropy": 0.8952008984982968, "epoch": 1.248, "grad_norm": 12.875, "learning_rate": 1.6451851851851853e-05, "loss": 14.9213, "mean_token_accuracy": 0.7889137178659439, "num_tokens": 25811978.0, "step": 780 }, { "entropy": 0.8893922835588455, "epoch": 1.264, "grad_norm": 12.6875, "learning_rate": 1.637777777777778e-05, "loss": 14.7989, "mean_token_accuracy": 0.7897166382521391, "num_tokens": 26146299.0, "step": 790 }, { "entropy": 0.9172267891466618, "epoch": 1.28, "grad_norm": 11.75, "learning_rate": 1.6303703703703706e-05, "loss": 15.3289, "mean_token_accuracy": 0.782142074778676, "num_tokens": 26478207.0, "step": 800 }, { "epoch": 1.28, "eval_biology_entropy": 2.5403532466888428, "eval_biology_loss": 2.9071850776672363, "eval_biology_mean_token_accuracy": 0.529075275182724, "eval_biology_num_tokens": 26478207.0, "eval_biology_runtime": 56.934, "eval_biology_samples_per_second": 8.782, "eval_biology_steps_per_second": 2.196, "step": 800 }, { "epoch": 1.28, "eval_chemistry_entropy": 1.234187596321106, "eval_chemistry_loss": 1.306308388710022, "eval_chemistry_mean_token_accuracy": 0.7299280157089233, "eval_chemistry_num_tokens": 26478207.0, "eval_chemistry_runtime": 71.938, "eval_chemistry_samples_per_second": 6.95, "eval_chemistry_steps_per_second": 1.738, "step": 800 }, { "epoch": 1.28, "eval_math_entropy": 0.9124487805366516, "eval_math_loss": 1.1551406383514404, "eval_math_mean_token_accuracy": 0.7556050124168396, "eval_math_num_tokens": 26478207.0, "eval_math_runtime": 75.1703, "eval_math_samples_per_second": 6.652, "eval_math_steps_per_second": 1.663, "step": 800 }, { "epoch": 1.28, "eval_physics_entropy": 0.8998128218650818, "eval_physics_loss": 0.9489150047302246, "eval_physics_mean_token_accuracy": 0.7840672006607056, "eval_physics_num_tokens": 26478207.0, "eval_physics_runtime": 85.0147, "eval_physics_samples_per_second": 5.881, "eval_physics_steps_per_second": 1.47, "step": 800 }, { "entropy": 0.8760543465614319, "epoch": 1.296, "grad_norm": 12.8125, "learning_rate": 1.622962962962963e-05, "loss": 14.6611, "mean_token_accuracy": 0.7923007346689701, "num_tokens": 26806477.0, "step": 810 }, { "entropy": 0.917656009271741, "epoch": 1.312, "grad_norm": 13.1875, "learning_rate": 1.6155555555555556e-05, "loss": 15.2379, "mean_token_accuracy": 0.7841715186834335, "num_tokens": 27137528.0, "step": 820 }, { "entropy": 0.8967037841677665, "epoch": 1.328, "grad_norm": 13.25, "learning_rate": 1.6081481481481482e-05, "loss": 14.9513, "mean_token_accuracy": 0.7878007985651493, "num_tokens": 27465859.0, "step": 830 }, { "entropy": 0.901091538183391, "epoch": 1.3439999999999999, "grad_norm": 13.375, "learning_rate": 1.600740740740741e-05, "loss": 15.1044, "mean_token_accuracy": 0.7861052300781012, "num_tokens": 27795568.0, "step": 840 }, { "entropy": 0.9121165912598371, "epoch": 1.3599999999999999, "grad_norm": 11.625, "learning_rate": 1.5933333333333336e-05, "loss": 15.0779, "mean_token_accuracy": 0.7859285034239292, "num_tokens": 28129988.0, "step": 850 }, { "entropy": 0.8819791020825505, "epoch": 1.376, "grad_norm": 13.875, "learning_rate": 1.5859259259259262e-05, "loss": 14.7997, "mean_token_accuracy": 0.7886631991714239, "num_tokens": 28465903.0, "step": 860 }, { "entropy": 0.8854727063328027, "epoch": 1.392, "grad_norm": 13.25, "learning_rate": 1.5785185185185185e-05, "loss": 14.61, "mean_token_accuracy": 0.7927087835967541, "num_tokens": 28801554.0, "step": 870 }, { "entropy": 0.8729526123031974, "epoch": 1.408, "grad_norm": 12.4375, "learning_rate": 1.571111111111111e-05, "loss": 14.7562, "mean_token_accuracy": 0.7899845536798239, "num_tokens": 29125114.0, "step": 880 }, { "entropy": 0.913954821228981, "epoch": 1.424, "grad_norm": 10.9375, "learning_rate": 1.5637037037037038e-05, "loss": 15.2188, "mean_token_accuracy": 0.783445942774415, "num_tokens": 29454334.0, "step": 890 }, { "entropy": 0.8652864851057529, "epoch": 1.44, "grad_norm": 12.1875, "learning_rate": 1.5562962962962965e-05, "loss": 14.3528, "mean_token_accuracy": 0.794481047987938, "num_tokens": 29789253.0, "step": 900 }, { "epoch": 1.44, "eval_biology_entropy": 2.4760996789932253, "eval_biology_loss": 2.897955894470215, "eval_biology_mean_token_accuracy": 0.5296812095642089, "eval_biology_num_tokens": 29789253.0, "eval_biology_runtime": 57.0842, "eval_biology_samples_per_second": 8.759, "eval_biology_steps_per_second": 2.19, "step": 900 }, { "epoch": 1.44, "eval_chemistry_entropy": 1.2120891718864442, "eval_chemistry_loss": 1.2991372346878052, "eval_chemistry_mean_token_accuracy": 0.7307901701927185, "eval_chemistry_num_tokens": 29789253.0, "eval_chemistry_runtime": 71.5779, "eval_chemistry_samples_per_second": 6.985, "eval_chemistry_steps_per_second": 1.746, "step": 900 }, { "epoch": 1.44, "eval_math_entropy": 0.9021294908523559, "eval_math_loss": 1.1532059907913208, "eval_math_mean_token_accuracy": 0.755294855594635, "eval_math_num_tokens": 29789253.0, "eval_math_runtime": 74.8641, "eval_math_samples_per_second": 6.679, "eval_math_steps_per_second": 1.67, "step": 900 }, { "epoch": 1.44, "eval_physics_entropy": 0.8853664617538453, "eval_physics_loss": 0.9413084983825684, "eval_physics_mean_token_accuracy": 0.7854379539489746, "eval_physics_num_tokens": 29789253.0, "eval_physics_runtime": 84.3345, "eval_physics_samples_per_second": 5.929, "eval_physics_steps_per_second": 1.482, "step": 900 }, { "entropy": 0.8592871313914656, "epoch": 1.456, "grad_norm": 10.5, "learning_rate": 1.548888888888889e-05, "loss": 14.4092, "mean_token_accuracy": 0.7947866775095462, "num_tokens": 30128576.0, "step": 910 }, { "entropy": 0.9165356293320656, "epoch": 1.472, "grad_norm": 12.8125, "learning_rate": 1.5414814814814814e-05, "loss": 15.368, "mean_token_accuracy": 0.7829437594860792, "num_tokens": 30452719.0, "step": 920 }, { "entropy": 0.9012311644852161, "epoch": 1.488, "grad_norm": 10.9375, "learning_rate": 1.5340740740740744e-05, "loss": 14.9137, "mean_token_accuracy": 0.7879349775612354, "num_tokens": 30782343.0, "step": 930 }, { "entropy": 0.8793585889041424, "epoch": 1.504, "grad_norm": 11.75, "learning_rate": 1.5266666666666667e-05, "loss": 14.6612, "mean_token_accuracy": 0.7913552910089493, "num_tokens": 31113238.0, "step": 940 }, { "entropy": 0.896404268220067, "epoch": 1.52, "grad_norm": 12.25, "learning_rate": 1.5192592592592594e-05, "loss": 15.0177, "mean_token_accuracy": 0.7869206938892603, "num_tokens": 31444589.0, "step": 950 }, { "entropy": 0.9115363927558064, "epoch": 1.536, "grad_norm": 13.625, "learning_rate": 1.5118518518518519e-05, "loss": 15.1508, "mean_token_accuracy": 0.7850655887275935, "num_tokens": 31774137.0, "step": 960 }, { "entropy": 0.8715274870395661, "epoch": 1.552, "grad_norm": 12.5, "learning_rate": 1.5044444444444445e-05, "loss": 14.4724, "mean_token_accuracy": 0.7919801536947488, "num_tokens": 32101776.0, "step": 970 }, { "entropy": 0.8716113748028874, "epoch": 1.568, "grad_norm": 11.9375, "learning_rate": 1.497037037037037e-05, "loss": 14.5441, "mean_token_accuracy": 0.7928904049098492, "num_tokens": 32428021.0, "step": 980 }, { "entropy": 0.9078653456643224, "epoch": 1.584, "grad_norm": 12.375, "learning_rate": 1.4896296296296298e-05, "loss": 15.1365, "mean_token_accuracy": 0.7834811493754387, "num_tokens": 32752735.0, "step": 990 }, { "entropy": 0.8948801588267088, "epoch": 1.6, "grad_norm": 11.4375, "learning_rate": 1.4822222222222225e-05, "loss": 14.975, "mean_token_accuracy": 0.7872815534472466, "num_tokens": 33075822.0, "step": 1000 }, { "epoch": 1.6, "eval_biology_entropy": 2.508749726295471, "eval_biology_loss": 2.8778915405273438, "eval_biology_mean_token_accuracy": 0.5310802006721497, "eval_biology_num_tokens": 33075822.0, "eval_biology_runtime": 57.246, "eval_biology_samples_per_second": 8.734, "eval_biology_steps_per_second": 2.184, "step": 1000 }, { "epoch": 1.6, "eval_chemistry_entropy": 1.2282180700302123, "eval_chemistry_loss": 1.2942099571228027, "eval_chemistry_mean_token_accuracy": 0.7318216099739074, "eval_chemistry_num_tokens": 33075822.0, "eval_chemistry_runtime": 71.0784, "eval_chemistry_samples_per_second": 7.034, "eval_chemistry_steps_per_second": 1.759, "step": 1000 }, { "epoch": 1.6, "eval_math_entropy": 0.9116221733093262, "eval_math_loss": 1.1501303911209106, "eval_math_mean_token_accuracy": 0.7557219748497009, "eval_math_num_tokens": 33075822.0, "eval_math_runtime": 74.8337, "eval_math_samples_per_second": 6.681, "eval_math_steps_per_second": 1.67, "step": 1000 }, { "epoch": 1.6, "eval_physics_entropy": 0.8932713055610657, "eval_physics_loss": 0.9365818500518799, "eval_physics_mean_token_accuracy": 0.785537106513977, "eval_physics_num_tokens": 33075822.0, "eval_physics_runtime": 84.6443, "eval_physics_samples_per_second": 5.907, "eval_physics_steps_per_second": 1.477, "step": 1000 }, { "entropy": 0.8604803189635277, "epoch": 1.616, "grad_norm": 12.5, "learning_rate": 1.474814814814815e-05, "loss": 14.3639, "mean_token_accuracy": 0.7951964545994997, "num_tokens": 33411850.0, "step": 1010 }, { "entropy": 0.8778028018772602, "epoch": 1.6320000000000001, "grad_norm": 12.5625, "learning_rate": 1.4674074074074076e-05, "loss": 14.6452, "mean_token_accuracy": 0.7921328764408827, "num_tokens": 33739456.0, "step": 1020 }, { "entropy": 0.853967827372253, "epoch": 1.6480000000000001, "grad_norm": 12.375, "learning_rate": 1.46e-05, "loss": 14.2504, "mean_token_accuracy": 0.7954076964408159, "num_tokens": 34072022.0, "step": 1030 }, { "entropy": 0.8785163436084986, "epoch": 1.6640000000000001, "grad_norm": 13.5625, "learning_rate": 1.4525925925925927e-05, "loss": 14.7032, "mean_token_accuracy": 0.7915361914783716, "num_tokens": 34398959.0, "step": 1040 }, { "entropy": 0.8694784231483936, "epoch": 1.6800000000000002, "grad_norm": 11.1875, "learning_rate": 1.4451851851851852e-05, "loss": 14.3839, "mean_token_accuracy": 0.7940326742827892, "num_tokens": 34732284.0, "step": 1050 }, { "entropy": 0.8717694684863091, "epoch": 1.696, "grad_norm": 13.1875, "learning_rate": 1.4377777777777779e-05, "loss": 14.6494, "mean_token_accuracy": 0.7915462471544743, "num_tokens": 35053712.0, "step": 1060 }, { "entropy": 0.8986424550414085, "epoch": 1.712, "grad_norm": 14.4375, "learning_rate": 1.4303703703703703e-05, "loss": 14.8976, "mean_token_accuracy": 0.7876383919268847, "num_tokens": 35393088.0, "step": 1070 }, { "entropy": 0.8717451239004731, "epoch": 1.728, "grad_norm": 12.6875, "learning_rate": 1.4229629629629632e-05, "loss": 14.5646, "mean_token_accuracy": 0.7918220609426498, "num_tokens": 35729007.0, "step": 1080 }, { "entropy": 0.8697011994197965, "epoch": 1.744, "grad_norm": 13.5, "learning_rate": 1.4155555555555556e-05, "loss": 14.5182, "mean_token_accuracy": 0.7922107793390751, "num_tokens": 36072489.0, "step": 1090 }, { "entropy": 0.8740223359316588, "epoch": 1.76, "grad_norm": 12.0, "learning_rate": 1.4081481481481483e-05, "loss": 14.4753, "mean_token_accuracy": 0.7926081418991089, "num_tokens": 36398285.0, "step": 1100 }, { "epoch": 1.76, "eval_biology_entropy": 2.487686212539673, "eval_biology_loss": 2.874035120010376, "eval_biology_mean_token_accuracy": 0.5334258432388306, "eval_biology_num_tokens": 36398285.0, "eval_biology_runtime": 57.5039, "eval_biology_samples_per_second": 8.695, "eval_biology_steps_per_second": 2.174, "step": 1100 }, { "epoch": 1.76, "eval_chemistry_entropy": 1.2144387559890748, "eval_chemistry_loss": 1.29018235206604, "eval_chemistry_mean_token_accuracy": 0.7329304647445679, "eval_chemistry_num_tokens": 36398285.0, "eval_chemistry_runtime": 71.5726, "eval_chemistry_samples_per_second": 6.986, "eval_chemistry_steps_per_second": 1.746, "step": 1100 }, { "epoch": 1.76, "eval_math_entropy": 0.9051191382408142, "eval_math_loss": 1.1472207307815552, "eval_math_mean_token_accuracy": 0.7566594734191895, "eval_math_num_tokens": 36398285.0, "eval_math_runtime": 74.0714, "eval_math_samples_per_second": 6.75, "eval_math_steps_per_second": 1.688, "step": 1100 }, { "epoch": 1.76, "eval_physics_entropy": 0.8845951933860778, "eval_physics_loss": 0.9326092600822449, "eval_physics_mean_token_accuracy": 0.7863762426376343, "eval_physics_num_tokens": 36398285.0, "eval_physics_runtime": 84.7278, "eval_physics_samples_per_second": 5.901, "eval_physics_steps_per_second": 1.475, "step": 1100 }, { "entropy": 0.9020034002140165, "epoch": 1.776, "grad_norm": 12.625, "learning_rate": 1.400740740740741e-05, "loss": 15.0724, "mean_token_accuracy": 0.7856271650642157, "num_tokens": 36730479.0, "step": 1110 }, { "entropy": 0.862918908149004, "epoch": 1.792, "grad_norm": 14.6875, "learning_rate": 1.3933333333333334e-05, "loss": 14.5005, "mean_token_accuracy": 0.7926642511039972, "num_tokens": 37057425.0, "step": 1120 }, { "entropy": 0.8617109794169664, "epoch": 1.808, "grad_norm": 11.8125, "learning_rate": 1.385925925925926e-05, "loss": 14.2625, "mean_token_accuracy": 0.7952517606317997, "num_tokens": 37395449.0, "step": 1130 }, { "entropy": 0.8684184370562434, "epoch": 1.8239999999999998, "grad_norm": 12.5, "learning_rate": 1.3785185185185186e-05, "loss": 14.5241, "mean_token_accuracy": 0.7931536667048931, "num_tokens": 37729535.0, "step": 1140 }, { "entropy": 0.8654272655025125, "epoch": 1.8399999999999999, "grad_norm": 12.4375, "learning_rate": 1.3711111111111112e-05, "loss": 14.4054, "mean_token_accuracy": 0.7938401244580746, "num_tokens": 38057839.0, "step": 1150 }, { "entropy": 0.86546247061342, "epoch": 1.8559999999999999, "grad_norm": 13.4375, "learning_rate": 1.3637037037037037e-05, "loss": 14.4546, "mean_token_accuracy": 0.7931617993861437, "num_tokens": 38398033.0, "step": 1160 }, { "entropy": 0.8690170094370842, "epoch": 1.8719999999999999, "grad_norm": 11.8125, "learning_rate": 1.3562962962962965e-05, "loss": 14.4475, "mean_token_accuracy": 0.7915640283375979, "num_tokens": 38729203.0, "step": 1170 }, { "entropy": 0.8866201037541032, "epoch": 1.888, "grad_norm": 13.0625, "learning_rate": 1.3488888888888888e-05, "loss": 14.8192, "mean_token_accuracy": 0.7894200544804335, "num_tokens": 39058791.0, "step": 1180 }, { "entropy": 0.8984544143080712, "epoch": 1.904, "grad_norm": 12.25, "learning_rate": 1.3414814814814817e-05, "loss": 15.0873, "mean_token_accuracy": 0.7861377280205488, "num_tokens": 39386170.0, "step": 1190 }, { "entropy": 0.9040035644546152, "epoch": 1.92, "grad_norm": 12.0625, "learning_rate": 1.3340740740740741e-05, "loss": 14.9941, "mean_token_accuracy": 0.7868796121329069, "num_tokens": 39710381.0, "step": 1200 }, { "epoch": 1.92, "eval_biology_entropy": 2.485040246009827, "eval_biology_loss": 2.8711678981781006, "eval_biology_mean_token_accuracy": 0.5342430763244629, "eval_biology_num_tokens": 39710381.0, "eval_biology_runtime": 57.5639, "eval_biology_samples_per_second": 8.686, "eval_biology_steps_per_second": 2.171, "step": 1200 }, { "epoch": 1.92, "eval_chemistry_entropy": 1.208739689350128, "eval_chemistry_loss": 1.285979151725769, "eval_chemistry_mean_token_accuracy": 0.7334310331344605, "eval_chemistry_num_tokens": 39710381.0, "eval_chemistry_runtime": 71.5821, "eval_chemistry_samples_per_second": 6.985, "eval_chemistry_steps_per_second": 1.746, "step": 1200 }, { "epoch": 1.92, "eval_math_entropy": 0.9018509998321533, "eval_math_loss": 1.1460354328155518, "eval_math_mean_token_accuracy": 0.7567954559326172, "eval_math_num_tokens": 39710381.0, "eval_math_runtime": 75.1131, "eval_math_samples_per_second": 6.657, "eval_math_steps_per_second": 1.664, "step": 1200 }, { "epoch": 1.92, "eval_physics_entropy": 0.8798905625343323, "eval_physics_loss": 0.929322361946106, "eval_physics_mean_token_accuracy": 0.787430817604065, "eval_physics_num_tokens": 39710381.0, "eval_physics_runtime": 84.784, "eval_physics_samples_per_second": 5.897, "eval_physics_steps_per_second": 1.474, "step": 1200 }, { "entropy": 0.8708227036520839, "epoch": 1.936, "grad_norm": 13.8125, "learning_rate": 1.3266666666666668e-05, "loss": 14.626, "mean_token_accuracy": 0.7907645083963871, "num_tokens": 40043354.0, "step": 1210 }, { "entropy": 0.8374214060604572, "epoch": 1.952, "grad_norm": 11.5, "learning_rate": 1.3192592592592594e-05, "loss": 13.9343, "mean_token_accuracy": 0.7994464814662934, "num_tokens": 40375790.0, "step": 1220 }, { "entropy": 0.8776708476245403, "epoch": 1.968, "grad_norm": 12.25, "learning_rate": 1.311851851851852e-05, "loss": 14.5653, "mean_token_accuracy": 0.7916903175413609, "num_tokens": 40705451.0, "step": 1230 }, { "entropy": 0.8854383405297994, "epoch": 1.984, "grad_norm": 13.0, "learning_rate": 1.3044444444444446e-05, "loss": 14.7486, "mean_token_accuracy": 0.7898764718323946, "num_tokens": 41040313.0, "step": 1240 }, { "entropy": 0.879773647710681, "epoch": 2.0, "grad_norm": 12.6875, "learning_rate": 1.297037037037037e-05, "loss": 14.6916, "mean_token_accuracy": 0.7911134500056505, "num_tokens": 41373608.0, "step": 1250 }, { "entropy": 0.8601289037615061, "epoch": 2.016, "grad_norm": 10.5625, "learning_rate": 1.2896296296296299e-05, "loss": 14.1908, "mean_token_accuracy": 0.7955393616110087, "num_tokens": 41708226.0, "step": 1260 }, { "entropy": 0.8542119370773434, "epoch": 2.032, "grad_norm": 11.4375, "learning_rate": 1.2822222222222222e-05, "loss": 14.2867, "mean_token_accuracy": 0.7959718242287636, "num_tokens": 42045433.0, "step": 1270 }, { "entropy": 0.8921715309843421, "epoch": 2.048, "grad_norm": 13.1875, "learning_rate": 1.274814814814815e-05, "loss": 14.8772, "mean_token_accuracy": 0.7874497707933188, "num_tokens": 42372694.0, "step": 1280 }, { "entropy": 0.8619997894391418, "epoch": 2.064, "grad_norm": 12.375, "learning_rate": 1.2674074074074075e-05, "loss": 14.2556, "mean_token_accuracy": 0.7947564154863358, "num_tokens": 42710240.0, "step": 1290 }, { "entropy": 0.8751549379900098, "epoch": 2.08, "grad_norm": 11.5625, "learning_rate": 1.2600000000000001e-05, "loss": 14.6573, "mean_token_accuracy": 0.7916811484843492, "num_tokens": 43037844.0, "step": 1300 }, { "epoch": 2.08, "eval_biology_entropy": 2.5093424930572508, "eval_biology_loss": 2.864481210708618, "eval_biology_mean_token_accuracy": 0.533148678779602, "eval_biology_num_tokens": 43037844.0, "eval_biology_runtime": 57.534, "eval_biology_samples_per_second": 8.691, "eval_biology_steps_per_second": 2.173, "step": 1300 }, { "epoch": 2.08, "eval_chemistry_entropy": 1.2106250019073486, "eval_chemistry_loss": 1.2846547365188599, "eval_chemistry_mean_token_accuracy": 0.7332292509078979, "eval_chemistry_num_tokens": 43037844.0, "eval_chemistry_runtime": 71.3949, "eval_chemistry_samples_per_second": 7.003, "eval_chemistry_steps_per_second": 1.751, "step": 1300 }, { "epoch": 2.08, "eval_math_entropy": 0.9025118923187256, "eval_math_loss": 1.1455634832382202, "eval_math_mean_token_accuracy": 0.7569516344070435, "eval_math_num_tokens": 43037844.0, "eval_math_runtime": 75.2696, "eval_math_samples_per_second": 6.643, "eval_math_steps_per_second": 1.661, "step": 1300 }, { "epoch": 2.08, "eval_physics_entropy": 0.8776944227218628, "eval_physics_loss": 0.9265705943107605, "eval_physics_mean_token_accuracy": 0.7875892186164856, "eval_physics_num_tokens": 43037844.0, "eval_physics_runtime": 85.0265, "eval_physics_samples_per_second": 5.881, "eval_physics_steps_per_second": 1.47, "step": 1300 }, { "entropy": 0.8755308095365762, "epoch": 2.096, "grad_norm": 12.9375, "learning_rate": 1.2525925925925928e-05, "loss": 14.5668, "mean_token_accuracy": 0.7918181721121073, "num_tokens": 43368675.0, "step": 1310 }, { "entropy": 0.8589686010032892, "epoch": 2.112, "grad_norm": 12.25, "learning_rate": 1.2451851851851853e-05, "loss": 14.3302, "mean_token_accuracy": 0.7948925741016865, "num_tokens": 43699544.0, "step": 1320 }, { "entropy": 0.8743958847597242, "epoch": 2.128, "grad_norm": 12.3125, "learning_rate": 1.237777777777778e-05, "loss": 14.5493, "mean_token_accuracy": 0.7907001547515392, "num_tokens": 44012632.0, "step": 1330 }, { "entropy": 0.8594208642840385, "epoch": 2.144, "grad_norm": 11.0625, "learning_rate": 1.2303703703703704e-05, "loss": 14.3324, "mean_token_accuracy": 0.7929412983357906, "num_tokens": 44349732.0, "step": 1340 }, { "entropy": 0.8937922870740295, "epoch": 2.16, "grad_norm": 12.625, "learning_rate": 1.222962962962963e-05, "loss": 14.8834, "mean_token_accuracy": 0.7873219456523657, "num_tokens": 44680410.0, "step": 1350 }, { "entropy": 0.8523136384785175, "epoch": 2.176, "grad_norm": 11.25, "learning_rate": 1.2155555555555555e-05, "loss": 14.2544, "mean_token_accuracy": 0.7959212839603425, "num_tokens": 45017922.0, "step": 1360 }, { "entropy": 0.8626351818442345, "epoch": 2.192, "grad_norm": 12.1875, "learning_rate": 1.2081481481481484e-05, "loss": 14.276, "mean_token_accuracy": 0.7951519623398781, "num_tokens": 45348179.0, "step": 1370 }, { "entropy": 0.8525467690080404, "epoch": 2.208, "grad_norm": 12.8125, "learning_rate": 1.2007407407407408e-05, "loss": 14.2013, "mean_token_accuracy": 0.7966051142662763, "num_tokens": 45678397.0, "step": 1380 }, { "entropy": 0.8527483340352774, "epoch": 2.224, "grad_norm": 11.5625, "learning_rate": 1.1933333333333335e-05, "loss": 14.2627, "mean_token_accuracy": 0.7949573740363121, "num_tokens": 46010052.0, "step": 1390 }, { "entropy": 0.855360858142376, "epoch": 2.24, "grad_norm": 11.4375, "learning_rate": 1.185925925925926e-05, "loss": 14.2457, "mean_token_accuracy": 0.7951466862112284, "num_tokens": 46347116.0, "step": 1400 }, { "epoch": 2.24, "eval_biology_entropy": 2.5089185886383056, "eval_biology_loss": 2.863374948501587, "eval_biology_mean_token_accuracy": 0.5349591159820557, "eval_biology_num_tokens": 46347116.0, "eval_biology_runtime": 57.3344, "eval_biology_samples_per_second": 8.721, "eval_biology_steps_per_second": 2.18, "step": 1400 }, { "epoch": 2.24, "eval_chemistry_entropy": 1.2185512976646424, "eval_chemistry_loss": 1.2818259000778198, "eval_chemistry_mean_token_accuracy": 0.7342271280288696, "eval_chemistry_num_tokens": 46347116.0, "eval_chemistry_runtime": 71.4639, "eval_chemistry_samples_per_second": 6.997, "eval_chemistry_steps_per_second": 1.749, "step": 1400 }, { "epoch": 2.24, "eval_math_entropy": 0.9067861828804016, "eval_math_loss": 1.1431993246078491, "eval_math_mean_token_accuracy": 0.7572210087776184, "eval_math_num_tokens": 46347116.0, "eval_math_runtime": 75.3577, "eval_math_samples_per_second": 6.635, "eval_math_steps_per_second": 1.659, "step": 1400 }, { "epoch": 2.24, "eval_physics_entropy": 0.8840774774551392, "eval_physics_loss": 0.9245942234992981, "eval_physics_mean_token_accuracy": 0.7878187990188599, "eval_physics_num_tokens": 46347116.0, "eval_physics_runtime": 84.9011, "eval_physics_samples_per_second": 5.889, "eval_physics_steps_per_second": 1.472, "step": 1400 }, { "entropy": 0.9004593985155225, "epoch": 2.2560000000000002, "grad_norm": 13.4375, "learning_rate": 1.1785185185185186e-05, "loss": 14.9385, "mean_token_accuracy": 0.7881197012960911, "num_tokens": 46673074.0, "step": 1410 }, { "entropy": 0.8454325065016747, "epoch": 2.2720000000000002, "grad_norm": 12.125, "learning_rate": 1.1711111111111113e-05, "loss": 14.0933, "mean_token_accuracy": 0.7979227758944034, "num_tokens": 47009386.0, "step": 1420 }, { "entropy": 0.8827449329197407, "epoch": 2.288, "grad_norm": 12.25, "learning_rate": 1.1637037037037037e-05, "loss": 14.712, "mean_token_accuracy": 0.7902342595160008, "num_tokens": 47344231.0, "step": 1430 }, { "entropy": 0.8806605780497193, "epoch": 2.304, "grad_norm": 13.4375, "learning_rate": 1.1562962962962964e-05, "loss": 14.5674, "mean_token_accuracy": 0.7924861270934344, "num_tokens": 47658392.0, "step": 1440 }, { "entropy": 0.859540069103241, "epoch": 2.32, "grad_norm": 12.1875, "learning_rate": 1.1488888888888889e-05, "loss": 14.3445, "mean_token_accuracy": 0.7936220798641443, "num_tokens": 47995015.0, "step": 1450 }, { "entropy": 0.8417845372110605, "epoch": 2.336, "grad_norm": 11.8125, "learning_rate": 1.1414814814814817e-05, "loss": 14.0115, "mean_token_accuracy": 0.7978896964341402, "num_tokens": 48336708.0, "step": 1460 }, { "entropy": 0.8409199349582195, "epoch": 2.352, "grad_norm": 11.5, "learning_rate": 1.1340740740740742e-05, "loss": 13.9932, "mean_token_accuracy": 0.7971417736262083, "num_tokens": 48665311.0, "step": 1470 }, { "entropy": 0.8575670935213566, "epoch": 2.368, "grad_norm": 11.75, "learning_rate": 1.1266666666666668e-05, "loss": 14.3902, "mean_token_accuracy": 0.7932044431567192, "num_tokens": 49005334.0, "step": 1480 }, { "entropy": 0.8846392493695021, "epoch": 2.384, "grad_norm": 13.5, "learning_rate": 1.1192592592592593e-05, "loss": 14.5912, "mean_token_accuracy": 0.7901103302836419, "num_tokens": 49331936.0, "step": 1490 }, { "entropy": 0.8669425016269088, "epoch": 2.4, "grad_norm": 12.4375, "learning_rate": 1.111851851851852e-05, "loss": 14.4765, "mean_token_accuracy": 0.7921019058674574, "num_tokens": 49671048.0, "step": 1500 }, { "epoch": 2.4, "eval_biology_entropy": 2.456291440010071, "eval_biology_loss": 2.8605849742889404, "eval_biology_mean_token_accuracy": 0.5338850898742675, "eval_biology_num_tokens": 49671048.0, "eval_biology_runtime": 57.1011, "eval_biology_samples_per_second": 8.756, "eval_biology_steps_per_second": 2.189, "step": 1500 }, { "epoch": 2.4, "eval_chemistry_entropy": 1.1939663624763488, "eval_chemistry_loss": 1.280471682548523, "eval_chemistry_mean_token_accuracy": 0.7337013077735901, "eval_chemistry_num_tokens": 49671048.0, "eval_chemistry_runtime": 71.0078, "eval_chemistry_samples_per_second": 7.041, "eval_chemistry_steps_per_second": 1.76, "step": 1500 }, { "epoch": 2.4, "eval_math_entropy": 0.8943547616004944, "eval_math_loss": 1.1426331996917725, "eval_math_mean_token_accuracy": 0.7575869603157044, "eval_math_num_tokens": 49671048.0, "eval_math_runtime": 74.2145, "eval_math_samples_per_second": 6.737, "eval_math_steps_per_second": 1.684, "step": 1500 }, { "epoch": 2.4, "eval_physics_entropy": 0.8692562022209167, "eval_physics_loss": 0.9228239059448242, "eval_physics_mean_token_accuracy": 0.7882768630981445, "eval_physics_num_tokens": 49671048.0, "eval_physics_runtime": 85.556, "eval_physics_samples_per_second": 5.844, "eval_physics_steps_per_second": 1.461, "step": 1500 }, { "entropy": 0.8500566266477108, "epoch": 2.416, "grad_norm": 11.4375, "learning_rate": 1.1044444444444444e-05, "loss": 14.158, "mean_token_accuracy": 0.7961105518043041, "num_tokens": 50018182.0, "step": 1510 }, { "entropy": 0.8643727529793978, "epoch": 2.432, "grad_norm": 12.75, "learning_rate": 1.0970370370370371e-05, "loss": 14.4046, "mean_token_accuracy": 0.7947129063308239, "num_tokens": 50341187.0, "step": 1520 }, { "entropy": 0.8951457351446152, "epoch": 2.448, "grad_norm": 11.25, "learning_rate": 1.0896296296296298e-05, "loss": 14.8649, "mean_token_accuracy": 0.7867488101124763, "num_tokens": 50667019.0, "step": 1530 }, { "entropy": 0.8816273506730795, "epoch": 2.464, "grad_norm": 12.8125, "learning_rate": 1.0822222222222222e-05, "loss": 14.6, "mean_token_accuracy": 0.7916153628379107, "num_tokens": 50991113.0, "step": 1540 }, { "entropy": 0.8664202645421029, "epoch": 2.48, "grad_norm": 12.3125, "learning_rate": 1.074814814814815e-05, "loss": 14.5346, "mean_token_accuracy": 0.7921106025576592, "num_tokens": 51319624.0, "step": 1550 }, { "entropy": 0.8818996708840132, "epoch": 2.496, "grad_norm": 11.625, "learning_rate": 1.0674074074074074e-05, "loss": 14.6185, "mean_token_accuracy": 0.7901674829423427, "num_tokens": 51654633.0, "step": 1560 }, { "entropy": 0.8544634876772761, "epoch": 2.512, "grad_norm": 13.25, "learning_rate": 1.0600000000000002e-05, "loss": 14.2895, "mean_token_accuracy": 0.7949651554226875, "num_tokens": 51980051.0, "step": 1570 }, { "entropy": 0.8599010031670332, "epoch": 2.528, "grad_norm": 11.5625, "learning_rate": 1.0525925925925927e-05, "loss": 14.2789, "mean_token_accuracy": 0.7945577628910542, "num_tokens": 52316109.0, "step": 1580 }, { "entropy": 0.8746189616620541, "epoch": 2.544, "grad_norm": 13.6875, "learning_rate": 1.0451851851851853e-05, "loss": 14.6318, "mean_token_accuracy": 0.7903719995170831, "num_tokens": 52634528.0, "step": 1590 }, { "entropy": 0.8605435255914926, "epoch": 2.56, "grad_norm": 12.0, "learning_rate": 1.0377777777777778e-05, "loss": 14.3017, "mean_token_accuracy": 0.7939822390675545, "num_tokens": 52964035.0, "step": 1600 }, { "epoch": 2.56, "eval_biology_entropy": 2.439929902076721, "eval_biology_loss": 2.855186939239502, "eval_biology_mean_token_accuracy": 0.5345207722187042, "eval_biology_num_tokens": 52964035.0, "eval_biology_runtime": 57.4094, "eval_biology_samples_per_second": 8.709, "eval_biology_steps_per_second": 2.177, "step": 1600 }, { "epoch": 2.56, "eval_chemistry_entropy": 1.1931069812774657, "eval_chemistry_loss": 1.278794527053833, "eval_chemistry_mean_token_accuracy": 0.7340463919639587, "eval_chemistry_num_tokens": 52964035.0, "eval_chemistry_runtime": 71.3365, "eval_chemistry_samples_per_second": 7.009, "eval_chemistry_steps_per_second": 1.752, "step": 1600 }, { "epoch": 2.56, "eval_math_entropy": 0.8954266090393066, "eval_math_loss": 1.1417946815490723, "eval_math_mean_token_accuracy": 0.7577371735572815, "eval_math_num_tokens": 52964035.0, "eval_math_runtime": 74.3663, "eval_math_samples_per_second": 6.723, "eval_math_steps_per_second": 1.681, "step": 1600 }, { "epoch": 2.56, "eval_physics_entropy": 0.8706050214767456, "eval_physics_loss": 0.9212888479232788, "eval_physics_mean_token_accuracy": 0.7886158428192138, "eval_physics_num_tokens": 52964035.0, "eval_physics_runtime": 85.0105, "eval_physics_samples_per_second": 5.882, "eval_physics_steps_per_second": 1.47, "step": 1600 }, { "entropy": 0.8518974561244249, "epoch": 2.576, "grad_norm": 12.8125, "learning_rate": 1.0303703703703705e-05, "loss": 14.1697, "mean_token_accuracy": 0.7962028808891773, "num_tokens": 53293345.0, "step": 1610 }, { "entropy": 0.8667042430490255, "epoch": 2.592, "grad_norm": 10.625, "learning_rate": 1.0229629629629631e-05, "loss": 14.4307, "mean_token_accuracy": 0.7936250735074282, "num_tokens": 53620896.0, "step": 1620 }, { "entropy": 0.8402661351487041, "epoch": 2.608, "grad_norm": 11.25, "learning_rate": 1.0155555555555556e-05, "loss": 14.021, "mean_token_accuracy": 0.7978927873075008, "num_tokens": 53950989.0, "step": 1630 }, { "entropy": 0.8603313663974405, "epoch": 2.624, "grad_norm": 11.125, "learning_rate": 1.0081481481481484e-05, "loss": 14.3819, "mean_token_accuracy": 0.7938243903219699, "num_tokens": 54282732.0, "step": 1640 }, { "entropy": 0.8600944321602583, "epoch": 2.64, "grad_norm": 12.8125, "learning_rate": 1.0007407407407407e-05, "loss": 14.2866, "mean_token_accuracy": 0.7947955295443535, "num_tokens": 54609748.0, "step": 1650 }, { "entropy": 0.8892827957868576, "epoch": 2.656, "grad_norm": 12.5625, "learning_rate": 9.933333333333334e-06, "loss": 14.8315, "mean_token_accuracy": 0.7884532894939185, "num_tokens": 54953583.0, "step": 1660 }, { "entropy": 0.867825073376298, "epoch": 2.672, "grad_norm": 12.0, "learning_rate": 9.85925925925926e-06, "loss": 14.4804, "mean_token_accuracy": 0.791333881765604, "num_tokens": 55281945.0, "step": 1670 }, { "entropy": 0.8512159805744887, "epoch": 2.6879999999999997, "grad_norm": 12.0625, "learning_rate": 9.785185185185187e-06, "loss": 14.0027, "mean_token_accuracy": 0.798706853762269, "num_tokens": 55612391.0, "step": 1680 }, { "entropy": 0.8470454571768642, "epoch": 2.7039999999999997, "grad_norm": 11.25, "learning_rate": 9.711111111111111e-06, "loss": 14.2474, "mean_token_accuracy": 0.7953305941075086, "num_tokens": 55949791.0, "step": 1690 }, { "entropy": 0.8569748856127262, "epoch": 2.7199999999999998, "grad_norm": 14.625, "learning_rate": 9.637037037037038e-06, "loss": 14.2518, "mean_token_accuracy": 0.7947139970958232, "num_tokens": 56282204.0, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_biology_entropy": 2.4623048429489134, "eval_biology_loss": 2.855733633041382, "eval_biology_mean_token_accuracy": 0.5338225281238556, "eval_biology_num_tokens": 56282204.0, "eval_biology_runtime": 57.2989, "eval_biology_samples_per_second": 8.726, "eval_biology_steps_per_second": 2.182, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_chemistry_entropy": 1.1981362085342406, "eval_chemistry_loss": 1.2777812480926514, "eval_chemistry_mean_token_accuracy": 0.7337445435523987, "eval_chemistry_num_tokens": 56282204.0, "eval_chemistry_runtime": 72.002, "eval_chemistry_samples_per_second": 6.944, "eval_chemistry_steps_per_second": 1.736, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_math_entropy": 0.8990244059562683, "eval_math_loss": 1.1414705514907837, "eval_math_mean_token_accuracy": 0.7577006416320801, "eval_math_num_tokens": 56282204.0, "eval_math_runtime": 74.1316, "eval_math_samples_per_second": 6.745, "eval_math_steps_per_second": 1.686, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_physics_entropy": 0.8712537698745727, "eval_physics_loss": 0.9199900031089783, "eval_physics_mean_token_accuracy": 0.7888414950370789, "eval_physics_num_tokens": 56282204.0, "eval_physics_runtime": 85.5277, "eval_physics_samples_per_second": 5.846, "eval_physics_steps_per_second": 1.462, "step": 1700 }, { "entropy": 0.8889217928051949, "epoch": 2.7359999999999998, "grad_norm": 12.8125, "learning_rate": 9.562962962962965e-06, "loss": 14.8794, "mean_token_accuracy": 0.787835606187582, "num_tokens": 56605478.0, "step": 1710 }, { "entropy": 0.867193017527461, "epoch": 2.752, "grad_norm": 11.6875, "learning_rate": 9.48888888888889e-06, "loss": 14.5299, "mean_token_accuracy": 0.7921088628470898, "num_tokens": 56935874.0, "step": 1720 }, { "entropy": 0.8495670940726996, "epoch": 2.768, "grad_norm": 12.0, "learning_rate": 9.414814814814816e-06, "loss": 14.0654, "mean_token_accuracy": 0.7975914411246776, "num_tokens": 57270552.0, "step": 1730 }, { "entropy": 0.8686243133619428, "epoch": 2.784, "grad_norm": 12.5, "learning_rate": 9.34074074074074e-06, "loss": 14.4341, "mean_token_accuracy": 0.7923915140330792, "num_tokens": 57602968.0, "step": 1740 }, { "entropy": 0.9133183425292373, "epoch": 2.8, "grad_norm": 13.375, "learning_rate": 9.266666666666667e-06, "loss": 15.2497, "mean_token_accuracy": 0.7833729684352875, "num_tokens": 57925886.0, "step": 1750 }, { "entropy": 0.8448837269097567, "epoch": 2.816, "grad_norm": 11.875, "learning_rate": 9.192592592592594e-06, "loss": 13.9445, "mean_token_accuracy": 0.7979842260479927, "num_tokens": 58262960.0, "step": 1760 }, { "entropy": 0.871445388160646, "epoch": 2.832, "grad_norm": 12.3125, "learning_rate": 9.118518518518518e-06, "loss": 14.5598, "mean_token_accuracy": 0.7907900519669055, "num_tokens": 58586324.0, "step": 1770 }, { "entropy": 0.8576038857921958, "epoch": 2.848, "grad_norm": 13.0, "learning_rate": 9.044444444444445e-06, "loss": 14.3799, "mean_token_accuracy": 0.7930273406207562, "num_tokens": 58912949.0, "step": 1780 }, { "entropy": 0.8503576427698135, "epoch": 2.864, "grad_norm": 11.0, "learning_rate": 8.970370370370372e-06, "loss": 14.1151, "mean_token_accuracy": 0.7967234510928393, "num_tokens": 59252494.0, "step": 1790 }, { "entropy": 0.865095803141594, "epoch": 2.88, "grad_norm": 14.125, "learning_rate": 8.896296296296298e-06, "loss": 14.4, "mean_token_accuracy": 0.7936870157718658, "num_tokens": 59574951.0, "step": 1800 }, { "epoch": 2.88, "eval_biology_entropy": 2.4605735635757444, "eval_biology_loss": 2.8438503742218018, "eval_biology_mean_token_accuracy": 0.5349292945861817, "eval_biology_num_tokens": 59574951.0, "eval_biology_runtime": 57.1516, "eval_biology_samples_per_second": 8.749, "eval_biology_steps_per_second": 2.187, "step": 1800 }, { "epoch": 2.88, "eval_chemistry_entropy": 1.1904199600219727, "eval_chemistry_loss": 1.2762060165405273, "eval_chemistry_mean_token_accuracy": 0.7342206130027771, "eval_chemistry_num_tokens": 59574951.0, "eval_chemistry_runtime": 71.4215, "eval_chemistry_samples_per_second": 7.001, "eval_chemistry_steps_per_second": 1.75, "step": 1800 }, { "epoch": 2.88, "eval_math_entropy": 0.8948101491928101, "eval_math_loss": 1.1407443284988403, "eval_math_mean_token_accuracy": 0.7578761463165283, "eval_math_num_tokens": 59574951.0, "eval_math_runtime": 74.9351, "eval_math_samples_per_second": 6.672, "eval_math_steps_per_second": 1.668, "step": 1800 }, { "epoch": 2.88, "eval_physics_entropy": 0.8652872314453125, "eval_physics_loss": 0.9192015528678894, "eval_physics_mean_token_accuracy": 0.7890026559829711, "eval_physics_num_tokens": 59574951.0, "eval_physics_runtime": 84.6211, "eval_physics_samples_per_second": 5.909, "eval_physics_steps_per_second": 1.477, "step": 1800 }, { "entropy": 0.8504932520911097, "epoch": 2.896, "grad_norm": 12.75, "learning_rate": 8.822222222222223e-06, "loss": 14.2948, "mean_token_accuracy": 0.7946070168167353, "num_tokens": 59903274.0, "step": 1810 }, { "entropy": 0.8513661539182067, "epoch": 2.912, "grad_norm": 12.5625, "learning_rate": 8.74814814814815e-06, "loss": 14.1948, "mean_token_accuracy": 0.7956477370113134, "num_tokens": 60234203.0, "step": 1820 }, { "entropy": 0.8567061256617308, "epoch": 2.928, "grad_norm": 12.0625, "learning_rate": 8.674074074074074e-06, "loss": 14.2065, "mean_token_accuracy": 0.7965482659637928, "num_tokens": 60559123.0, "step": 1830 }, { "entropy": 0.8522207867354155, "epoch": 2.944, "grad_norm": 13.8125, "learning_rate": 8.6e-06, "loss": 14.1203, "mean_token_accuracy": 0.7966358475387096, "num_tokens": 60893423.0, "step": 1840 }, { "entropy": 0.8581688396632672, "epoch": 2.96, "grad_norm": 11.875, "learning_rate": 8.525925925925927e-06, "loss": 14.4079, "mean_token_accuracy": 0.7932413015514612, "num_tokens": 61228692.0, "step": 1850 }, { "entropy": 0.8690817683935166, "epoch": 2.976, "grad_norm": 11.9375, "learning_rate": 8.451851851851852e-06, "loss": 14.3962, "mean_token_accuracy": 0.793168543279171, "num_tokens": 61560852.0, "step": 1860 }, { "entropy": 0.8346356390044093, "epoch": 2.992, "grad_norm": 11.9375, "learning_rate": 8.377777777777779e-06, "loss": 13.939, "mean_token_accuracy": 0.7977076359093189, "num_tokens": 61898951.0, "step": 1870 }, { "entropy": 0.8470860140398144, "epoch": 3.008, "grad_norm": 11.4375, "learning_rate": 8.303703703703705e-06, "loss": 14.0544, "mean_token_accuracy": 0.7982985734939575, "num_tokens": 62223875.0, "step": 1880 }, { "entropy": 0.8579535936936736, "epoch": 3.024, "grad_norm": 12.0625, "learning_rate": 8.229629629629632e-06, "loss": 14.2346, "mean_token_accuracy": 0.7953595809638501, "num_tokens": 62553166.0, "step": 1890 }, { "entropy": 0.8405753003433347, "epoch": 3.04, "grad_norm": 12.0625, "learning_rate": 8.155555555555556e-06, "loss": 13.9614, "mean_token_accuracy": 0.7975836973637342, "num_tokens": 62893113.0, "step": 1900 }, { "epoch": 3.04, "eval_biology_entropy": 2.466928297996521, "eval_biology_loss": 2.8451387882232666, "eval_biology_mean_token_accuracy": 0.5350441763401032, "eval_biology_num_tokens": 62893113.0, "eval_biology_runtime": 57.2798, "eval_biology_samples_per_second": 8.729, "eval_biology_steps_per_second": 2.182, "step": 1900 }, { "epoch": 3.04, "eval_chemistry_entropy": 1.1924911136627196, "eval_chemistry_loss": 1.2755722999572754, "eval_chemistry_mean_token_accuracy": 0.734372947216034, "eval_chemistry_num_tokens": 62893113.0, "eval_chemistry_runtime": 71.12, "eval_chemistry_samples_per_second": 7.03, "eval_chemistry_steps_per_second": 1.758, "step": 1900 }, { "epoch": 3.04, "eval_math_entropy": 0.8945438508987427, "eval_math_loss": 1.1409516334533691, "eval_math_mean_token_accuracy": 0.7579452800750732, "eval_math_num_tokens": 62893113.0, "eval_math_runtime": 74.6477, "eval_math_samples_per_second": 6.698, "eval_math_steps_per_second": 1.675, "step": 1900 }, { "epoch": 3.04, "eval_physics_entropy": 0.8654837455749512, "eval_physics_loss": 0.9181889295578003, "eval_physics_mean_token_accuracy": 0.7893532915115357, "eval_physics_num_tokens": 62893113.0, "eval_physics_runtime": 85.3356, "eval_physics_samples_per_second": 5.859, "eval_physics_steps_per_second": 1.465, "step": 1900 }, { "entropy": 0.8652172423899174, "epoch": 3.056, "grad_norm": 12.375, "learning_rate": 8.081481481481483e-06, "loss": 14.4707, "mean_token_accuracy": 0.7926124636083841, "num_tokens": 63218088.0, "step": 1910 }, { "entropy": 0.8625176247209311, "epoch": 3.072, "grad_norm": 13.5, "learning_rate": 8.007407407407408e-06, "loss": 14.2819, "mean_token_accuracy": 0.7927148371934891, "num_tokens": 63551201.0, "step": 1920 }, { "entropy": 0.8709696922451258, "epoch": 3.088, "grad_norm": 13.25, "learning_rate": 7.933333333333334e-06, "loss": 14.535, "mean_token_accuracy": 0.7909463763236999, "num_tokens": 63876362.0, "step": 1930 }, { "entropy": 0.8365857848897577, "epoch": 3.104, "grad_norm": 12.6875, "learning_rate": 7.859259259259259e-06, "loss": 13.917, "mean_token_accuracy": 0.7985875517129898, "num_tokens": 64208171.0, "step": 1940 }, { "entropy": 0.8436792023479939, "epoch": 3.12, "grad_norm": 12.5, "learning_rate": 7.785185185185185e-06, "loss": 13.9247, "mean_token_accuracy": 0.7994871154427529, "num_tokens": 64544493.0, "step": 1950 }, { "entropy": 0.8724946033209562, "epoch": 3.136, "grad_norm": 12.875, "learning_rate": 7.711111111111112e-06, "loss": 14.5896, "mean_token_accuracy": 0.790136469900608, "num_tokens": 64872488.0, "step": 1960 }, { "entropy": 0.8340439153835177, "epoch": 3.152, "grad_norm": 12.6875, "learning_rate": 7.637037037037037e-06, "loss": 13.9309, "mean_token_accuracy": 0.7989891562610865, "num_tokens": 65201124.0, "step": 1970 }, { "entropy": 0.8577351365238428, "epoch": 3.168, "grad_norm": 11.125, "learning_rate": 7.562962962962963e-06, "loss": 14.332, "mean_token_accuracy": 0.7956987973302603, "num_tokens": 65525868.0, "step": 1980 }, { "entropy": 0.8472510496154427, "epoch": 3.184, "grad_norm": 12.75, "learning_rate": 7.48888888888889e-06, "loss": 14.0577, "mean_token_accuracy": 0.7991137109696865, "num_tokens": 65861603.0, "step": 1990 }, { "entropy": 0.8585745912045241, "epoch": 3.2, "grad_norm": 11.3125, "learning_rate": 7.4148148148148155e-06, "loss": 14.2601, "mean_token_accuracy": 0.7946750316768885, "num_tokens": 66193675.0, "step": 2000 }, { "epoch": 3.2, "eval_biology_entropy": 2.4529026355743406, "eval_biology_loss": 2.8515477180480957, "eval_biology_mean_token_accuracy": 0.5361672310829163, "eval_biology_num_tokens": 66193675.0, "eval_biology_runtime": 56.9907, "eval_biology_samples_per_second": 8.773, "eval_biology_steps_per_second": 2.193, "step": 2000 }, { "epoch": 3.2, "eval_chemistry_entropy": 1.1896215171813964, "eval_chemistry_loss": 1.2748806476593018, "eval_chemistry_mean_token_accuracy": 0.7352399125099182, "eval_chemistry_num_tokens": 66193675.0, "eval_chemistry_runtime": 71.1598, "eval_chemistry_samples_per_second": 7.026, "eval_chemistry_steps_per_second": 1.757, "step": 2000 }, { "epoch": 3.2, "eval_math_entropy": 0.8935150079727173, "eval_math_loss": 1.140594482421875, "eval_math_mean_token_accuracy": 0.7576922287940979, "eval_math_num_tokens": 66193675.0, "eval_math_runtime": 74.3826, "eval_math_samples_per_second": 6.722, "eval_math_steps_per_second": 1.681, "step": 2000 }, { "epoch": 3.2, "eval_physics_entropy": 0.8648786902427673, "eval_physics_loss": 0.9176769852638245, "eval_physics_mean_token_accuracy": 0.7893374056816101, "eval_physics_num_tokens": 66193675.0, "eval_physics_runtime": 84.6448, "eval_physics_samples_per_second": 5.907, "eval_physics_steps_per_second": 1.477, "step": 2000 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.614072648666411e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }