{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 100, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.0476444251835346, "epoch": 0.016, "grad_norm": 158.0, "learning_rate": 6.000000000000001e-07, "loss": 34.7404, "mean_token_accuracy": 0.6322720043361187, "num_tokens": 280941.0, "step": 10 }, { "entropy": 1.103129617869854, "epoch": 0.032, "grad_norm": 140.0, "learning_rate": 1.2666666666666669e-06, "loss": 35.2833, "mean_token_accuracy": 0.627065640874207, "num_tokens": 558056.0, "step": 20 }, { "entropy": 1.1186710916459561, "epoch": 0.048, "grad_norm": 125.5, "learning_rate": 1.9333333333333336e-06, "loss": 34.279, "mean_token_accuracy": 0.6317529268562794, "num_tokens": 836753.0, "step": 30 }, { "entropy": 1.2232312321662904, "epoch": 0.064, "grad_norm": 99.0, "learning_rate": 2.6e-06, "loss": 33.2898, "mean_token_accuracy": 0.6292566264048218, "num_tokens": 1126446.0, "step": 40 }, { "entropy": 1.3642345190048217, "epoch": 0.08, "grad_norm": 78.0, "learning_rate": 3.266666666666667e-06, "loss": 31.1081, "mean_token_accuracy": 0.643590996041894, "num_tokens": 1413596.0, "step": 50 }, { "entropy": 1.4666775342077016, "epoch": 0.096, "grad_norm": 50.0, "learning_rate": 3.9333333333333335e-06, "loss": 28.0855, "mean_token_accuracy": 0.6677330181002616, "num_tokens": 1701193.0, "step": 60 }, { "entropy": 1.479279049113393, "epoch": 0.112, "grad_norm": 44.5, "learning_rate": 4.600000000000001e-06, "loss": 25.9511, "mean_token_accuracy": 0.6763931937515736, "num_tokens": 1979232.0, "step": 70 }, { "entropy": 1.4356445044279098, "epoch": 0.128, "grad_norm": 31.5, "learning_rate": 5.2666666666666665e-06, "loss": 24.2383, "mean_token_accuracy": 0.6900208964943886, "num_tokens": 2274177.0, "step": 80 }, { "entropy": 1.3773791242390871, "epoch": 0.144, "grad_norm": 28.25, "learning_rate": 5.933333333333335e-06, "loss": 22.3128, "mean_token_accuracy": 0.7042559664696455, "num_tokens": 2548445.0, "step": 90 }, { "entropy": 1.3055105797946454, "epoch": 0.16, "grad_norm": 29.25, "learning_rate": 6.600000000000001e-06, "loss": 20.6858, "mean_token_accuracy": 0.718151243403554, "num_tokens": 2824418.0, "step": 100 }, { "epoch": 0.16, "eval_biology_entropy": 3.5783245754241944, "eval_biology_loss": 4.867801189422607, "eval_biology_mean_token_accuracy": 0.3678655695915222, "eval_biology_num_tokens": 2824418.0, "eval_biology_runtime": 50.0015, "eval_biology_samples_per_second": 10.0, "eval_biology_steps_per_second": 2.5, "step": 100 }, { "epoch": 0.16, "eval_chemistry_entropy": 2.541287685394287, "eval_chemistry_loss": 2.7937169075012207, "eval_chemistry_mean_token_accuracy": 0.539857518196106, "eval_chemistry_num_tokens": 2824418.0, "eval_chemistry_runtime": 61.8974, "eval_chemistry_samples_per_second": 8.078, "eval_chemistry_steps_per_second": 2.019, "step": 100 }, { "epoch": 0.16, "eval_math_entropy": 1.325653440952301, "eval_math_loss": 1.2963088750839233, "eval_math_mean_token_accuracy": 0.7152829113006591, "eval_math_num_tokens": 2824418.0, "eval_math_runtime": 63.4831, "eval_math_samples_per_second": 7.876, "eval_math_steps_per_second": 1.969, "step": 100 }, { "epoch": 0.16, "eval_physics_entropy": 2.0472509851455687, "eval_physics_loss": 2.108372688293457, "eval_physics_mean_token_accuracy": 0.6150235052108765, "eval_physics_num_tokens": 2824418.0, "eval_physics_runtime": 72.2792, "eval_physics_samples_per_second": 6.918, "eval_physics_steps_per_second": 1.729, "step": 100 }, { "entropy": 1.2799372777342797, "epoch": 0.176, "grad_norm": 23.25, "learning_rate": 7.266666666666668e-06, "loss": 20.1214, "mean_token_accuracy": 0.7223256763070822, "num_tokens": 3110313.0, "step": 110 }, { "entropy": 1.271880804002285, "epoch": 0.192, "grad_norm": 22.625, "learning_rate": 7.933333333333334e-06, "loss": 19.8735, "mean_token_accuracy": 0.7208763293921947, "num_tokens": 3394170.0, "step": 120 }, { "entropy": 1.2141485448926688, "epoch": 0.208, "grad_norm": 20.25, "learning_rate": 8.6e-06, "loss": 19.3285, "mean_token_accuracy": 0.7284529402852058, "num_tokens": 3673600.0, "step": 130 }, { "entropy": 1.1842644162476064, "epoch": 0.224, "grad_norm": 22.25, "learning_rate": 9.266666666666667e-06, "loss": 18.8222, "mean_token_accuracy": 0.7318667653948069, "num_tokens": 3953732.0, "step": 140 }, { "entropy": 1.1367911960929633, "epoch": 0.24, "grad_norm": 21.875, "learning_rate": 9.933333333333334e-06, "loss": 18.0331, "mean_token_accuracy": 0.7411410234868526, "num_tokens": 4243655.0, "step": 150 }, { "entropy": 1.099366496130824, "epoch": 0.256, "grad_norm": 20.875, "learning_rate": 1.0600000000000002e-05, "loss": 17.5038, "mean_token_accuracy": 0.7470747366547584, "num_tokens": 4531471.0, "step": 160 }, { "entropy": 1.1072874456644057, "epoch": 0.272, "grad_norm": 18.125, "learning_rate": 1.1266666666666668e-05, "loss": 17.6358, "mean_token_accuracy": 0.7453399378806352, "num_tokens": 4810284.0, "step": 170 }, { "entropy": 1.0879811342805623, "epoch": 0.288, "grad_norm": 16.375, "learning_rate": 1.1933333333333335e-05, "loss": 17.2142, "mean_token_accuracy": 0.749498899653554, "num_tokens": 5095104.0, "step": 180 }, { "entropy": 1.0824080746620894, "epoch": 0.304, "grad_norm": 16.625, "learning_rate": 1.2600000000000001e-05, "loss": 17.223, "mean_token_accuracy": 0.7493264392018318, "num_tokens": 5383732.0, "step": 190 }, { "entropy": 1.048988512158394, "epoch": 0.32, "grad_norm": 19.125, "learning_rate": 1.3266666666666668e-05, "loss": 16.7231, "mean_token_accuracy": 0.7555124312639236, "num_tokens": 5676334.0, "step": 200 }, { "epoch": 0.32, "eval_biology_entropy": 3.5249868144989014, "eval_biology_loss": 4.3318328857421875, "eval_biology_mean_token_accuracy": 0.39426790857315064, "eval_biology_num_tokens": 5676334.0, "eval_biology_runtime": 49.8836, "eval_biology_samples_per_second": 10.023, "eval_biology_steps_per_second": 2.506, "step": 200 }, { "epoch": 0.32, "eval_chemistry_entropy": 2.3051584205627442, "eval_chemistry_loss": 2.4184799194335938, "eval_chemistry_mean_token_accuracy": 0.5702217166423798, "eval_chemistry_num_tokens": 5676334.0, "eval_chemistry_runtime": 61.8203, "eval_chemistry_samples_per_second": 8.088, "eval_chemistry_steps_per_second": 2.022, "step": 200 }, { "epoch": 0.32, "eval_math_entropy": 1.0760114483833314, "eval_math_loss": 1.0703883171081543, "eval_math_mean_token_accuracy": 0.7492856097221374, "eval_math_num_tokens": 5676334.0, "eval_math_runtime": 63.3918, "eval_math_samples_per_second": 7.887, "eval_math_steps_per_second": 1.972, "step": 200 }, { "epoch": 0.32, "eval_physics_entropy": 1.7677014241218567, "eval_physics_loss": 1.8187588453292847, "eval_physics_mean_token_accuracy": 0.6450894718170166, "eval_physics_num_tokens": 5676334.0, "eval_physics_runtime": 72.1893, "eval_physics_samples_per_second": 6.926, "eval_physics_steps_per_second": 1.732, "step": 200 }, { "entropy": 1.0659823317080737, "epoch": 0.336, "grad_norm": 18.125, "learning_rate": 1.3933333333333334e-05, "loss": 17.0855, "mean_token_accuracy": 0.7487949147820473, "num_tokens": 5958480.0, "step": 210 }, { "entropy": 1.0319596778601408, "epoch": 0.352, "grad_norm": 16.625, "learning_rate": 1.46e-05, "loss": 16.3603, "mean_token_accuracy": 0.7570414818823338, "num_tokens": 6242161.0, "step": 220 }, { "entropy": 1.0307059772312641, "epoch": 0.368, "grad_norm": 19.125, "learning_rate": 1.5266666666666667e-05, "loss": 16.4803, "mean_token_accuracy": 0.7558304250240326, "num_tokens": 6523679.0, "step": 230 }, { "entropy": 1.008485846966505, "epoch": 0.384, "grad_norm": 17.75, "learning_rate": 1.5933333333333336e-05, "loss": 16.1119, "mean_token_accuracy": 0.7602646600455045, "num_tokens": 6810978.0, "step": 240 }, { "entropy": 1.0437373481690884, "epoch": 0.4, "grad_norm": 17.125, "learning_rate": 1.66e-05, "loss": 16.936, "mean_token_accuracy": 0.7534951154142618, "num_tokens": 7096903.0, "step": 250 }, { "entropy": 1.0108945474028588, "epoch": 0.416, "grad_norm": 16.75, "learning_rate": 1.726666666666667e-05, "loss": 16.165, "mean_token_accuracy": 0.7585977960377932, "num_tokens": 7377181.0, "step": 260 }, { "entropy": 0.9808746818453074, "epoch": 0.432, "grad_norm": 19.125, "learning_rate": 1.7933333333333333e-05, "loss": 15.708, "mean_token_accuracy": 0.7660888392478228, "num_tokens": 7650523.0, "step": 270 }, { "entropy": 0.9656771093606948, "epoch": 0.448, "grad_norm": 19.375, "learning_rate": 1.86e-05, "loss": 15.4934, "mean_token_accuracy": 0.7665207251906395, "num_tokens": 7936788.0, "step": 280 }, { "entropy": 0.970192850753665, "epoch": 0.464, "grad_norm": 21.125, "learning_rate": 1.926666666666667e-05, "loss": 15.697, "mean_token_accuracy": 0.7657037891447545, "num_tokens": 8223147.0, "step": 290 }, { "entropy": 0.9936408184468746, "epoch": 0.48, "grad_norm": 18.375, "learning_rate": 1.9933333333333334e-05, "loss": 15.8279, "mean_token_accuracy": 0.7644913751631975, "num_tokens": 8506030.0, "step": 300 }, { "epoch": 0.48, "eval_biology_entropy": 3.547258296966553, "eval_biology_loss": 4.036286354064941, "eval_biology_mean_token_accuracy": 0.4092668824195862, "eval_biology_num_tokens": 8506030.0, "eval_biology_runtime": 49.8646, "eval_biology_samples_per_second": 10.027, "eval_biology_steps_per_second": 2.507, "step": 300 }, { "epoch": 0.48, "eval_chemistry_entropy": 2.243741189956665, "eval_chemistry_loss": 2.2735037803649902, "eval_chemistry_mean_token_accuracy": 0.585998829126358, "eval_chemistry_num_tokens": 8506030.0, "eval_chemistry_runtime": 61.8087, "eval_chemistry_samples_per_second": 8.089, "eval_chemistry_steps_per_second": 2.022, "step": 300 }, { "epoch": 0.48, "eval_math_entropy": 0.969578085899353, "eval_math_loss": 0.9849457740783691, "eval_math_mean_token_accuracy": 0.7633716745376586, "eval_math_num_tokens": 8506030.0, "eval_math_runtime": 63.4567, "eval_math_samples_per_second": 7.879, "eval_math_steps_per_second": 1.97, "step": 300 }, { "epoch": 0.48, "eval_physics_entropy": 1.6970900883674622, "eval_physics_loss": 1.7149206399917603, "eval_physics_mean_token_accuracy": 0.6588896939754486, "eval_physics_num_tokens": 8506030.0, "eval_physics_runtime": 72.1904, "eval_physics_samples_per_second": 6.926, "eval_physics_steps_per_second": 1.732, "step": 300 }, { "entropy": 0.9509127460420131, "epoch": 0.496, "grad_norm": 27.125, "learning_rate": 1.9933333333333334e-05, "loss": 15.2509, "mean_token_accuracy": 0.7699485655874014, "num_tokens": 8788726.0, "step": 310 }, { "entropy": 0.9583811838179827, "epoch": 0.512, "grad_norm": 17.75, "learning_rate": 1.985925925925926e-05, "loss": 15.3156, "mean_token_accuracy": 0.7701035752892494, "num_tokens": 9078039.0, "step": 320 }, { "entropy": 0.9286689113825559, "epoch": 0.528, "grad_norm": 21.375, "learning_rate": 1.9785185185185187e-05, "loss": 14.9975, "mean_token_accuracy": 0.7747602745890617, "num_tokens": 9373860.0, "step": 330 }, { "entropy": 0.93574563190341, "epoch": 0.544, "grad_norm": 19.0, "learning_rate": 1.971111111111111e-05, "loss": 15.1982, "mean_token_accuracy": 0.7723268665373325, "num_tokens": 9660940.0, "step": 340 }, { "entropy": 0.9833962079137564, "epoch": 0.56, "grad_norm": 27.375, "learning_rate": 1.963703703703704e-05, "loss": 15.6566, "mean_token_accuracy": 0.7635074742138386, "num_tokens": 9932302.0, "step": 350 }, { "entropy": 0.9380292896181345, "epoch": 0.576, "grad_norm": 19.5, "learning_rate": 1.9562962962962964e-05, "loss": 15.0896, "mean_token_accuracy": 0.771076611801982, "num_tokens": 10215462.0, "step": 360 }, { "entropy": 0.9226088404655457, "epoch": 0.592, "grad_norm": 17.125, "learning_rate": 1.948888888888889e-05, "loss": 14.9585, "mean_token_accuracy": 0.7726155135780572, "num_tokens": 10504396.0, "step": 370 }, { "entropy": 0.9383664086461068, "epoch": 0.608, "grad_norm": 16.875, "learning_rate": 1.9414814814814817e-05, "loss": 15.0747, "mean_token_accuracy": 0.772657984867692, "num_tokens": 10793126.0, "step": 380 }, { "entropy": 0.9145499892532826, "epoch": 0.624, "grad_norm": 18.875, "learning_rate": 1.9340740740740743e-05, "loss": 14.7611, "mean_token_accuracy": 0.7738783087581396, "num_tokens": 11081768.0, "step": 390 }, { "entropy": 0.9076093930751086, "epoch": 0.64, "grad_norm": 17.5, "learning_rate": 1.926666666666667e-05, "loss": 14.491, "mean_token_accuracy": 0.7761218074709177, "num_tokens": 11370320.0, "step": 400 }, { "epoch": 0.64, "eval_biology_entropy": 3.4644125595092774, "eval_biology_loss": 3.9791781902313232, "eval_biology_mean_token_accuracy": 0.4146076333522797, "eval_biology_num_tokens": 11370320.0, "eval_biology_runtime": 49.8972, "eval_biology_samples_per_second": 10.021, "eval_biology_steps_per_second": 2.505, "step": 400 }, { "epoch": 0.64, "eval_chemistry_entropy": 2.1470180315971374, "eval_chemistry_loss": 2.1781718730926514, "eval_chemistry_mean_token_accuracy": 0.5962020962238311, "eval_chemistry_num_tokens": 11370320.0, "eval_chemistry_runtime": 61.8451, "eval_chemistry_samples_per_second": 8.085, "eval_chemistry_steps_per_second": 2.021, "step": 400 }, { "epoch": 0.64, "eval_math_entropy": 0.9346609992980957, "eval_math_loss": 0.9384942054748535, "eval_math_mean_token_accuracy": 0.771290735244751, "eval_math_num_tokens": 11370320.0, "eval_math_runtime": 63.3885, "eval_math_samples_per_second": 7.888, "eval_math_steps_per_second": 1.972, "step": 400 }, { "epoch": 0.64, "eval_physics_entropy": 1.6158739218711853, "eval_physics_loss": 1.6468137502670288, "eval_physics_mean_token_accuracy": 0.6678030126094818, "eval_physics_num_tokens": 11370320.0, "eval_physics_runtime": 72.1919, "eval_physics_samples_per_second": 6.926, "eval_physics_steps_per_second": 1.731, "step": 400 }, { "entropy": 0.8844340030103922, "epoch": 0.656, "grad_norm": 15.625, "learning_rate": 1.9192592592592593e-05, "loss": 14.2919, "mean_token_accuracy": 0.7790201846510172, "num_tokens": 11657835.0, "step": 410 }, { "entropy": 0.9230461683124304, "epoch": 0.672, "grad_norm": 17.25, "learning_rate": 1.911851851851852e-05, "loss": 14.8067, "mean_token_accuracy": 0.7730407487601042, "num_tokens": 11949262.0, "step": 420 }, { "entropy": 0.8864722024649382, "epoch": 0.688, "grad_norm": 14.5625, "learning_rate": 1.9044444444444446e-05, "loss": 14.2543, "mean_token_accuracy": 0.7801163170486689, "num_tokens": 12227640.0, "step": 430 }, { "entropy": 0.8921929065138101, "epoch": 0.704, "grad_norm": 17.5, "learning_rate": 1.8970370370370372e-05, "loss": 14.3854, "mean_token_accuracy": 0.7790493600070476, "num_tokens": 12516641.0, "step": 440 }, { "entropy": 0.9114352393895387, "epoch": 0.72, "grad_norm": 18.75, "learning_rate": 1.8896296296296295e-05, "loss": 14.7549, "mean_token_accuracy": 0.7744539573788642, "num_tokens": 12793343.0, "step": 450 }, { "entropy": 0.9004437670111656, "epoch": 0.736, "grad_norm": 16.625, "learning_rate": 1.8822222222222225e-05, "loss": 14.4296, "mean_token_accuracy": 0.7780555929988623, "num_tokens": 13077981.0, "step": 460 }, { "entropy": 0.8800053089857102, "epoch": 0.752, "grad_norm": 19.5, "learning_rate": 1.874814814814815e-05, "loss": 14.1044, "mean_token_accuracy": 0.7824321106076241, "num_tokens": 13358957.0, "step": 470 }, { "entropy": 0.8506957441568375, "epoch": 0.768, "grad_norm": 15.25, "learning_rate": 1.8674074074074075e-05, "loss": 13.7613, "mean_token_accuracy": 0.7884838156402111, "num_tokens": 13653412.0, "step": 480 }, { "entropy": 0.9173242174088955, "epoch": 0.784, "grad_norm": 18.125, "learning_rate": 1.86e-05, "loss": 14.7366, "mean_token_accuracy": 0.776311819255352, "num_tokens": 13940856.0, "step": 490 }, { "entropy": 0.9153516864404082, "epoch": 0.8, "grad_norm": 18.5, "learning_rate": 1.8525925925925928e-05, "loss": 14.84, "mean_token_accuracy": 0.7749685924500227, "num_tokens": 14230754.0, "step": 500 }, { "epoch": 0.8, "eval_biology_entropy": 3.587346399307251, "eval_biology_loss": 3.8396143913269043, "eval_biology_mean_token_accuracy": 0.42098368620872495, "eval_biology_num_tokens": 14230754.0, "eval_biology_runtime": 49.8808, "eval_biology_samples_per_second": 10.024, "eval_biology_steps_per_second": 2.506, "step": 500 }, { "epoch": 0.8, "eval_chemistry_entropy": 2.168390507221222, "eval_chemistry_loss": 2.1044840812683105, "eval_chemistry_mean_token_accuracy": 0.6037136344909668, "eval_chemistry_num_tokens": 14230754.0, "eval_chemistry_runtime": 61.4676, "eval_chemistry_samples_per_second": 8.134, "eval_chemistry_steps_per_second": 2.034, "step": 500 }, { "epoch": 0.8, "eval_math_entropy": 0.9173241977691651, "eval_math_loss": 0.911715030670166, "eval_math_mean_token_accuracy": 0.7763807888031006, "eval_math_num_tokens": 14230754.0, "eval_math_runtime": 63.353, "eval_math_samples_per_second": 7.892, "eval_math_steps_per_second": 1.973, "step": 500 }, { "epoch": 0.8, "eval_physics_entropy": 1.6311655960083007, "eval_physics_loss": 1.610815167427063, "eval_physics_mean_token_accuracy": 0.6733945074081421, "eval_physics_num_tokens": 14230754.0, "eval_physics_runtime": 72.3928, "eval_physics_samples_per_second": 6.907, "eval_physics_steps_per_second": 1.727, "step": 500 }, { "entropy": 0.8935465440154076, "epoch": 0.816, "grad_norm": 16.875, "learning_rate": 1.8451851851851855e-05, "loss": 14.1977, "mean_token_accuracy": 0.7802476394921541, "num_tokens": 14519893.0, "step": 510 }, { "entropy": 0.8838269270956516, "epoch": 0.832, "grad_norm": 17.375, "learning_rate": 1.8377777777777778e-05, "loss": 14.2483, "mean_token_accuracy": 0.7819081086665391, "num_tokens": 14805088.0, "step": 520 }, { "entropy": 0.9093101542443037, "epoch": 0.848, "grad_norm": 18.125, "learning_rate": 1.8303703703703704e-05, "loss": 14.6777, "mean_token_accuracy": 0.7753469321876765, "num_tokens": 15086306.0, "step": 530 }, { "entropy": 0.900922878831625, "epoch": 0.864, "grad_norm": 15.0, "learning_rate": 1.822962962962963e-05, "loss": 14.4243, "mean_token_accuracy": 0.778496702387929, "num_tokens": 15370985.0, "step": 540 }, { "entropy": 0.9037761628627777, "epoch": 0.88, "grad_norm": 20.5, "learning_rate": 1.8155555555555557e-05, "loss": 14.5507, "mean_token_accuracy": 0.7772340852767229, "num_tokens": 15653836.0, "step": 550 }, { "entropy": 0.8908271692693234, "epoch": 0.896, "grad_norm": 21.375, "learning_rate": 1.8081481481481484e-05, "loss": 14.3982, "mean_token_accuracy": 0.7781253006309271, "num_tokens": 15932179.0, "step": 560 }, { "entropy": 0.8755331316962838, "epoch": 0.912, "grad_norm": 17.5, "learning_rate": 1.800740740740741e-05, "loss": 13.9566, "mean_token_accuracy": 0.7827421851456166, "num_tokens": 16219842.0, "step": 570 }, { "entropy": 0.8637437466531992, "epoch": 0.928, "grad_norm": 19.875, "learning_rate": 1.7933333333333333e-05, "loss": 13.9502, "mean_token_accuracy": 0.7828048501163721, "num_tokens": 16499842.0, "step": 580 }, { "entropy": 0.8896576169878244, "epoch": 0.944, "grad_norm": 35.5, "learning_rate": 1.785925925925926e-05, "loss": 14.3781, "mean_token_accuracy": 0.7790289539843798, "num_tokens": 16781882.0, "step": 590 }, { "entropy": 0.9052465215325356, "epoch": 0.96, "grad_norm": 15.4375, "learning_rate": 1.7785185185185186e-05, "loss": 14.4585, "mean_token_accuracy": 0.7787603437900543, "num_tokens": 17067407.0, "step": 600 }, { "epoch": 0.96, "eval_biology_entropy": 3.393627851486206, "eval_biology_loss": 3.804689407348633, "eval_biology_mean_token_accuracy": 0.42461460971832277, "eval_biology_num_tokens": 17067407.0, "eval_biology_runtime": 49.7301, "eval_biology_samples_per_second": 10.054, "eval_biology_steps_per_second": 2.514, "step": 600 }, { "epoch": 0.96, "eval_chemistry_entropy": 2.023163953304291, "eval_chemistry_loss": 2.059583902359009, "eval_chemistry_mean_token_accuracy": 0.608015153169632, "eval_chemistry_num_tokens": 17067407.0, "eval_chemistry_runtime": 61.6301, "eval_chemistry_samples_per_second": 8.113, "eval_chemistry_steps_per_second": 2.028, "step": 600 }, { "epoch": 0.96, "eval_math_entropy": 0.886635826587677, "eval_math_loss": 0.8929259777069092, "eval_math_mean_token_accuracy": 0.7791896166801453, "eval_math_num_tokens": 17067407.0, "eval_math_runtime": 63.1881, "eval_math_samples_per_second": 7.913, "eval_math_steps_per_second": 1.978, "step": 600 }, { "epoch": 0.96, "eval_physics_entropy": 1.5457578830718994, "eval_physics_loss": 1.5915517807006836, "eval_physics_mean_token_accuracy": 0.6765441355705262, "eval_physics_num_tokens": 17067407.0, "eval_physics_runtime": 71.9901, "eval_physics_samples_per_second": 6.945, "eval_physics_steps_per_second": 1.736, "step": 600 }, { "entropy": 0.8741790272295475, "epoch": 0.976, "grad_norm": 20.125, "learning_rate": 1.7711111111111113e-05, "loss": 14.0666, "mean_token_accuracy": 0.7843787413090467, "num_tokens": 17350994.0, "step": 610 }, { "entropy": 0.8895185798406601, "epoch": 0.992, "grad_norm": 16.875, "learning_rate": 1.763703703703704e-05, "loss": 14.2795, "mean_token_accuracy": 0.7792422734200954, "num_tokens": 17637514.0, "step": 620 }, { "entropy": 0.8725822484120727, "epoch": 1.008, "grad_norm": 16.25, "learning_rate": 1.7562962962962962e-05, "loss": 13.9234, "mean_token_accuracy": 0.7825319610536099, "num_tokens": 17926570.0, "step": 630 }, { "entropy": 0.859299965761602, "epoch": 1.024, "grad_norm": 16.375, "learning_rate": 1.7488888888888892e-05, "loss": 13.7487, "mean_token_accuracy": 0.7849128067493438, "num_tokens": 18207652.0, "step": 640 }, { "entropy": 0.8529570057988167, "epoch": 1.04, "grad_norm": 18.5, "learning_rate": 1.7414814814814815e-05, "loss": 13.7357, "mean_token_accuracy": 0.7864760752767325, "num_tokens": 18484931.0, "step": 650 }, { "entropy": 0.8539858607575297, "epoch": 1.056, "grad_norm": 18.0, "learning_rate": 1.7340740740740742e-05, "loss": 13.8974, "mean_token_accuracy": 0.7846961792558431, "num_tokens": 18773457.0, "step": 660 }, { "entropy": 0.8639433614909648, "epoch": 1.072, "grad_norm": 17.625, "learning_rate": 1.726666666666667e-05, "loss": 13.7869, "mean_token_accuracy": 0.7835282500833273, "num_tokens": 19055365.0, "step": 670 }, { "entropy": 0.8525660615414381, "epoch": 1.088, "grad_norm": 15.875, "learning_rate": 1.7192592592592595e-05, "loss": 13.7686, "mean_token_accuracy": 0.786788547411561, "num_tokens": 19345730.0, "step": 680 }, { "entropy": 0.8591295637190342, "epoch": 1.104, "grad_norm": 14.4375, "learning_rate": 1.711851851851852e-05, "loss": 13.7706, "mean_token_accuracy": 0.7860686339437961, "num_tokens": 19637390.0, "step": 690 }, { "entropy": 0.8316973434761167, "epoch": 1.12, "grad_norm": 18.0, "learning_rate": 1.7044444444444445e-05, "loss": 13.3392, "mean_token_accuracy": 0.7907124478369951, "num_tokens": 19923914.0, "step": 700 }, { "epoch": 1.12, "eval_biology_entropy": 3.372835781097412, "eval_biology_loss": 3.8594861030578613, "eval_biology_mean_token_accuracy": 0.42449810791015624, "eval_biology_num_tokens": 19923914.0, "eval_biology_runtime": 49.7902, "eval_biology_samples_per_second": 10.042, "eval_biology_steps_per_second": 2.511, "step": 700 }, { "epoch": 1.12, "eval_chemistry_entropy": 1.985270866394043, "eval_chemistry_loss": 2.0690011978149414, "eval_chemistry_mean_token_accuracy": 0.6098231360912323, "eval_chemistry_num_tokens": 19923914.0, "eval_chemistry_runtime": 61.935, "eval_chemistry_samples_per_second": 8.073, "eval_chemistry_steps_per_second": 2.018, "step": 700 }, { "epoch": 1.12, "eval_math_entropy": 0.8660009272098541, "eval_math_loss": 0.8819393515586853, "eval_math_mean_token_accuracy": 0.7814530291557312, "eval_math_num_tokens": 19923914.0, "eval_math_runtime": 63.377, "eval_math_samples_per_second": 7.889, "eval_math_steps_per_second": 1.972, "step": 700 }, { "epoch": 1.12, "eval_physics_entropy": 1.5101813049316406, "eval_physics_loss": 1.5800307989120483, "eval_physics_mean_token_accuracy": 0.6781299662590027, "eval_physics_num_tokens": 19923914.0, "eval_physics_runtime": 72.2227, "eval_physics_samples_per_second": 6.923, "eval_physics_steps_per_second": 1.731, "step": 700 }, { "entropy": 0.8278432639315725, "epoch": 1.1360000000000001, "grad_norm": 16.875, "learning_rate": 1.697037037037037e-05, "loss": 13.3745, "mean_token_accuracy": 0.7893257014453411, "num_tokens": 20201892.0, "step": 710 }, { "entropy": 0.8526920214295387, "epoch": 1.152, "grad_norm": 19.0, "learning_rate": 1.6896296296296298e-05, "loss": 13.6557, "mean_token_accuracy": 0.7878582254052162, "num_tokens": 20490282.0, "step": 720 }, { "entropy": 0.8450694579631091, "epoch": 1.168, "grad_norm": 15.875, "learning_rate": 1.6822222222222224e-05, "loss": 13.6233, "mean_token_accuracy": 0.787573279067874, "num_tokens": 20785786.0, "step": 730 }, { "entropy": 0.8410906136035919, "epoch": 1.184, "grad_norm": 19.5, "learning_rate": 1.6748148148148147e-05, "loss": 13.5188, "mean_token_accuracy": 0.7897454522550106, "num_tokens": 21074205.0, "step": 740 }, { "entropy": 0.8331539599224925, "epoch": 1.2, "grad_norm": 18.625, "learning_rate": 1.6674074074074077e-05, "loss": 13.3771, "mean_token_accuracy": 0.7902466226369143, "num_tokens": 21369159.0, "step": 750 }, { "entropy": 0.8552144020795822, "epoch": 1.216, "grad_norm": 22.25, "learning_rate": 1.66e-05, "loss": 13.7361, "mean_token_accuracy": 0.7870386108756066, "num_tokens": 21649178.0, "step": 760 }, { "entropy": 0.8264798279851675, "epoch": 1.232, "grad_norm": 18.25, "learning_rate": 1.6525925925925927e-05, "loss": 13.2327, "mean_token_accuracy": 0.7920120354741812, "num_tokens": 21930239.0, "step": 770 }, { "entropy": 0.8188742805272341, "epoch": 1.248, "grad_norm": 16.625, "learning_rate": 1.6451851851851853e-05, "loss": 13.1839, "mean_token_accuracy": 0.7921639878302813, "num_tokens": 22216387.0, "step": 780 }, { "entropy": 0.8594657603651286, "epoch": 1.264, "grad_norm": 19.125, "learning_rate": 1.637777777777778e-05, "loss": 13.88, "mean_token_accuracy": 0.7837325409054756, "num_tokens": 22501002.0, "step": 790 }, { "entropy": 0.8564920598641038, "epoch": 1.28, "grad_norm": 21.375, "learning_rate": 1.6303703703703706e-05, "loss": 13.6499, "mean_token_accuracy": 0.7877908434718848, "num_tokens": 22779682.0, "step": 800 }, { "epoch": 1.28, "eval_biology_entropy": 3.376770601272583, "eval_biology_loss": 3.801596164703369, "eval_biology_mean_token_accuracy": 0.4262664563655853, "eval_biology_num_tokens": 22779682.0, "eval_biology_runtime": 49.8531, "eval_biology_samples_per_second": 10.029, "eval_biology_steps_per_second": 2.507, "step": 800 }, { "epoch": 1.28, "eval_chemistry_entropy": 1.9474885282516479, "eval_chemistry_loss": 2.0175445079803467, "eval_chemistry_mean_token_accuracy": 0.6140047969818115, "eval_chemistry_num_tokens": 22779682.0, "eval_chemistry_runtime": 61.8009, "eval_chemistry_samples_per_second": 8.09, "eval_chemistry_steps_per_second": 2.023, "step": 800 }, { "epoch": 1.28, "eval_math_entropy": 0.839567274093628, "eval_math_loss": 0.8709415197372437, "eval_math_mean_token_accuracy": 0.783750717163086, "eval_math_num_tokens": 22779682.0, "eval_math_runtime": 63.3865, "eval_math_samples_per_second": 7.888, "eval_math_steps_per_second": 1.972, "step": 800 }, { "epoch": 1.28, "eval_physics_entropy": 1.488646312236786, "eval_physics_loss": 1.5581644773483276, "eval_physics_mean_token_accuracy": 0.6805251049995422, "eval_physics_num_tokens": 22779682.0, "eval_physics_runtime": 72.4407, "eval_physics_samples_per_second": 6.902, "eval_physics_steps_per_second": 1.726, "step": 800 }, { "entropy": 0.8242643151432276, "epoch": 1.296, "grad_norm": 16.75, "learning_rate": 1.622962962962963e-05, "loss": 13.3485, "mean_token_accuracy": 0.7901775784790516, "num_tokens": 23057744.0, "step": 810 }, { "entropy": 0.8360366895794868, "epoch": 1.312, "grad_norm": 20.25, "learning_rate": 1.6155555555555556e-05, "loss": 13.6015, "mean_token_accuracy": 0.7880489483475686, "num_tokens": 23344644.0, "step": 820 }, { "entropy": 0.8606906643137335, "epoch": 1.328, "grad_norm": 19.0, "learning_rate": 1.6081481481481482e-05, "loss": 13.6974, "mean_token_accuracy": 0.7855447053909301, "num_tokens": 23622405.0, "step": 830 }, { "entropy": 0.8278559835627675, "epoch": 1.3439999999999999, "grad_norm": 16.5, "learning_rate": 1.600740740740741e-05, "loss": 13.2704, "mean_token_accuracy": 0.7906895779073239, "num_tokens": 23899771.0, "step": 840 }, { "entropy": 0.8220528084784746, "epoch": 1.3599999999999999, "grad_norm": 15.5625, "learning_rate": 1.5933333333333336e-05, "loss": 13.3103, "mean_token_accuracy": 0.7912323564291001, "num_tokens": 24187023.0, "step": 850 }, { "entropy": 0.8924444895237684, "epoch": 1.376, "grad_norm": 20.875, "learning_rate": 1.5859259259259262e-05, "loss": 14.3535, "mean_token_accuracy": 0.7774244502186776, "num_tokens": 24466132.0, "step": 860 }, { "entropy": 0.8332192454487085, "epoch": 1.392, "grad_norm": 19.5, "learning_rate": 1.5785185185185185e-05, "loss": 13.3708, "mean_token_accuracy": 0.7895895481109619, "num_tokens": 24748043.0, "step": 870 }, { "entropy": 0.816296424344182, "epoch": 1.408, "grad_norm": 19.75, "learning_rate": 1.571111111111111e-05, "loss": 13.2019, "mean_token_accuracy": 0.7928898848593235, "num_tokens": 25036674.0, "step": 880 }, { "entropy": 0.81595440171659, "epoch": 1.424, "grad_norm": 14.375, "learning_rate": 1.5637037037037038e-05, "loss": 13.1288, "mean_token_accuracy": 0.793658908829093, "num_tokens": 25324579.0, "step": 890 }, { "entropy": 0.8419610138982534, "epoch": 1.44, "grad_norm": 18.75, "learning_rate": 1.5562962962962965e-05, "loss": 13.6293, "mean_token_accuracy": 0.7877266079187393, "num_tokens": 25606824.0, "step": 900 }, { "epoch": 1.44, "eval_biology_entropy": 3.37851238822937, "eval_biology_loss": 3.7537248134613037, "eval_biology_mean_token_accuracy": 0.42950630354881286, "eval_biology_num_tokens": 25606824.0, "eval_biology_runtime": 49.7825, "eval_biology_samples_per_second": 10.044, "eval_biology_steps_per_second": 2.511, "step": 900 }, { "epoch": 1.44, "eval_chemistry_entropy": 1.9617934045791625, "eval_chemistry_loss": 2.017207145690918, "eval_chemistry_mean_token_accuracy": 0.6149281630516052, "eval_chemistry_num_tokens": 25606824.0, "eval_chemistry_runtime": 61.6914, "eval_chemistry_samples_per_second": 8.105, "eval_chemistry_steps_per_second": 2.026, "step": 900 }, { "epoch": 1.44, "eval_math_entropy": 0.8517674751281739, "eval_math_loss": 0.8629322648048401, "eval_math_mean_token_accuracy": 0.7848198075294495, "eval_math_num_tokens": 25606824.0, "eval_math_runtime": 63.2603, "eval_math_samples_per_second": 7.904, "eval_math_steps_per_second": 1.976, "step": 900 }, { "epoch": 1.44, "eval_physics_entropy": 1.496031648159027, "eval_physics_loss": 1.5447328090667725, "eval_physics_mean_token_accuracy": 0.6830805406570435, "eval_physics_num_tokens": 25606824.0, "eval_physics_runtime": 72.0951, "eval_physics_samples_per_second": 6.935, "eval_physics_steps_per_second": 1.734, "step": 900 }, { "entropy": 0.8286773880943656, "epoch": 1.456, "grad_norm": 15.3125, "learning_rate": 1.548888888888889e-05, "loss": 13.1968, "mean_token_accuracy": 0.7905922997742891, "num_tokens": 25886396.0, "step": 910 }, { "entropy": 0.8260000789538026, "epoch": 1.472, "grad_norm": 17.125, "learning_rate": 1.5414814814814814e-05, "loss": 13.2698, "mean_token_accuracy": 0.790665191039443, "num_tokens": 26163618.0, "step": 920 }, { "entropy": 0.8521020160987973, "epoch": 1.488, "grad_norm": 19.0, "learning_rate": 1.5340740740740744e-05, "loss": 13.8606, "mean_token_accuracy": 0.7840919081121683, "num_tokens": 26438338.0, "step": 930 }, { "entropy": 0.8122210435569286, "epoch": 1.504, "grad_norm": 21.0, "learning_rate": 1.5266666666666667e-05, "loss": 13.1256, "mean_token_accuracy": 0.7937585778534413, "num_tokens": 26729255.0, "step": 940 }, { "entropy": 0.8557528490200639, "epoch": 1.52, "grad_norm": 16.5, "learning_rate": 1.5192592592592594e-05, "loss": 13.6488, "mean_token_accuracy": 0.7870340771973133, "num_tokens": 27017935.0, "step": 950 }, { "entropy": 0.8247261323034764, "epoch": 1.536, "grad_norm": 15.1875, "learning_rate": 1.5118518518518519e-05, "loss": 13.3195, "mean_token_accuracy": 0.7908138386905194, "num_tokens": 27306339.0, "step": 960 }, { "entropy": 0.8346869014203548, "epoch": 1.552, "grad_norm": 17.375, "learning_rate": 1.5044444444444445e-05, "loss": 13.3502, "mean_token_accuracy": 0.7901338282972574, "num_tokens": 27591959.0, "step": 970 }, { "entropy": 0.8050638379529118, "epoch": 1.568, "grad_norm": 15.625, "learning_rate": 1.497037037037037e-05, "loss": 13.0359, "mean_token_accuracy": 0.7952109534293413, "num_tokens": 27884398.0, "step": 980 }, { "entropy": 0.8222229275852442, "epoch": 1.584, "grad_norm": 19.75, "learning_rate": 1.4896296296296298e-05, "loss": 13.2707, "mean_token_accuracy": 0.7913329377770424, "num_tokens": 28171274.0, "step": 990 }, { "entropy": 0.8244392290711403, "epoch": 1.6, "grad_norm": 20.0, "learning_rate": 1.4822222222222225e-05, "loss": 13.1036, "mean_token_accuracy": 0.7916798040270805, "num_tokens": 28457624.0, "step": 1000 }, { "epoch": 1.6, "eval_biology_entropy": 3.3615621147155763, "eval_biology_loss": 3.8202450275421143, "eval_biology_mean_token_accuracy": 0.4290500741004944, "eval_biology_num_tokens": 28457624.0, "eval_biology_runtime": 49.8587, "eval_biology_samples_per_second": 10.028, "eval_biology_steps_per_second": 2.507, "step": 1000 }, { "epoch": 1.6, "eval_chemistry_entropy": 1.937631681919098, "eval_chemistry_loss": 2.0187153816223145, "eval_chemistry_mean_token_accuracy": 0.6154364368915558, "eval_chemistry_num_tokens": 28457624.0, "eval_chemistry_runtime": 61.3814, "eval_chemistry_samples_per_second": 8.146, "eval_chemistry_steps_per_second": 2.036, "step": 1000 }, { "epoch": 1.6, "eval_math_entropy": 0.8357908477783204, "eval_math_loss": 0.857028603553772, "eval_math_mean_token_accuracy": 0.7858444275856018, "eval_math_num_tokens": 28457624.0, "eval_math_runtime": 63.398, "eval_math_samples_per_second": 7.887, "eval_math_steps_per_second": 1.972, "step": 1000 }, { "epoch": 1.6, "eval_physics_entropy": 1.4734057002067567, "eval_physics_loss": 1.5425677299499512, "eval_physics_mean_token_accuracy": 0.6834757437705994, "eval_physics_num_tokens": 28457624.0, "eval_physics_runtime": 72.3807, "eval_physics_samples_per_second": 6.908, "eval_physics_steps_per_second": 1.727, "step": 1000 }, { "entropy": 0.8183613562956452, "epoch": 1.616, "grad_norm": 15.8125, "learning_rate": 1.474814814814815e-05, "loss": 13.3252, "mean_token_accuracy": 0.789626444876194, "num_tokens": 28743099.0, "step": 1010 }, { "entropy": 0.8398883065208793, "epoch": 1.6320000000000001, "grad_norm": 18.25, "learning_rate": 1.4674074074074076e-05, "loss": 13.3764, "mean_token_accuracy": 0.7887058384716511, "num_tokens": 29017297.0, "step": 1020 }, { "entropy": 0.809584242105484, "epoch": 1.6480000000000001, "grad_norm": 18.5, "learning_rate": 1.46e-05, "loss": 13.0665, "mean_token_accuracy": 0.7953496225178242, "num_tokens": 29303707.0, "step": 1030 }, { "entropy": 0.8161404246464372, "epoch": 1.6640000000000001, "grad_norm": 18.0, "learning_rate": 1.4525925925925927e-05, "loss": 13.1101, "mean_token_accuracy": 0.7914217729121447, "num_tokens": 29597156.0, "step": 1040 }, { "entropy": 0.8350240230560303, "epoch": 1.6800000000000002, "grad_norm": 16.375, "learning_rate": 1.4451851851851852e-05, "loss": 13.3906, "mean_token_accuracy": 0.7913927461951971, "num_tokens": 29883879.0, "step": 1050 }, { "entropy": 0.8350640160962939, "epoch": 1.696, "grad_norm": 18.75, "learning_rate": 1.4377777777777779e-05, "loss": 13.5113, "mean_token_accuracy": 0.7893182434141636, "num_tokens": 30165760.0, "step": 1060 }, { "entropy": 0.8064091168344021, "epoch": 1.712, "grad_norm": 17.0, "learning_rate": 1.4303703703703703e-05, "loss": 13.0678, "mean_token_accuracy": 0.7943233713507653, "num_tokens": 30460367.0, "step": 1070 }, { "entropy": 0.8139542754739523, "epoch": 1.728, "grad_norm": 23.25, "learning_rate": 1.4229629629629632e-05, "loss": 12.9895, "mean_token_accuracy": 0.7952147454023362, "num_tokens": 30739137.0, "step": 1080 }, { "entropy": 0.8204835077747703, "epoch": 1.744, "grad_norm": 18.5, "learning_rate": 1.4155555555555556e-05, "loss": 13.1414, "mean_token_accuracy": 0.7930131111294031, "num_tokens": 31022663.0, "step": 1090 }, { "entropy": 0.8065364792943001, "epoch": 1.76, "grad_norm": 17.375, "learning_rate": 1.4081481481481483e-05, "loss": 13.0088, "mean_token_accuracy": 0.7931559670716524, "num_tokens": 31306494.0, "step": 1100 }, { "epoch": 1.76, "eval_biology_entropy": 3.3287238235473633, "eval_biology_loss": 3.7768468856811523, "eval_biology_mean_token_accuracy": 0.4298539180755615, "eval_biology_num_tokens": 31306494.0, "eval_biology_runtime": 49.8201, "eval_biology_samples_per_second": 10.036, "eval_biology_steps_per_second": 2.509, "step": 1100 }, { "epoch": 1.76, "eval_chemistry_entropy": 1.9171943378448486, "eval_chemistry_loss": 2.0048129558563232, "eval_chemistry_mean_token_accuracy": 0.6168817098140716, "eval_chemistry_num_tokens": 31306494.0, "eval_chemistry_runtime": 61.7404, "eval_chemistry_samples_per_second": 8.098, "eval_chemistry_steps_per_second": 2.025, "step": 1100 }, { "epoch": 1.76, "eval_math_entropy": 0.8366622452735901, "eval_math_loss": 0.851693332195282, "eval_math_mean_token_accuracy": 0.7872344598770141, "eval_math_num_tokens": 31306494.0, "eval_math_runtime": 63.5475, "eval_math_samples_per_second": 7.868, "eval_math_steps_per_second": 1.967, "step": 1100 }, { "epoch": 1.76, "eval_physics_entropy": 1.4740583176612854, "eval_physics_loss": 1.5396031141281128, "eval_physics_mean_token_accuracy": 0.6840226426124573, "eval_physics_num_tokens": 31306494.0, "eval_physics_runtime": 72.1115, "eval_physics_samples_per_second": 6.934, "eval_physics_steps_per_second": 1.733, "step": 1100 }, { "entropy": 0.8262896530330182, "epoch": 1.776, "grad_norm": 17.75, "learning_rate": 1.400740740740741e-05, "loss": 13.274, "mean_token_accuracy": 0.7924778901040554, "num_tokens": 31595542.0, "step": 1110 }, { "entropy": 0.8307720847427845, "epoch": 1.792, "grad_norm": 17.125, "learning_rate": 1.3933333333333334e-05, "loss": 13.4174, "mean_token_accuracy": 0.7890391640365124, "num_tokens": 31881189.0, "step": 1120 }, { "entropy": 0.8346689926460386, "epoch": 1.808, "grad_norm": 16.625, "learning_rate": 1.385925925925926e-05, "loss": 13.3517, "mean_token_accuracy": 0.7908568613231182, "num_tokens": 32164196.0, "step": 1130 }, { "entropy": 0.7776938879862427, "epoch": 1.8239999999999998, "grad_norm": 18.0, "learning_rate": 1.3785185185185186e-05, "loss": 12.5276, "mean_token_accuracy": 0.7996609631925822, "num_tokens": 32441530.0, "step": 1140 }, { "entropy": 0.8081878338009119, "epoch": 1.8399999999999999, "grad_norm": 19.75, "learning_rate": 1.3711111111111112e-05, "loss": 13.0399, "mean_token_accuracy": 0.794286473467946, "num_tokens": 32723145.0, "step": 1150 }, { "entropy": 0.8392048073932529, "epoch": 1.8559999999999999, "grad_norm": 16.25, "learning_rate": 1.3637037037037037e-05, "loss": 13.4004, "mean_token_accuracy": 0.7900693397969007, "num_tokens": 33011263.0, "step": 1160 }, { "entropy": 0.8063056256622076, "epoch": 1.8719999999999999, "grad_norm": 16.625, "learning_rate": 1.3562962962962965e-05, "loss": 13.0163, "mean_token_accuracy": 0.7964250590652228, "num_tokens": 33298921.0, "step": 1170 }, { "entropy": 0.8290531987324357, "epoch": 1.888, "grad_norm": 19.0, "learning_rate": 1.3488888888888888e-05, "loss": 13.2527, "mean_token_accuracy": 0.7908387798815966, "num_tokens": 33576243.0, "step": 1180 }, { "entropy": 0.8150693111121654, "epoch": 1.904, "grad_norm": 16.25, "learning_rate": 1.3414814814814817e-05, "loss": 13.1745, "mean_token_accuracy": 0.7923198211938143, "num_tokens": 33850968.0, "step": 1190 }, { "entropy": 0.7930012326687574, "epoch": 1.92, "grad_norm": 15.875, "learning_rate": 1.3340740740740741e-05, "loss": 12.7681, "mean_token_accuracy": 0.7967522501945495, "num_tokens": 34128558.0, "step": 1200 }, { "epoch": 1.92, "eval_biology_entropy": 3.286720132827759, "eval_biology_loss": 3.7095859050750732, "eval_biology_mean_token_accuracy": 0.43265608382225035, "eval_biology_num_tokens": 34128558.0, "eval_biology_runtime": 49.805, "eval_biology_samples_per_second": 10.039, "eval_biology_steps_per_second": 2.51, "step": 1200 }, { "epoch": 1.92, "eval_chemistry_entropy": 1.898814220905304, "eval_chemistry_loss": 1.9991657733917236, "eval_chemistry_mean_token_accuracy": 0.6174907529354096, "eval_chemistry_num_tokens": 34128558.0, "eval_chemistry_runtime": 61.7352, "eval_chemistry_samples_per_second": 8.099, "eval_chemistry_steps_per_second": 2.025, "step": 1200 }, { "epoch": 1.92, "eval_math_entropy": 0.8344658007621765, "eval_math_loss": 0.8473447561264038, "eval_math_mean_token_accuracy": 0.7876402115821839, "eval_math_num_tokens": 34128558.0, "eval_math_runtime": 63.3639, "eval_math_samples_per_second": 7.891, "eval_math_steps_per_second": 1.973, "step": 1200 }, { "epoch": 1.92, "eval_physics_entropy": 1.4668292608261109, "eval_physics_loss": 1.5414491891860962, "eval_physics_mean_token_accuracy": 0.6841273403167725, "eval_physics_num_tokens": 34128558.0, "eval_physics_runtime": 72.1478, "eval_physics_samples_per_second": 6.93, "eval_physics_steps_per_second": 1.733, "step": 1200 }, { "entropy": 0.822460556961596, "epoch": 1.936, "grad_norm": 22.375, "learning_rate": 1.3266666666666668e-05, "loss": 13.1389, "mean_token_accuracy": 0.7921592094004154, "num_tokens": 34408056.0, "step": 1210 }, { "entropy": 0.807614759914577, "epoch": 1.952, "grad_norm": 19.625, "learning_rate": 1.3192592592592594e-05, "loss": 13.0525, "mean_token_accuracy": 0.7933488007634878, "num_tokens": 34684679.0, "step": 1220 }, { "entropy": 0.8354181025177241, "epoch": 1.968, "grad_norm": 18.375, "learning_rate": 1.311851851851852e-05, "loss": 13.5335, "mean_token_accuracy": 0.788308460265398, "num_tokens": 34971038.0, "step": 1230 }, { "entropy": 0.8086721919476986, "epoch": 1.984, "grad_norm": 16.5, "learning_rate": 1.3044444444444446e-05, "loss": 12.9467, "mean_token_accuracy": 0.7951289299875498, "num_tokens": 35262281.0, "step": 1240 }, { "entropy": 0.7930673930794001, "epoch": 2.0, "grad_norm": 22.75, "learning_rate": 1.297037037037037e-05, "loss": 12.7941, "mean_token_accuracy": 0.79657237008214, "num_tokens": 35560864.0, "step": 1250 }, { "entropy": 0.8225008610635995, "epoch": 2.016, "grad_norm": 15.25, "learning_rate": 1.2896296296296299e-05, "loss": 13.1411, "mean_token_accuracy": 0.7936458811163902, "num_tokens": 35846704.0, "step": 1260 }, { "entropy": 0.8110213376581669, "epoch": 2.032, "grad_norm": 19.125, "learning_rate": 1.2822222222222222e-05, "loss": 12.9766, "mean_token_accuracy": 0.7942904278635978, "num_tokens": 36128775.0, "step": 1270 }, { "entropy": 0.7852249689400196, "epoch": 2.048, "grad_norm": 18.0, "learning_rate": 1.274814814814815e-05, "loss": 12.5822, "mean_token_accuracy": 0.7999643433839083, "num_tokens": 36419504.0, "step": 1280 }, { "entropy": 0.782286673039198, "epoch": 2.064, "grad_norm": 17.25, "learning_rate": 1.2674074074074075e-05, "loss": 12.6666, "mean_token_accuracy": 0.7982493124902248, "num_tokens": 36706816.0, "step": 1290 }, { "entropy": 0.795855419524014, "epoch": 2.08, "grad_norm": 18.375, "learning_rate": 1.2600000000000001e-05, "loss": 12.6569, "mean_token_accuracy": 0.7973877970129252, "num_tokens": 36988475.0, "step": 1300 }, { "epoch": 2.08, "eval_biology_entropy": 3.2089530782699587, "eval_biology_loss": 3.7030863761901855, "eval_biology_mean_token_accuracy": 0.43272937917709353, "eval_biology_num_tokens": 36988475.0, "eval_biology_runtime": 49.7903, "eval_biology_samples_per_second": 10.042, "eval_biology_steps_per_second": 2.511, "step": 1300 }, { "epoch": 2.08, "eval_chemistry_entropy": 1.8433975267410279, "eval_chemistry_loss": 1.9999408721923828, "eval_chemistry_mean_token_accuracy": 0.6180174436569214, "eval_chemistry_num_tokens": 36988475.0, "eval_chemistry_runtime": 61.7274, "eval_chemistry_samples_per_second": 8.1, "eval_chemistry_steps_per_second": 2.025, "step": 1300 }, { "epoch": 2.08, "eval_math_entropy": 0.8109442203044891, "eval_math_loss": 0.8452479243278503, "eval_math_mean_token_accuracy": 0.7881795573234558, "eval_math_num_tokens": 36988475.0, "eval_math_runtime": 63.3275, "eval_math_samples_per_second": 7.895, "eval_math_steps_per_second": 1.974, "step": 1300 }, { "epoch": 2.08, "eval_physics_entropy": 1.4158136811256408, "eval_physics_loss": 1.5318591594696045, "eval_physics_mean_token_accuracy": 0.6849393043518066, "eval_physics_num_tokens": 36988475.0, "eval_physics_runtime": 72.137, "eval_physics_samples_per_second": 6.931, "eval_physics_steps_per_second": 1.733, "step": 1300 }, { "entropy": 0.8043522318825126, "epoch": 2.096, "grad_norm": 17.375, "learning_rate": 1.2525925925925928e-05, "loss": 12.9984, "mean_token_accuracy": 0.7961476005613803, "num_tokens": 37270131.0, "step": 1310 }, { "entropy": 0.7804812841117382, "epoch": 2.112, "grad_norm": 16.375, "learning_rate": 1.2451851851851853e-05, "loss": 12.5609, "mean_token_accuracy": 0.798238455876708, "num_tokens": 37563537.0, "step": 1320 }, { "entropy": 0.7818844333291054, "epoch": 2.128, "grad_norm": 19.375, "learning_rate": 1.237777777777778e-05, "loss": 12.4773, "mean_token_accuracy": 0.799476521462202, "num_tokens": 37843959.0, "step": 1330 }, { "entropy": 0.7816478908061981, "epoch": 2.144, "grad_norm": 17.625, "learning_rate": 1.2303703703703704e-05, "loss": 12.6277, "mean_token_accuracy": 0.7990053717046977, "num_tokens": 38133092.0, "step": 1340 }, { "entropy": 0.7931677252054214, "epoch": 2.16, "grad_norm": 16.375, "learning_rate": 1.222962962962963e-05, "loss": 12.7116, "mean_token_accuracy": 0.7987668544054032, "num_tokens": 38421229.0, "step": 1350 }, { "entropy": 0.7619917875155806, "epoch": 2.176, "grad_norm": 19.75, "learning_rate": 1.2155555555555555e-05, "loss": 12.2236, "mean_token_accuracy": 0.8035903237760067, "num_tokens": 38708043.0, "step": 1360 }, { "entropy": 0.7887991681694985, "epoch": 2.192, "grad_norm": 16.875, "learning_rate": 1.2081481481481484e-05, "loss": 12.6936, "mean_token_accuracy": 0.796902683749795, "num_tokens": 38996930.0, "step": 1370 }, { "entropy": 0.7967741012573242, "epoch": 2.208, "grad_norm": 30.25, "learning_rate": 1.2007407407407408e-05, "loss": 12.9174, "mean_token_accuracy": 0.7960829515010118, "num_tokens": 39279481.0, "step": 1380 }, { "entropy": 0.7880135742947459, "epoch": 2.224, "grad_norm": 19.75, "learning_rate": 1.1933333333333335e-05, "loss": 12.6788, "mean_token_accuracy": 0.7990153830498457, "num_tokens": 39569030.0, "step": 1390 }, { "entropy": 0.7916025001555681, "epoch": 2.24, "grad_norm": 18.75, "learning_rate": 1.185925925925926e-05, "loss": 12.7022, "mean_token_accuracy": 0.7965882256627083, "num_tokens": 39854873.0, "step": 1400 }, { "epoch": 2.24, "eval_biology_entropy": 3.280108278274536, "eval_biology_loss": 3.7498559951782227, "eval_biology_mean_token_accuracy": 0.43300582957267764, "eval_biology_num_tokens": 39854873.0, "eval_biology_runtime": 49.7856, "eval_biology_samples_per_second": 10.043, "eval_biology_steps_per_second": 2.511, "step": 1400 }, { "epoch": 2.24, "eval_chemistry_entropy": 1.8648150267601014, "eval_chemistry_loss": 1.9987932443618774, "eval_chemistry_mean_token_accuracy": 0.6183059096336365, "eval_chemistry_num_tokens": 39854873.0, "eval_chemistry_runtime": 61.751, "eval_chemistry_samples_per_second": 8.097, "eval_chemistry_steps_per_second": 2.024, "step": 1400 }, { "epoch": 2.24, "eval_math_entropy": 0.8120264797210693, "eval_math_loss": 0.8429214954376221, "eval_math_mean_token_accuracy": 0.7886019821166992, "eval_math_num_tokens": 39854873.0, "eval_math_runtime": 63.3235, "eval_math_samples_per_second": 7.896, "eval_math_steps_per_second": 1.974, "step": 1400 }, { "epoch": 2.24, "eval_physics_entropy": 1.4317502527236938, "eval_physics_loss": 1.5286813974380493, "eval_physics_mean_token_accuracy": 0.6863654246330261, "eval_physics_num_tokens": 39854873.0, "eval_physics_runtime": 72.133, "eval_physics_samples_per_second": 6.932, "eval_physics_steps_per_second": 1.733, "step": 1400 }, { "entropy": 0.7847368892282247, "epoch": 2.2560000000000002, "grad_norm": 17.25, "learning_rate": 1.1785185185185186e-05, "loss": 12.6258, "mean_token_accuracy": 0.7981156852096319, "num_tokens": 40141190.0, "step": 1410 }, { "entropy": 0.8105148211121559, "epoch": 2.2720000000000002, "grad_norm": 22.75, "learning_rate": 1.1711111111111113e-05, "loss": 13.0268, "mean_token_accuracy": 0.7950900815427303, "num_tokens": 40415203.0, "step": 1420 }, { "entropy": 0.7900150781497359, "epoch": 2.288, "grad_norm": 17.5, "learning_rate": 1.1637037037037037e-05, "loss": 12.6047, "mean_token_accuracy": 0.7972030244767666, "num_tokens": 40702393.0, "step": 1430 }, { "entropy": 0.7867585562169552, "epoch": 2.304, "grad_norm": 21.125, "learning_rate": 1.1562962962962964e-05, "loss": 12.6491, "mean_token_accuracy": 0.7997174769639969, "num_tokens": 40982775.0, "step": 1440 }, { "entropy": 0.8002804208546876, "epoch": 2.32, "grad_norm": 20.875, "learning_rate": 1.1488888888888889e-05, "loss": 12.8551, "mean_token_accuracy": 0.7973973777145147, "num_tokens": 41263356.0, "step": 1450 }, { "entropy": 0.7985019231215119, "epoch": 2.336, "grad_norm": 18.125, "learning_rate": 1.1414814814814817e-05, "loss": 12.8916, "mean_token_accuracy": 0.7960489392280579, "num_tokens": 41545235.0, "step": 1460 }, { "entropy": 0.8076209686696529, "epoch": 2.352, "grad_norm": 18.75, "learning_rate": 1.1340740740740742e-05, "loss": 12.9997, "mean_token_accuracy": 0.7936360351741314, "num_tokens": 41833417.0, "step": 1470 }, { "entropy": 0.7910049550235272, "epoch": 2.368, "grad_norm": 19.0, "learning_rate": 1.1266666666666668e-05, "loss": 12.7404, "mean_token_accuracy": 0.7980276115238667, "num_tokens": 42117030.0, "step": 1480 }, { "entropy": 0.811725552752614, "epoch": 2.384, "grad_norm": 22.875, "learning_rate": 1.1192592592592593e-05, "loss": 13.0613, "mean_token_accuracy": 0.793327846005559, "num_tokens": 42410990.0, "step": 1490 }, { "entropy": 0.8014231506735087, "epoch": 2.4, "grad_norm": 20.0, "learning_rate": 1.111851851851852e-05, "loss": 12.8548, "mean_token_accuracy": 0.796426559984684, "num_tokens": 42691890.0, "step": 1500 }, { "epoch": 2.4, "eval_biology_entropy": 3.213496032714844, "eval_biology_loss": 3.7113101482391357, "eval_biology_mean_token_accuracy": 0.43321818923950195, "eval_biology_num_tokens": 42691890.0, "eval_biology_runtime": 49.9484, "eval_biology_samples_per_second": 10.01, "eval_biology_steps_per_second": 2.503, "step": 1500 }, { "epoch": 2.4, "eval_chemistry_entropy": 1.8208427929878235, "eval_chemistry_loss": 1.9899694919586182, "eval_chemistry_mean_token_accuracy": 0.6187239620685577, "eval_chemistry_num_tokens": 42691890.0, "eval_chemistry_runtime": 61.2857, "eval_chemistry_samples_per_second": 8.159, "eval_chemistry_steps_per_second": 2.04, "step": 1500 }, { "epoch": 2.4, "eval_math_entropy": 0.7984081234931946, "eval_math_loss": 0.8405261635780334, "eval_math_mean_token_accuracy": 0.7889668345451355, "eval_math_num_tokens": 42691890.0, "eval_math_runtime": 63.266, "eval_math_samples_per_second": 7.903, "eval_math_steps_per_second": 1.976, "step": 1500 }, { "epoch": 2.4, "eval_physics_entropy": 1.4056278495788574, "eval_physics_loss": 1.5309016704559326, "eval_physics_mean_token_accuracy": 0.6855477833747864, "eval_physics_num_tokens": 42691890.0, "eval_physics_runtime": 72.1465, "eval_physics_samples_per_second": 6.93, "eval_physics_steps_per_second": 1.733, "step": 1500 }, { "entropy": 0.8084384858608246, "epoch": 2.416, "grad_norm": 20.5, "learning_rate": 1.1044444444444444e-05, "loss": 13.0963, "mean_token_accuracy": 0.792670714110136, "num_tokens": 42971588.0, "step": 1510 }, { "entropy": 0.7932598620653153, "epoch": 2.432, "grad_norm": 19.875, "learning_rate": 1.0970370370370371e-05, "loss": 12.668, "mean_token_accuracy": 0.79843137152493, "num_tokens": 43253821.0, "step": 1520 }, { "entropy": 0.7650916231796145, "epoch": 2.448, "grad_norm": 18.5, "learning_rate": 1.0896296296296298e-05, "loss": 12.4535, "mean_token_accuracy": 0.8016320440918208, "num_tokens": 43550902.0, "step": 1530 }, { "entropy": 0.8061025515198708, "epoch": 2.464, "grad_norm": 18.125, "learning_rate": 1.0822222222222222e-05, "loss": 12.8029, "mean_token_accuracy": 0.7956046860665082, "num_tokens": 43844259.0, "step": 1540 }, { "entropy": 0.8425655115395785, "epoch": 2.48, "grad_norm": 31.125, "learning_rate": 1.074814814814815e-05, "loss": 13.5112, "mean_token_accuracy": 0.7877779383212328, "num_tokens": 44115701.0, "step": 1550 }, { "entropy": 0.8077523436397314, "epoch": 2.496, "grad_norm": 17.375, "learning_rate": 1.0674074074074074e-05, "loss": 13.0914, "mean_token_accuracy": 0.7937187146395445, "num_tokens": 44405520.0, "step": 1560 }, { "entropy": 0.7986883038654924, "epoch": 2.512, "grad_norm": 17.0, "learning_rate": 1.0600000000000002e-05, "loss": 12.7733, "mean_token_accuracy": 0.7968008328229189, "num_tokens": 44686477.0, "step": 1570 }, { "entropy": 0.7973098121583462, "epoch": 2.528, "grad_norm": 17.625, "learning_rate": 1.0525925925925927e-05, "loss": 12.8409, "mean_token_accuracy": 0.7960524547845125, "num_tokens": 44969760.0, "step": 1580 }, { "entropy": 0.793004597350955, "epoch": 2.544, "grad_norm": 18.625, "learning_rate": 1.0451851851851853e-05, "loss": 12.7374, "mean_token_accuracy": 0.7978652317076922, "num_tokens": 45255326.0, "step": 1590 }, { "entropy": 0.7932099934667349, "epoch": 2.56, "grad_norm": 17.25, "learning_rate": 1.0377777777777778e-05, "loss": 12.7191, "mean_token_accuracy": 0.7962283626198768, "num_tokens": 45532525.0, "step": 1600 }, { "epoch": 2.56, "eval_biology_entropy": 3.2518484830856322, "eval_biology_loss": 3.700951099395752, "eval_biology_mean_token_accuracy": 0.4338484408855438, "eval_biology_num_tokens": 45532525.0, "eval_biology_runtime": 49.825, "eval_biology_samples_per_second": 10.035, "eval_biology_steps_per_second": 2.509, "step": 1600 }, { "epoch": 2.56, "eval_chemistry_entropy": 1.8449390592575072, "eval_chemistry_loss": 1.9900366067886353, "eval_chemistry_mean_token_accuracy": 0.6188704285621643, "eval_chemistry_num_tokens": 45532525.0, "eval_chemistry_runtime": 61.7822, "eval_chemistry_samples_per_second": 8.093, "eval_chemistry_steps_per_second": 2.023, "step": 1600 }, { "epoch": 2.56, "eval_math_entropy": 0.8004411847591401, "eval_math_loss": 0.8377605080604553, "eval_math_mean_token_accuracy": 0.7894859976768493, "eval_math_num_tokens": 45532525.0, "eval_math_runtime": 63.3799, "eval_math_samples_per_second": 7.889, "eval_math_steps_per_second": 1.972, "step": 1600 }, { "epoch": 2.56, "eval_physics_entropy": 1.4191389164924622, "eval_physics_loss": 1.5280442237854004, "eval_physics_mean_token_accuracy": 0.6858750200271606, "eval_physics_num_tokens": 45532525.0, "eval_physics_runtime": 72.1737, "eval_physics_samples_per_second": 6.928, "eval_physics_steps_per_second": 1.732, "step": 1600 }, { "entropy": 0.7856329085305334, "epoch": 2.576, "grad_norm": 18.25, "learning_rate": 1.0303703703703705e-05, "loss": 12.7499, "mean_token_accuracy": 0.7975605644285679, "num_tokens": 45817478.0, "step": 1610 }, { "entropy": 0.7819217784330249, "epoch": 2.592, "grad_norm": 31.375, "learning_rate": 1.0229629629629631e-05, "loss": 12.5615, "mean_token_accuracy": 0.7989848144352436, "num_tokens": 46109575.0, "step": 1620 }, { "entropy": 0.8088566403836012, "epoch": 2.608, "grad_norm": 21.375, "learning_rate": 1.0155555555555556e-05, "loss": 13.044, "mean_token_accuracy": 0.7929384857416153, "num_tokens": 46391461.0, "step": 1630 }, { "entropy": 0.7695744272321463, "epoch": 2.624, "grad_norm": 18.0, "learning_rate": 1.0081481481481484e-05, "loss": 12.3887, "mean_token_accuracy": 0.8046348694711923, "num_tokens": 46683117.0, "step": 1640 }, { "entropy": 0.804672497138381, "epoch": 2.64, "grad_norm": 18.625, "learning_rate": 1.0007407407407407e-05, "loss": 12.8594, "mean_token_accuracy": 0.7948314756155014, "num_tokens": 46965534.0, "step": 1650 }, { "entropy": 0.7649372734129429, "epoch": 2.656, "grad_norm": 18.125, "learning_rate": 9.933333333333334e-06, "loss": 12.2923, "mean_token_accuracy": 0.8014582961797714, "num_tokens": 47245852.0, "step": 1660 }, { "entropy": 0.8046383403241635, "epoch": 2.672, "grad_norm": 22.0, "learning_rate": 9.85925925925926e-06, "loss": 12.9533, "mean_token_accuracy": 0.7926652491092682, "num_tokens": 47524916.0, "step": 1670 }, { "entropy": 0.7880237903445959, "epoch": 2.6879999999999997, "grad_norm": 18.75, "learning_rate": 9.785185185185187e-06, "loss": 12.7005, "mean_token_accuracy": 0.7974669005721807, "num_tokens": 47807131.0, "step": 1680 }, { "entropy": 0.8009384609758854, "epoch": 2.7039999999999997, "grad_norm": 21.125, "learning_rate": 9.711111111111111e-06, "loss": 12.8682, "mean_token_accuracy": 0.7949213892221451, "num_tokens": 48099654.0, "step": 1690 }, { "entropy": 0.7890177585184575, "epoch": 2.7199999999999998, "grad_norm": 19.125, "learning_rate": 9.637037037037038e-06, "loss": 12.702, "mean_token_accuracy": 0.7969291642308235, "num_tokens": 48375019.0, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_biology_entropy": 3.2781650161743165, "eval_biology_loss": 3.7301559448242188, "eval_biology_mean_token_accuracy": 0.43347722268104555, "eval_biology_num_tokens": 48375019.0, "eval_biology_runtime": 49.7998, "eval_biology_samples_per_second": 10.04, "eval_biology_steps_per_second": 2.51, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_chemistry_entropy": 1.8482142910957335, "eval_chemistry_loss": 1.9887773990631104, "eval_chemistry_mean_token_accuracy": 0.6194968128204346, "eval_chemistry_num_tokens": 48375019.0, "eval_chemistry_runtime": 62.2368, "eval_chemistry_samples_per_second": 8.034, "eval_chemistry_steps_per_second": 2.008, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_math_entropy": 0.809392058134079, "eval_math_loss": 0.8359456062316895, "eval_math_mean_token_accuracy": 0.7895615015029908, "eval_math_num_tokens": 48375019.0, "eval_math_runtime": 63.3325, "eval_math_samples_per_second": 7.895, "eval_math_steps_per_second": 1.974, "step": 1700 }, { "epoch": 2.7199999999999998, "eval_physics_entropy": 1.421650909423828, "eval_physics_loss": 1.5206794738769531, "eval_physics_mean_token_accuracy": 0.6867666440010071, "eval_physics_num_tokens": 48375019.0, "eval_physics_runtime": 72.209, "eval_physics_samples_per_second": 6.924, "eval_physics_steps_per_second": 1.731, "step": 1700 }, { "entropy": 0.7974690224975347, "epoch": 2.7359999999999998, "grad_norm": 19.75, "learning_rate": 9.562962962962965e-06, "loss": 12.7825, "mean_token_accuracy": 0.7953499432653188, "num_tokens": 48659284.0, "step": 1710 }, { "entropy": 0.7819192102178931, "epoch": 2.752, "grad_norm": 17.75, "learning_rate": 9.48888888888889e-06, "loss": 12.6506, "mean_token_accuracy": 0.7982836693525315, "num_tokens": 48943804.0, "step": 1720 }, { "entropy": 0.7997678998857737, "epoch": 2.768, "grad_norm": 18.625, "learning_rate": 9.414814814814816e-06, "loss": 12.7813, "mean_token_accuracy": 0.7972954675555229, "num_tokens": 49230939.0, "step": 1730 }, { "entropy": 0.8081969961524009, "epoch": 2.784, "grad_norm": 19.75, "learning_rate": 9.34074074074074e-06, "loss": 13.0832, "mean_token_accuracy": 0.7918697893619537, "num_tokens": 49504425.0, "step": 1740 }, { "entropy": 0.7892359040677548, "epoch": 2.8, "grad_norm": 15.375, "learning_rate": 9.266666666666667e-06, "loss": 12.7791, "mean_token_accuracy": 0.798084444925189, "num_tokens": 49782661.0, "step": 1750 }, { "entropy": 0.7996826503425837, "epoch": 2.816, "grad_norm": 22.75, "learning_rate": 9.192592592592594e-06, "loss": 12.8429, "mean_token_accuracy": 0.7955689582973718, "num_tokens": 50073632.0, "step": 1760 }, { "entropy": 0.7853869337588548, "epoch": 2.832, "grad_norm": 19.625, "learning_rate": 9.118518518518518e-06, "loss": 12.607, "mean_token_accuracy": 0.7983032051473856, "num_tokens": 50356964.0, "step": 1770 }, { "entropy": 0.8069319518283009, "epoch": 2.848, "grad_norm": 16.25, "learning_rate": 9.044444444444445e-06, "loss": 13.0585, "mean_token_accuracy": 0.7950124684721231, "num_tokens": 50644535.0, "step": 1780 }, { "entropy": 0.8049573350697756, "epoch": 2.864, "grad_norm": 21.25, "learning_rate": 8.970370370370372e-06, "loss": 12.8888, "mean_token_accuracy": 0.7958490017801523, "num_tokens": 50925653.0, "step": 1790 }, { "entropy": 0.8024192716926336, "epoch": 2.88, "grad_norm": 19.375, "learning_rate": 8.896296296296298e-06, "loss": 12.7601, "mean_token_accuracy": 0.7968584690243006, "num_tokens": 51204374.0, "step": 1800 }, { "epoch": 2.88, "eval_biology_entropy": 3.233502674102783, "eval_biology_loss": 3.723733425140381, "eval_biology_mean_token_accuracy": 0.43377703070640566, "eval_biology_num_tokens": 51204374.0, "eval_biology_runtime": 50.4406, "eval_biology_samples_per_second": 9.913, "eval_biology_steps_per_second": 2.478, "step": 1800 }, { "epoch": 2.88, "eval_chemistry_entropy": 1.8262592434883118, "eval_chemistry_loss": 1.9915226697921753, "eval_chemistry_mean_token_accuracy": 0.6196319291591644, "eval_chemistry_num_tokens": 51204374.0, "eval_chemistry_runtime": 62.4404, "eval_chemistry_samples_per_second": 8.008, "eval_chemistry_steps_per_second": 2.002, "step": 1800 }, { "epoch": 2.88, "eval_math_entropy": 0.801637535572052, "eval_math_loss": 0.8344833850860596, "eval_math_mean_token_accuracy": 0.7902569670677185, "eval_math_num_tokens": 51204374.0, "eval_math_runtime": 63.6911, "eval_math_samples_per_second": 7.85, "eval_math_steps_per_second": 1.963, "step": 1800 }, { "epoch": 2.88, "eval_physics_entropy": 1.4114351081848144, "eval_physics_loss": 1.5217763185501099, "eval_physics_mean_token_accuracy": 0.6866876316070557, "eval_physics_num_tokens": 51204374.0, "eval_physics_runtime": 72.2495, "eval_physics_samples_per_second": 6.92, "eval_physics_steps_per_second": 1.73, "step": 1800 }, { "entropy": 0.7727437706664204, "epoch": 2.896, "grad_norm": 20.5, "learning_rate": 8.822222222222223e-06, "loss": 12.5378, "mean_token_accuracy": 0.7991547420620918, "num_tokens": 51483944.0, "step": 1810 }, { "entropy": 0.7949327051639556, "epoch": 2.912, "grad_norm": 16.875, "learning_rate": 8.74814814814815e-06, "loss": 12.8129, "mean_token_accuracy": 0.7974283535033464, "num_tokens": 51765755.0, "step": 1820 }, { "entropy": 0.7891913030296565, "epoch": 2.928, "grad_norm": 19.875, "learning_rate": 8.674074074074074e-06, "loss": 12.7001, "mean_token_accuracy": 0.7998170506209135, "num_tokens": 52056379.0, "step": 1830 }, { "entropy": 0.7793466189876199, "epoch": 2.944, "grad_norm": 16.75, "learning_rate": 8.6e-06, "loss": 12.4224, "mean_token_accuracy": 0.7998687323182821, "num_tokens": 52346232.0, "step": 1840 }, { "entropy": 0.7924864767119288, "epoch": 2.96, "grad_norm": 17.0, "learning_rate": 8.525925925925927e-06, "loss": 12.7675, "mean_token_accuracy": 0.7973909642547369, "num_tokens": 52633789.0, "step": 1850 }, { "entropy": 0.7772072922438383, "epoch": 2.976, "grad_norm": 16.625, "learning_rate": 8.451851851851852e-06, "loss": 12.5696, "mean_token_accuracy": 0.798350129276514, "num_tokens": 52911755.0, "step": 1860 }, { "entropy": 0.785734941251576, "epoch": 2.992, "grad_norm": 18.75, "learning_rate": 8.377777777777779e-06, "loss": 12.5958, "mean_token_accuracy": 0.798584484308958, "num_tokens": 53198176.0, "step": 1870 }, { "entropy": 0.798486184887588, "epoch": 3.008, "grad_norm": 17.625, "learning_rate": 8.303703703703705e-06, "loss": 12.8286, "mean_token_accuracy": 0.7952684313058853, "num_tokens": 53481893.0, "step": 1880 }, { "entropy": 0.798673938214779, "epoch": 3.024, "grad_norm": 17.375, "learning_rate": 8.229629629629632e-06, "loss": 12.7458, "mean_token_accuracy": 0.7966153182089328, "num_tokens": 53771717.0, "step": 1890 }, { "entropy": 0.7936699235811829, "epoch": 3.04, "grad_norm": 18.125, "learning_rate": 8.155555555555556e-06, "loss": 12.7437, "mean_token_accuracy": 0.7971193008124828, "num_tokens": 54058045.0, "step": 1900 }, { "epoch": 3.04, "eval_biology_entropy": 3.2133601503372193, "eval_biology_loss": 3.722015142440796, "eval_biology_mean_token_accuracy": 0.43406607151031495, "eval_biology_num_tokens": 54058045.0, "eval_biology_runtime": 50.1009, "eval_biology_samples_per_second": 9.98, "eval_biology_steps_per_second": 2.495, "step": 1900 }, { "epoch": 3.04, "eval_chemistry_entropy": 1.8122160468101502, "eval_chemistry_loss": 1.9867562055587769, "eval_chemistry_mean_token_accuracy": 0.6197145907878876, "eval_chemistry_num_tokens": 54058045.0, "eval_chemistry_runtime": 62.5867, "eval_chemistry_samples_per_second": 7.989, "eval_chemistry_steps_per_second": 1.997, "step": 1900 }, { "epoch": 3.04, "eval_math_entropy": 0.7948782880306244, "eval_math_loss": 0.8342062830924988, "eval_math_mean_token_accuracy": 0.7903887577056885, "eval_math_num_tokens": 54058045.0, "eval_math_runtime": 64.4647, "eval_math_samples_per_second": 7.756, "eval_math_steps_per_second": 1.939, "step": 1900 }, { "epoch": 3.04, "eval_physics_entropy": 1.4016832985877992, "eval_physics_loss": 1.5218627452850342, "eval_physics_mean_token_accuracy": 0.6867203059196472, "eval_physics_num_tokens": 54058045.0, "eval_physics_runtime": 72.1233, "eval_physics_samples_per_second": 6.933, "eval_physics_steps_per_second": 1.733, "step": 1900 }, { "entropy": 0.791794583760202, "epoch": 3.056, "grad_norm": 16.0, "learning_rate": 8.081481481481483e-06, "loss": 12.767, "mean_token_accuracy": 0.7971944514662027, "num_tokens": 54334332.0, "step": 1910 }, { "entropy": 0.7716375244781375, "epoch": 3.072, "grad_norm": 17.125, "learning_rate": 8.007407407407408e-06, "loss": 12.435, "mean_token_accuracy": 0.8003965705633164, "num_tokens": 54624543.0, "step": 1920 }, { "entropy": 0.7641144976019859, "epoch": 3.088, "grad_norm": 17.875, "learning_rate": 7.933333333333334e-06, "loss": 12.2604, "mean_token_accuracy": 0.8041298471391201, "num_tokens": 54907550.0, "step": 1930 }, { "entropy": 0.7768820196390152, "epoch": 3.104, "grad_norm": 15.3125, "learning_rate": 7.859259259259259e-06, "loss": 12.4156, "mean_token_accuracy": 0.8015246070921421, "num_tokens": 55190959.0, "step": 1940 }, { "entropy": 0.767643989995122, "epoch": 3.12, "grad_norm": 19.0, "learning_rate": 7.785185185185185e-06, "loss": 12.2748, "mean_token_accuracy": 0.8037003818899393, "num_tokens": 55481635.0, "step": 1950 }, { "entropy": 0.7627615902572871, "epoch": 3.136, "grad_norm": 17.5, "learning_rate": 7.711111111111112e-06, "loss": 12.3626, "mean_token_accuracy": 0.8015805222094059, "num_tokens": 55769010.0, "step": 1960 }, { "entropy": 0.7825732477009296, "epoch": 3.152, "grad_norm": 18.0, "learning_rate": 7.637037037037037e-06, "loss": 12.5732, "mean_token_accuracy": 0.7999679546803236, "num_tokens": 56053160.0, "step": 1970 }, { "entropy": 0.7954605100676417, "epoch": 3.168, "grad_norm": 16.875, "learning_rate": 7.562962962962963e-06, "loss": 12.7728, "mean_token_accuracy": 0.7977348294109106, "num_tokens": 56337066.0, "step": 1980 }, { "entropy": 0.7740211399272084, "epoch": 3.184, "grad_norm": 19.625, "learning_rate": 7.48888888888889e-06, "loss": 12.3752, "mean_token_accuracy": 0.799398972466588, "num_tokens": 56618899.0, "step": 1990 }, { "entropy": 0.7562787082046271, "epoch": 3.2, "grad_norm": 14.875, "learning_rate": 7.4148148148148155e-06, "loss": 12.1475, "mean_token_accuracy": 0.8042113542556762, "num_tokens": 56910071.0, "step": 2000 }, { "epoch": 3.2, "eval_biology_entropy": 3.208591146469116, "eval_biology_loss": 3.7341694831848145, "eval_biology_mean_token_accuracy": 0.4337711172103882, "eval_biology_num_tokens": 56910071.0, "eval_biology_runtime": 49.9923, "eval_biology_samples_per_second": 10.002, "eval_biology_steps_per_second": 2.5, "step": 2000 }, { "epoch": 3.2, "eval_chemistry_entropy": 1.8099074120521546, "eval_chemistry_loss": 1.998437762260437, "eval_chemistry_mean_token_accuracy": 0.6191729652881622, "eval_chemistry_num_tokens": 56910071.0, "eval_chemistry_runtime": 61.2788, "eval_chemistry_samples_per_second": 8.159, "eval_chemistry_steps_per_second": 2.04, "step": 2000 }, { "epoch": 3.2, "eval_math_entropy": 0.7919997727870941, "eval_math_loss": 0.8333685398101807, "eval_math_mean_token_accuracy": 0.7902761344909668, "eval_math_num_tokens": 56910071.0, "eval_math_runtime": 63.3474, "eval_math_samples_per_second": 7.893, "eval_math_steps_per_second": 1.973, "step": 2000 }, { "epoch": 3.2, "eval_physics_entropy": 1.3938024516105652, "eval_physics_loss": 1.5245832204818726, "eval_physics_mean_token_accuracy": 0.6867119417190551, "eval_physics_num_tokens": 56910071.0, "eval_physics_runtime": 72.2962, "eval_physics_samples_per_second": 6.916, "eval_physics_steps_per_second": 1.729, "step": 2000 }, { "entropy": 0.7968308020383119, "epoch": 3.216, "grad_norm": 19.75, "learning_rate": 7.340740740740742e-06, "loss": 12.919, "mean_token_accuracy": 0.7950544532388448, "num_tokens": 57190265.0, "step": 2010 }, { "entropy": 0.7894575387239456, "epoch": 3.232, "grad_norm": 19.375, "learning_rate": 7.266666666666668e-06, "loss": 12.7167, "mean_token_accuracy": 0.7984283819794655, "num_tokens": 57471355.0, "step": 2020 }, { "entropy": 0.7716388316825032, "epoch": 3.248, "grad_norm": 23.125, "learning_rate": 7.192592592592593e-06, "loss": 12.2718, "mean_token_accuracy": 0.8023973722010851, "num_tokens": 57749030.0, "step": 2030 }, { "entropy": 0.7682912373915315, "epoch": 3.2640000000000002, "grad_norm": 17.875, "learning_rate": 7.118518518518519e-06, "loss": 12.3142, "mean_token_accuracy": 0.802096302434802, "num_tokens": 58027567.0, "step": 2040 }, { "entropy": 0.7848245477303862, "epoch": 3.2800000000000002, "grad_norm": 15.5625, "learning_rate": 7.044444444444445e-06, "loss": 12.7192, "mean_token_accuracy": 0.7964802388101816, "num_tokens": 58304346.0, "step": 2050 }, { "entropy": 0.8068621216341854, "epoch": 3.296, "grad_norm": 17.125, "learning_rate": 6.97037037037037e-06, "loss": 12.9016, "mean_token_accuracy": 0.7960762921720743, "num_tokens": 58579831.0, "step": 2060 }, { "entropy": 0.7883643863722682, "epoch": 3.312, "grad_norm": 18.75, "learning_rate": 6.896296296296297e-06, "loss": 12.6797, "mean_token_accuracy": 0.7974914859980344, "num_tokens": 58858426.0, "step": 2070 }, { "entropy": 0.7716894892975688, "epoch": 3.328, "grad_norm": 18.375, "learning_rate": 6.8222222222222225e-06, "loss": 12.3509, "mean_token_accuracy": 0.8006251383572817, "num_tokens": 59142949.0, "step": 2080 }, { "entropy": 0.774476227350533, "epoch": 3.344, "grad_norm": 20.75, "learning_rate": 6.748148148148149e-06, "loss": 12.4547, "mean_token_accuracy": 0.8013994835317135, "num_tokens": 59424654.0, "step": 2090 }, { "entropy": 0.8170315666124225, "epoch": 3.36, "grad_norm": 20.75, "learning_rate": 6.674074074074075e-06, "loss": 13.2774, "mean_token_accuracy": 0.7903341125696898, "num_tokens": 59704876.0, "step": 2100 }, { "epoch": 3.36, "eval_biology_entropy": 3.217072533607483, "eval_biology_loss": 3.728389024734497, "eval_biology_mean_token_accuracy": 0.4338984694480896, "eval_biology_num_tokens": 59704876.0, "eval_biology_runtime": 49.8299, "eval_biology_samples_per_second": 10.034, "eval_biology_steps_per_second": 2.509, "step": 2100 }, { "epoch": 3.36, "eval_chemistry_entropy": 1.8177995371818543, "eval_chemistry_loss": 1.9904567003250122, "eval_chemistry_mean_token_accuracy": 0.6200088469982147, "eval_chemistry_num_tokens": 59704876.0, "eval_chemistry_runtime": 61.8572, "eval_chemistry_samples_per_second": 8.083, "eval_chemistry_steps_per_second": 2.021, "step": 2100 }, { "epoch": 3.36, "eval_math_entropy": 0.7981109554767609, "eval_math_loss": 0.8328840136528015, "eval_math_mean_token_accuracy": 0.7904439249038696, "eval_math_num_tokens": 59704876.0, "eval_math_runtime": 63.3506, "eval_math_samples_per_second": 7.893, "eval_math_steps_per_second": 1.973, "step": 2100 }, { "epoch": 3.36, "eval_physics_entropy": 1.40065105676651, "eval_physics_loss": 1.5222822427749634, "eval_physics_mean_token_accuracy": 0.6869667596817016, "eval_physics_num_tokens": 59704876.0, "eval_physics_runtime": 72.4951, "eval_physics_samples_per_second": 6.897, "eval_physics_steps_per_second": 1.724, "step": 2100 }, { "entropy": 0.8069535192102194, "epoch": 3.376, "grad_norm": 18.25, "learning_rate": 6.600000000000001e-06, "loss": 12.9428, "mean_token_accuracy": 0.7946984317153692, "num_tokens": 59988211.0, "step": 2110 }, { "entropy": 0.7739303324371576, "epoch": 3.392, "grad_norm": 16.875, "learning_rate": 6.525925925925927e-06, "loss": 12.3205, "mean_token_accuracy": 0.8029689799994231, "num_tokens": 60276606.0, "step": 2120 }, { "entropy": 0.7634080411866307, "epoch": 3.408, "grad_norm": 15.8125, "learning_rate": 6.4518518518518525e-06, "loss": 12.1937, "mean_token_accuracy": 0.8028978146612644, "num_tokens": 60560228.0, "step": 2130 }, { "entropy": 0.7720513068139553, "epoch": 3.424, "grad_norm": 18.75, "learning_rate": 6.377777777777778e-06, "loss": 12.4972, "mean_token_accuracy": 0.7988769717514514, "num_tokens": 60845893.0, "step": 2140 }, { "entropy": 0.7658586408942938, "epoch": 3.44, "grad_norm": 17.875, "learning_rate": 6.303703703703704e-06, "loss": 12.3182, "mean_token_accuracy": 0.8030787121504546, "num_tokens": 61126410.0, "step": 2150 }, { "entropy": 0.7944835271686316, "epoch": 3.456, "grad_norm": 18.625, "learning_rate": 6.2296296296296295e-06, "loss": 12.7545, "mean_token_accuracy": 0.7976623672991991, "num_tokens": 61406501.0, "step": 2160 }, { "entropy": 0.7987036965787411, "epoch": 3.472, "grad_norm": 21.625, "learning_rate": 6.155555555555556e-06, "loss": 12.9686, "mean_token_accuracy": 0.7942549601197243, "num_tokens": 61693412.0, "step": 2170 }, { "entropy": 0.7782032687216998, "epoch": 3.488, "grad_norm": 17.75, "learning_rate": 6.081481481481482e-06, "loss": 12.3813, "mean_token_accuracy": 0.8012834239751101, "num_tokens": 61977303.0, "step": 2180 }, { "entropy": 0.7691146014258265, "epoch": 3.504, "grad_norm": 19.625, "learning_rate": 6.007407407407407e-06, "loss": 12.4591, "mean_token_accuracy": 0.8003957267850638, "num_tokens": 62265256.0, "step": 2190 }, { "entropy": 0.7718311287462711, "epoch": 3.52, "grad_norm": 20.625, "learning_rate": 5.933333333333335e-06, "loss": 12.3847, "mean_token_accuracy": 0.8017024364322424, "num_tokens": 62554970.0, "step": 2200 }, { "epoch": 3.52, "eval_biology_entropy": 3.2120394563674926, "eval_biology_loss": 3.7082746028900146, "eval_biology_mean_token_accuracy": 0.4342635951042175, "eval_biology_num_tokens": 62554970.0, "eval_biology_runtime": 50.8721, "eval_biology_samples_per_second": 9.829, "eval_biology_steps_per_second": 2.457, "step": 2200 }, { "epoch": 3.52, "eval_chemistry_entropy": 1.8098075938224794, "eval_chemistry_loss": 1.9864753484725952, "eval_chemistry_mean_token_accuracy": 0.6200196840763093, "eval_chemistry_num_tokens": 62554970.0, "eval_chemistry_runtime": 61.9307, "eval_chemistry_samples_per_second": 8.074, "eval_chemistry_steps_per_second": 2.018, "step": 2200 }, { "epoch": 3.52, "eval_math_entropy": 0.7994570763111114, "eval_math_loss": 0.8322504758834839, "eval_math_mean_token_accuracy": 0.7903432693481446, "eval_math_num_tokens": 62554970.0, "eval_math_runtime": 64.0277, "eval_math_samples_per_second": 7.809, "eval_math_steps_per_second": 1.952, "step": 2200 }, { "epoch": 3.52, "eval_physics_entropy": 1.3982198128700256, "eval_physics_loss": 1.5216139554977417, "eval_physics_mean_token_accuracy": 0.6869434370994568, "eval_physics_num_tokens": 62554970.0, "eval_physics_runtime": 72.1793, "eval_physics_samples_per_second": 6.927, "eval_physics_steps_per_second": 1.732, "step": 2200 }, { "entropy": 0.7885877426713706, "epoch": 3.536, "grad_norm": 18.5, "learning_rate": 5.85925925925926e-06, "loss": 12.72, "mean_token_accuracy": 0.7979052390903234, "num_tokens": 62837055.0, "step": 2210 }, { "entropy": 0.7594215430319309, "epoch": 3.552, "grad_norm": 17.25, "learning_rate": 5.785185185185186e-06, "loss": 12.2173, "mean_token_accuracy": 0.8033355198800564, "num_tokens": 63129767.0, "step": 2220 }, { "entropy": 0.7631898380815982, "epoch": 3.568, "grad_norm": 20.25, "learning_rate": 5.711111111111112e-06, "loss": 12.1379, "mean_token_accuracy": 0.8046595975756645, "num_tokens": 63424239.0, "step": 2230 }, { "entropy": 0.7886038830503821, "epoch": 3.584, "grad_norm": 21.875, "learning_rate": 5.637037037037037e-06, "loss": 12.67, "mean_token_accuracy": 0.7991335902363061, "num_tokens": 63704243.0, "step": 2240 }, { "entropy": 0.7425258183851838, "epoch": 3.6, "grad_norm": 17.875, "learning_rate": 5.562962962962963e-06, "loss": 11.9745, "mean_token_accuracy": 0.8067378461360931, "num_tokens": 63994604.0, "step": 2250 }, { "entropy": 0.7823067845776677, "epoch": 3.616, "grad_norm": 21.5, "learning_rate": 5.4888888888888895e-06, "loss": 12.6425, "mean_token_accuracy": 0.7972631234675646, "num_tokens": 64280032.0, "step": 2260 }, { "entropy": 0.7663131864741445, "epoch": 3.632, "grad_norm": 18.125, "learning_rate": 5.414814814814815e-06, "loss": 12.3641, "mean_token_accuracy": 0.802097548916936, "num_tokens": 64560379.0, "step": 2270 }, { "entropy": 0.7514292212203145, "epoch": 3.648, "grad_norm": 20.25, "learning_rate": 5.340740740740741e-06, "loss": 12.104, "mean_token_accuracy": 0.8061405006796122, "num_tokens": 64843196.0, "step": 2280 }, { "entropy": 0.8035670185461641, "epoch": 3.664, "grad_norm": 18.75, "learning_rate": 5.2666666666666665e-06, "loss": 12.9498, "mean_token_accuracy": 0.7948029205203057, "num_tokens": 65126428.0, "step": 2290 }, { "entropy": 0.7716160112991929, "epoch": 3.68, "grad_norm": 19.75, "learning_rate": 5.192592592592594e-06, "loss": 12.3344, "mean_token_accuracy": 0.8020378608256579, "num_tokens": 65411914.0, "step": 2300 }, { "epoch": 3.68, "eval_biology_entropy": 3.215197211265564, "eval_biology_loss": 3.7224931716918945, "eval_biology_mean_token_accuracy": 0.4343402876853943, "eval_biology_num_tokens": 65411914.0, "eval_biology_runtime": 49.8096, "eval_biology_samples_per_second": 10.038, "eval_biology_steps_per_second": 2.51, "step": 2300 }, { "epoch": 3.68, "eval_chemistry_entropy": 1.8129710841178894, "eval_chemistry_loss": 1.9824562072753906, "eval_chemistry_mean_token_accuracy": 0.6203249397277832, "eval_chemistry_num_tokens": 65411914.0, "eval_chemistry_runtime": 62.1495, "eval_chemistry_samples_per_second": 8.045, "eval_chemistry_steps_per_second": 2.011, "step": 2300 }, { "epoch": 3.68, "eval_math_entropy": 0.7980078947544098, "eval_math_loss": 0.8318822979927063, "eval_math_mean_token_accuracy": 0.7904034781455994, "eval_math_num_tokens": 65411914.0, "eval_math_runtime": 63.7772, "eval_math_samples_per_second": 7.84, "eval_math_steps_per_second": 1.96, "step": 2300 }, { "epoch": 3.68, "eval_physics_entropy": 1.4016683073043823, "eval_physics_loss": 1.5173635482788086, "eval_physics_mean_token_accuracy": 0.6873576474189759, "eval_physics_num_tokens": 65411914.0, "eval_physics_runtime": 72.1932, "eval_physics_samples_per_second": 6.926, "eval_physics_steps_per_second": 1.731, "step": 2300 }, { "entropy": 0.7915378937497735, "epoch": 3.6959999999999997, "grad_norm": 30.25, "learning_rate": 5.1185185185185195e-06, "loss": 12.6962, "mean_token_accuracy": 0.7970439530909061, "num_tokens": 65696757.0, "step": 2310 }, { "entropy": 0.7779884770512581, "epoch": 3.7119999999999997, "grad_norm": 19.75, "learning_rate": 5.044444444444445e-06, "loss": 12.5336, "mean_token_accuracy": 0.7997329588979483, "num_tokens": 65975610.0, "step": 2320 }, { "entropy": 0.7705695915967226, "epoch": 3.7279999999999998, "grad_norm": 20.75, "learning_rate": 4.970370370370371e-06, "loss": 12.2856, "mean_token_accuracy": 0.8022748168557883, "num_tokens": 66257358.0, "step": 2330 }, { "entropy": 0.777613940089941, "epoch": 3.7439999999999998, "grad_norm": 17.625, "learning_rate": 4.8962962962962965e-06, "loss": 12.4253, "mean_token_accuracy": 0.7997446611523629, "num_tokens": 66543693.0, "step": 2340 }, { "entropy": 0.7779997067525983, "epoch": 3.76, "grad_norm": 16.75, "learning_rate": 4.822222222222222e-06, "loss": 12.615, "mean_token_accuracy": 0.7987085115164518, "num_tokens": 66825970.0, "step": 2350 }, { "entropy": 0.8043872950598597, "epoch": 3.776, "grad_norm": 18.625, "learning_rate": 4.748148148148149e-06, "loss": 13.0847, "mean_token_accuracy": 0.7925404973328114, "num_tokens": 67115478.0, "step": 2360 }, { "entropy": 0.7561512844637036, "epoch": 3.792, "grad_norm": 18.0, "learning_rate": 4.674074074074074e-06, "loss": 12.0553, "mean_token_accuracy": 0.8056211285293102, "num_tokens": 67405183.0, "step": 2370 }, { "entropy": 0.7998656073585153, "epoch": 3.808, "grad_norm": 20.125, "learning_rate": 4.600000000000001e-06, "loss": 12.8424, "mean_token_accuracy": 0.7964729398488999, "num_tokens": 67688611.0, "step": 2380 }, { "entropy": 0.7603978240862489, "epoch": 3.824, "grad_norm": 17.625, "learning_rate": 4.5259259259259265e-06, "loss": 12.3006, "mean_token_accuracy": 0.8031455595046282, "num_tokens": 67974438.0, "step": 2390 }, { "entropy": 0.7869010392576457, "epoch": 3.84, "grad_norm": 19.625, "learning_rate": 4.451851851851852e-06, "loss": 12.5672, "mean_token_accuracy": 0.7984500721096992, "num_tokens": 68258661.0, "step": 2400 }, { "epoch": 3.84, "eval_biology_entropy": 3.210782961845398, "eval_biology_loss": 3.720635414123535, "eval_biology_mean_token_accuracy": 0.4344561665058136, "eval_biology_num_tokens": 68258661.0, "eval_biology_runtime": 49.9557, "eval_biology_samples_per_second": 10.009, "eval_biology_steps_per_second": 2.502, "step": 2400 }, { "epoch": 3.84, "eval_chemistry_entropy": 1.8049536199569702, "eval_chemistry_loss": 1.9834426641464233, "eval_chemistry_mean_token_accuracy": 0.6201271147727966, "eval_chemistry_num_tokens": 68258661.0, "eval_chemistry_runtime": 61.7558, "eval_chemistry_samples_per_second": 8.096, "eval_chemistry_steps_per_second": 2.024, "step": 2400 }, { "epoch": 3.84, "eval_math_entropy": 0.7937772991657257, "eval_math_loss": 0.831400454044342, "eval_math_mean_token_accuracy": 0.7906554841995239, "eval_math_num_tokens": 68258661.0, "eval_math_runtime": 63.792, "eval_math_samples_per_second": 7.838, "eval_math_steps_per_second": 1.959, "step": 2400 }, { "epoch": 3.84, "eval_physics_entropy": 1.3921758160591124, "eval_physics_loss": 1.5169947147369385, "eval_physics_mean_token_accuracy": 0.6875439782142639, "eval_physics_num_tokens": 68258661.0, "eval_physics_runtime": 72.1187, "eval_physics_samples_per_second": 6.933, "eval_physics_steps_per_second": 1.733, "step": 2400 }, { "entropy": 0.7666634568944574, "epoch": 3.856, "grad_norm": 17.125, "learning_rate": 4.377777777777778e-06, "loss": 12.2564, "mean_token_accuracy": 0.8032576251775027, "num_tokens": 68541830.0, "step": 2410 }, { "entropy": 0.7806821163743735, "epoch": 3.872, "grad_norm": 17.5, "learning_rate": 4.3037037037037035e-06, "loss": 12.5883, "mean_token_accuracy": 0.7983071640133858, "num_tokens": 68832764.0, "step": 2420 }, { "entropy": 0.7824332881718874, "epoch": 3.888, "grad_norm": 19.0, "learning_rate": 4.22962962962963e-06, "loss": 12.6199, "mean_token_accuracy": 0.7984338030219078, "num_tokens": 69117158.0, "step": 2430 }, { "entropy": 0.8022292889654636, "epoch": 3.904, "grad_norm": 18.5, "learning_rate": 4.155555555555556e-06, "loss": 12.9376, "mean_token_accuracy": 0.795144984126091, "num_tokens": 69406954.0, "step": 2440 }, { "entropy": 0.7660128910094499, "epoch": 3.92, "grad_norm": 19.875, "learning_rate": 4.081481481481482e-06, "loss": 12.3054, "mean_token_accuracy": 0.8025309626013041, "num_tokens": 69682187.0, "step": 2450 }, { "entropy": 0.7793645923957229, "epoch": 3.936, "grad_norm": 20.0, "learning_rate": 4.007407407407408e-06, "loss": 12.5246, "mean_token_accuracy": 0.7993701048195362, "num_tokens": 69977882.0, "step": 2460 }, { "entropy": 0.7782584555447102, "epoch": 3.952, "grad_norm": 16.25, "learning_rate": 3.9333333333333335e-06, "loss": 12.5243, "mean_token_accuracy": 0.7999023869633675, "num_tokens": 70259269.0, "step": 2470 }, { "entropy": 0.7844892725348472, "epoch": 3.968, "grad_norm": 17.25, "learning_rate": 3.85925925925926e-06, "loss": 12.6869, "mean_token_accuracy": 0.7965930238366127, "num_tokens": 70542461.0, "step": 2480 }, { "entropy": 0.7715097475796938, "epoch": 3.984, "grad_norm": 21.0, "learning_rate": 3.7851851851851857e-06, "loss": 12.4222, "mean_token_accuracy": 0.8015750881284476, "num_tokens": 70842100.0, "step": 2490 }, { "entropy": 0.7797549622133374, "epoch": 4.0, "grad_norm": 19.0, "learning_rate": 3.7111111111111113e-06, "loss": 12.5159, "mean_token_accuracy": 0.7990538150072097, "num_tokens": 71121728.0, "step": 2500 }, { "epoch": 4.0, "eval_biology_entropy": 3.217764102935791, "eval_biology_loss": 3.7286667823791504, "eval_biology_mean_token_accuracy": 0.4341450183391571, "eval_biology_num_tokens": 71121728.0, "eval_biology_runtime": 49.8074, "eval_biology_samples_per_second": 10.039, "eval_biology_steps_per_second": 2.51, "step": 2500 }, { "epoch": 4.0, "eval_chemistry_entropy": 1.8093777270317077, "eval_chemistry_loss": 1.9849001169204712, "eval_chemistry_mean_token_accuracy": 0.6203813493251801, "eval_chemistry_num_tokens": 71121728.0, "eval_chemistry_runtime": 61.2108, "eval_chemistry_samples_per_second": 8.168, "eval_chemistry_steps_per_second": 2.042, "step": 2500 }, { "epoch": 4.0, "eval_math_entropy": 0.7972562448978424, "eval_math_loss": 0.8311391472816467, "eval_math_mean_token_accuracy": 0.7904811363220214, "eval_math_num_tokens": 71121728.0, "eval_math_runtime": 63.1961, "eval_math_samples_per_second": 7.912, "eval_math_steps_per_second": 1.978, "step": 2500 }, { "epoch": 4.0, "eval_physics_entropy": 1.397938307762146, "eval_physics_loss": 1.5150039196014404, "eval_physics_mean_token_accuracy": 0.6876113767623901, "eval_physics_num_tokens": 71121728.0, "eval_physics_runtime": 72.1146, "eval_physics_samples_per_second": 6.933, "eval_physics_steps_per_second": 1.733, "step": 2500 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.238099893744535e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }