{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 100, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.0476444251835346, "epoch": 0.016, "grad_norm": 158.0, "learning_rate": 6.000000000000001e-07, "loss": 34.7404, "mean_token_accuracy": 0.6322720043361187, "num_tokens": 280941.0, "step": 10 }, { "entropy": 1.103129617869854, "epoch": 0.032, "grad_norm": 140.0, "learning_rate": 1.2666666666666669e-06, "loss": 35.2833, "mean_token_accuracy": 0.627065640874207, "num_tokens": 558056.0, "step": 20 }, { "entropy": 1.1186710916459561, "epoch": 0.048, "grad_norm": 125.5, "learning_rate": 1.9333333333333336e-06, "loss": 34.279, "mean_token_accuracy": 0.6317529268562794, "num_tokens": 836753.0, "step": 30 }, { "entropy": 1.2232312321662904, "epoch": 0.064, "grad_norm": 99.0, "learning_rate": 2.6e-06, "loss": 33.2898, "mean_token_accuracy": 0.6292566264048218, "num_tokens": 1126446.0, "step": 40 }, { "entropy": 1.3642345190048217, "epoch": 0.08, "grad_norm": 78.0, "learning_rate": 3.266666666666667e-06, "loss": 31.1081, "mean_token_accuracy": 0.643590996041894, "num_tokens": 1413596.0, "step": 50 }, { "entropy": 1.4666775342077016, "epoch": 0.096, "grad_norm": 50.0, "learning_rate": 3.9333333333333335e-06, "loss": 28.0855, "mean_token_accuracy": 0.6677330181002616, "num_tokens": 1701193.0, "step": 60 }, { "entropy": 1.479279049113393, "epoch": 0.112, "grad_norm": 44.5, "learning_rate": 4.600000000000001e-06, "loss": 25.9511, "mean_token_accuracy": 0.6763931937515736, "num_tokens": 1979232.0, "step": 70 }, { "entropy": 1.4356445044279098, "epoch": 0.128, "grad_norm": 31.5, "learning_rate": 5.2666666666666665e-06, "loss": 24.2383, "mean_token_accuracy": 0.6900208964943886, "num_tokens": 2274177.0, "step": 80 }, { "entropy": 1.3773791242390871, "epoch": 0.144, "grad_norm": 28.25, "learning_rate": 5.933333333333335e-06, "loss": 22.3128, "mean_token_accuracy": 0.7042559664696455, "num_tokens": 2548445.0, "step": 90 }, { "entropy": 1.3055105797946454, "epoch": 0.16, "grad_norm": 29.25, "learning_rate": 6.600000000000001e-06, "loss": 20.6858, "mean_token_accuracy": 0.718151243403554, "num_tokens": 2824418.0, "step": 100 }, { "epoch": 0.16, "eval_biology_entropy": 3.5783245754241944, "eval_biology_loss": 4.867801189422607, "eval_biology_mean_token_accuracy": 0.3678655695915222, "eval_biology_num_tokens": 2824418.0, "eval_biology_runtime": 50.0015, "eval_biology_samples_per_second": 10.0, "eval_biology_steps_per_second": 2.5, "step": 100 }, { "epoch": 0.16, "eval_chemistry_entropy": 2.541287685394287, "eval_chemistry_loss": 2.7937169075012207, "eval_chemistry_mean_token_accuracy": 0.539857518196106, "eval_chemistry_num_tokens": 2824418.0, "eval_chemistry_runtime": 61.8974, "eval_chemistry_samples_per_second": 8.078, "eval_chemistry_steps_per_second": 2.019, "step": 100 }, { "epoch": 0.16, "eval_math_entropy": 1.325653440952301, "eval_math_loss": 1.2963088750839233, "eval_math_mean_token_accuracy": 0.7152829113006591, "eval_math_num_tokens": 2824418.0, "eval_math_runtime": 63.4831, "eval_math_samples_per_second": 7.876, "eval_math_steps_per_second": 1.969, "step": 100 }, { "epoch": 0.16, "eval_physics_entropy": 2.0472509851455687, "eval_physics_loss": 2.108372688293457, "eval_physics_mean_token_accuracy": 0.6150235052108765, "eval_physics_num_tokens": 2824418.0, "eval_physics_runtime": 72.2792, "eval_physics_samples_per_second": 6.918, "eval_physics_steps_per_second": 1.729, "step": 100 }, { "entropy": 1.2799372777342797, "epoch": 0.176, "grad_norm": 23.25, "learning_rate": 7.266666666666668e-06, "loss": 20.1214, "mean_token_accuracy": 0.7223256763070822, "num_tokens": 3110313.0, "step": 110 }, { "entropy": 1.271880804002285, "epoch": 0.192, "grad_norm": 22.625, "learning_rate": 7.933333333333334e-06, "loss": 19.8735, "mean_token_accuracy": 0.7208763293921947, "num_tokens": 3394170.0, "step": 120 }, { "entropy": 1.2141485448926688, "epoch": 0.208, "grad_norm": 20.25, "learning_rate": 8.6e-06, "loss": 19.3285, "mean_token_accuracy": 0.7284529402852058, "num_tokens": 3673600.0, "step": 130 }, { "entropy": 1.1842644162476064, "epoch": 0.224, "grad_norm": 22.25, "learning_rate": 9.266666666666667e-06, "loss": 18.8222, "mean_token_accuracy": 0.7318667653948069, "num_tokens": 3953732.0, "step": 140 }, { "entropy": 1.1367911960929633, "epoch": 0.24, "grad_norm": 21.875, "learning_rate": 9.933333333333334e-06, "loss": 18.0331, "mean_token_accuracy": 0.7411410234868526, "num_tokens": 4243655.0, "step": 150 }, { "entropy": 1.099366496130824, "epoch": 0.256, "grad_norm": 20.875, "learning_rate": 1.0600000000000002e-05, "loss": 17.5038, "mean_token_accuracy": 0.7470747366547584, "num_tokens": 4531471.0, "step": 160 }, { "entropy": 1.1072874456644057, "epoch": 0.272, "grad_norm": 18.125, "learning_rate": 1.1266666666666668e-05, "loss": 17.6358, "mean_token_accuracy": 0.7453399378806352, "num_tokens": 4810284.0, "step": 170 }, { "entropy": 1.0879811342805623, "epoch": 0.288, "grad_norm": 16.375, "learning_rate": 1.1933333333333335e-05, "loss": 17.2142, "mean_token_accuracy": 0.749498899653554, "num_tokens": 5095104.0, "step": 180 }, { "entropy": 1.0824080746620894, "epoch": 0.304, "grad_norm": 16.625, "learning_rate": 1.2600000000000001e-05, "loss": 17.223, "mean_token_accuracy": 0.7493264392018318, "num_tokens": 5383732.0, "step": 190 }, { "entropy": 1.048988512158394, "epoch": 0.32, "grad_norm": 19.125, "learning_rate": 1.3266666666666668e-05, "loss": 16.7231, "mean_token_accuracy": 0.7555124312639236, "num_tokens": 5676334.0, "step": 200 }, { "epoch": 0.32, "eval_biology_entropy": 3.5249868144989014, "eval_biology_loss": 4.3318328857421875, "eval_biology_mean_token_accuracy": 0.39426790857315064, "eval_biology_num_tokens": 5676334.0, "eval_biology_runtime": 49.8836, "eval_biology_samples_per_second": 10.023, "eval_biology_steps_per_second": 2.506, "step": 200 }, { "epoch": 0.32, "eval_chemistry_entropy": 2.3051584205627442, "eval_chemistry_loss": 2.4184799194335938, "eval_chemistry_mean_token_accuracy": 0.5702217166423798, "eval_chemistry_num_tokens": 5676334.0, "eval_chemistry_runtime": 61.8203, "eval_chemistry_samples_per_second": 8.088, "eval_chemistry_steps_per_second": 2.022, "step": 200 }, { "epoch": 0.32, "eval_math_entropy": 1.0760114483833314, "eval_math_loss": 1.0703883171081543, "eval_math_mean_token_accuracy": 0.7492856097221374, "eval_math_num_tokens": 5676334.0, "eval_math_runtime": 63.3918, "eval_math_samples_per_second": 7.887, "eval_math_steps_per_second": 1.972, "step": 200 }, { "epoch": 0.32, "eval_physics_entropy": 1.7677014241218567, "eval_physics_loss": 1.8187588453292847, "eval_physics_mean_token_accuracy": 0.6450894718170166, "eval_physics_num_tokens": 5676334.0, "eval_physics_runtime": 72.1893, "eval_physics_samples_per_second": 6.926, "eval_physics_steps_per_second": 1.732, "step": 200 }, { "entropy": 1.0659823317080737, "epoch": 0.336, "grad_norm": 18.125, "learning_rate": 1.3933333333333334e-05, "loss": 17.0855, "mean_token_accuracy": 0.7487949147820473, "num_tokens": 5958480.0, "step": 210 }, { "entropy": 1.0319596778601408, "epoch": 0.352, "grad_norm": 16.625, "learning_rate": 1.46e-05, "loss": 16.3603, "mean_token_accuracy": 0.7570414818823338, "num_tokens": 6242161.0, "step": 220 }, { "entropy": 1.0307059772312641, "epoch": 0.368, "grad_norm": 19.125, "learning_rate": 1.5266666666666667e-05, "loss": 16.4803, "mean_token_accuracy": 0.7558304250240326, "num_tokens": 6523679.0, "step": 230 }, { "entropy": 1.008485846966505, "epoch": 0.384, "grad_norm": 17.75, "learning_rate": 1.5933333333333336e-05, "loss": 16.1119, "mean_token_accuracy": 0.7602646600455045, "num_tokens": 6810978.0, "step": 240 }, { "entropy": 1.0437373481690884, "epoch": 0.4, "grad_norm": 17.125, "learning_rate": 1.66e-05, "loss": 16.936, "mean_token_accuracy": 0.7534951154142618, "num_tokens": 7096903.0, "step": 250 }, { "entropy": 1.0108945474028588, "epoch": 0.416, "grad_norm": 16.75, "learning_rate": 1.726666666666667e-05, "loss": 16.165, "mean_token_accuracy": 0.7585977960377932, "num_tokens": 7377181.0, "step": 260 }, { "entropy": 0.9808746818453074, "epoch": 0.432, "grad_norm": 19.125, "learning_rate": 1.7933333333333333e-05, "loss": 15.708, "mean_token_accuracy": 0.7660888392478228, "num_tokens": 7650523.0, "step": 270 }, { "entropy": 0.9656771093606948, "epoch": 0.448, "grad_norm": 19.375, "learning_rate": 1.86e-05, "loss": 15.4934, "mean_token_accuracy": 0.7665207251906395, "num_tokens": 7936788.0, "step": 280 }, { "entropy": 0.970192850753665, "epoch": 0.464, "grad_norm": 21.125, "learning_rate": 1.926666666666667e-05, "loss": 15.697, "mean_token_accuracy": 0.7657037891447545, "num_tokens": 8223147.0, "step": 290 }, { "entropy": 0.9936408184468746, "epoch": 0.48, "grad_norm": 18.375, "learning_rate": 1.9933333333333334e-05, "loss": 15.8279, "mean_token_accuracy": 0.7644913751631975, "num_tokens": 8506030.0, "step": 300 }, { "epoch": 0.48, "eval_biology_entropy": 3.547258296966553, "eval_biology_loss": 4.036286354064941, "eval_biology_mean_token_accuracy": 0.4092668824195862, "eval_biology_num_tokens": 8506030.0, "eval_biology_runtime": 49.8646, "eval_biology_samples_per_second": 10.027, "eval_biology_steps_per_second": 2.507, "step": 300 }, { "epoch": 0.48, "eval_chemistry_entropy": 2.243741189956665, "eval_chemistry_loss": 2.2735037803649902, "eval_chemistry_mean_token_accuracy": 0.585998829126358, "eval_chemistry_num_tokens": 8506030.0, "eval_chemistry_runtime": 61.8087, "eval_chemistry_samples_per_second": 8.089, "eval_chemistry_steps_per_second": 2.022, "step": 300 }, { "epoch": 0.48, "eval_math_entropy": 0.969578085899353, "eval_math_loss": 0.9849457740783691, "eval_math_mean_token_accuracy": 0.7633716745376586, "eval_math_num_tokens": 8506030.0, "eval_math_runtime": 63.4567, "eval_math_samples_per_second": 7.879, "eval_math_steps_per_second": 1.97, "step": 300 }, { "epoch": 0.48, "eval_physics_entropy": 1.6970900883674622, "eval_physics_loss": 1.7149206399917603, "eval_physics_mean_token_accuracy": 0.6588896939754486, "eval_physics_num_tokens": 8506030.0, "eval_physics_runtime": 72.1904, "eval_physics_samples_per_second": 6.926, "eval_physics_steps_per_second": 1.732, "step": 300 }, { "entropy": 0.9509127460420131, "epoch": 0.496, "grad_norm": 27.125, "learning_rate": 1.9933333333333334e-05, "loss": 15.2509, "mean_token_accuracy": 0.7699485655874014, "num_tokens": 8788726.0, "step": 310 }, { "entropy": 0.9583811838179827, "epoch": 0.512, "grad_norm": 17.75, "learning_rate": 1.985925925925926e-05, "loss": 15.3156, "mean_token_accuracy": 0.7701035752892494, "num_tokens": 9078039.0, "step": 320 }, { "entropy": 0.9286689113825559, "epoch": 0.528, "grad_norm": 21.375, "learning_rate": 1.9785185185185187e-05, "loss": 14.9975, "mean_token_accuracy": 0.7747602745890617, "num_tokens": 9373860.0, "step": 330 }, { "entropy": 0.93574563190341, "epoch": 0.544, "grad_norm": 19.0, "learning_rate": 1.971111111111111e-05, "loss": 15.1982, "mean_token_accuracy": 0.7723268665373325, "num_tokens": 9660940.0, "step": 340 }, { "entropy": 0.9833962079137564, "epoch": 0.56, "grad_norm": 27.375, "learning_rate": 1.963703703703704e-05, "loss": 15.6566, "mean_token_accuracy": 0.7635074742138386, "num_tokens": 9932302.0, "step": 350 }, { "entropy": 0.9380292896181345, "epoch": 0.576, "grad_norm": 19.5, "learning_rate": 1.9562962962962964e-05, "loss": 15.0896, "mean_token_accuracy": 0.771076611801982, "num_tokens": 10215462.0, "step": 360 }, { "entropy": 0.9226088404655457, "epoch": 0.592, "grad_norm": 17.125, "learning_rate": 1.948888888888889e-05, "loss": 14.9585, "mean_token_accuracy": 0.7726155135780572, "num_tokens": 10504396.0, "step": 370 }, { "entropy": 0.9383664086461068, "epoch": 0.608, "grad_norm": 16.875, "learning_rate": 1.9414814814814817e-05, "loss": 15.0747, "mean_token_accuracy": 0.772657984867692, "num_tokens": 10793126.0, "step": 380 }, { "entropy": 0.9145499892532826, "epoch": 0.624, "grad_norm": 18.875, "learning_rate": 1.9340740740740743e-05, "loss": 14.7611, "mean_token_accuracy": 0.7738783087581396, "num_tokens": 11081768.0, "step": 390 }, { "entropy": 0.9076093930751086, "epoch": 0.64, "grad_norm": 17.5, "learning_rate": 1.926666666666667e-05, "loss": 14.491, "mean_token_accuracy": 0.7761218074709177, "num_tokens": 11370320.0, "step": 400 }, { "epoch": 0.64, "eval_biology_entropy": 3.4644125595092774, "eval_biology_loss": 3.9791781902313232, "eval_biology_mean_token_accuracy": 0.4146076333522797, "eval_biology_num_tokens": 11370320.0, "eval_biology_runtime": 49.8972, "eval_biology_samples_per_second": 10.021, "eval_biology_steps_per_second": 2.505, "step": 400 }, { "epoch": 0.64, "eval_chemistry_entropy": 2.1470180315971374, "eval_chemistry_loss": 2.1781718730926514, "eval_chemistry_mean_token_accuracy": 0.5962020962238311, "eval_chemistry_num_tokens": 11370320.0, "eval_chemistry_runtime": 61.8451, "eval_chemistry_samples_per_second": 8.085, "eval_chemistry_steps_per_second": 2.021, "step": 400 }, { "epoch": 0.64, "eval_math_entropy": 0.9346609992980957, "eval_math_loss": 0.9384942054748535, "eval_math_mean_token_accuracy": 0.771290735244751, "eval_math_num_tokens": 11370320.0, "eval_math_runtime": 63.3885, "eval_math_samples_per_second": 7.888, "eval_math_steps_per_second": 1.972, "step": 400 }, { "epoch": 0.64, "eval_physics_entropy": 1.6158739218711853, "eval_physics_loss": 1.6468137502670288, "eval_physics_mean_token_accuracy": 0.6678030126094818, "eval_physics_num_tokens": 11370320.0, "eval_physics_runtime": 72.1919, "eval_physics_samples_per_second": 6.926, "eval_physics_steps_per_second": 1.731, "step": 400 }, { "entropy": 0.8844340030103922, "epoch": 0.656, "grad_norm": 15.625, "learning_rate": 1.9192592592592593e-05, "loss": 14.2919, "mean_token_accuracy": 0.7790201846510172, "num_tokens": 11657835.0, "step": 410 }, { "entropy": 0.9230461683124304, "epoch": 0.672, "grad_norm": 17.25, "learning_rate": 1.911851851851852e-05, "loss": 14.8067, "mean_token_accuracy": 0.7730407487601042, "num_tokens": 11949262.0, "step": 420 }, { "entropy": 0.8864722024649382, "epoch": 0.688, "grad_norm": 14.5625, "learning_rate": 1.9044444444444446e-05, "loss": 14.2543, "mean_token_accuracy": 0.7801163170486689, "num_tokens": 12227640.0, "step": 430 }, { "entropy": 0.8921929065138101, "epoch": 0.704, "grad_norm": 17.5, "learning_rate": 1.8970370370370372e-05, "loss": 14.3854, "mean_token_accuracy": 0.7790493600070476, "num_tokens": 12516641.0, "step": 440 }, { "entropy": 0.9114352393895387, "epoch": 0.72, "grad_norm": 18.75, "learning_rate": 1.8896296296296295e-05, "loss": 14.7549, "mean_token_accuracy": 0.7744539573788642, "num_tokens": 12793343.0, "step": 450 }, { "entropy": 0.9004437670111656, "epoch": 0.736, "grad_norm": 16.625, "learning_rate": 1.8822222222222225e-05, "loss": 14.4296, "mean_token_accuracy": 0.7780555929988623, "num_tokens": 13077981.0, "step": 460 }, { "entropy": 0.8800053089857102, "epoch": 0.752, "grad_norm": 19.5, "learning_rate": 1.874814814814815e-05, "loss": 14.1044, "mean_token_accuracy": 0.7824321106076241, "num_tokens": 13358957.0, "step": 470 }, { "entropy": 0.8506957441568375, "epoch": 0.768, "grad_norm": 15.25, "learning_rate": 1.8674074074074075e-05, "loss": 13.7613, "mean_token_accuracy": 0.7884838156402111, "num_tokens": 13653412.0, "step": 480 }, { "entropy": 0.9173242174088955, "epoch": 0.784, "grad_norm": 18.125, "learning_rate": 1.86e-05, "loss": 14.7366, "mean_token_accuracy": 0.776311819255352, "num_tokens": 13940856.0, "step": 490 }, { "entropy": 0.9153516864404082, "epoch": 0.8, "grad_norm": 18.5, "learning_rate": 1.8525925925925928e-05, "loss": 14.84, "mean_token_accuracy": 0.7749685924500227, "num_tokens": 14230754.0, "step": 500 }, { "epoch": 0.8, "eval_biology_entropy": 3.587346399307251, "eval_biology_loss": 3.8396143913269043, "eval_biology_mean_token_accuracy": 0.42098368620872495, "eval_biology_num_tokens": 14230754.0, "eval_biology_runtime": 49.8808, "eval_biology_samples_per_second": 10.024, "eval_biology_steps_per_second": 2.506, "step": 500 }, { "epoch": 0.8, "eval_chemistry_entropy": 2.168390507221222, "eval_chemistry_loss": 2.1044840812683105, "eval_chemistry_mean_token_accuracy": 0.6037136344909668, "eval_chemistry_num_tokens": 14230754.0, "eval_chemistry_runtime": 61.4676, "eval_chemistry_samples_per_second": 8.134, "eval_chemistry_steps_per_second": 2.034, "step": 500 }, { "epoch": 0.8, "eval_math_entropy": 0.9173241977691651, "eval_math_loss": 0.911715030670166, "eval_math_mean_token_accuracy": 0.7763807888031006, "eval_math_num_tokens": 14230754.0, "eval_math_runtime": 63.353, "eval_math_samples_per_second": 7.892, "eval_math_steps_per_second": 1.973, "step": 500 }, { "epoch": 0.8, "eval_physics_entropy": 1.6311655960083007, "eval_physics_loss": 1.610815167427063, "eval_physics_mean_token_accuracy": 0.6733945074081421, "eval_physics_num_tokens": 14230754.0, "eval_physics_runtime": 72.3928, "eval_physics_samples_per_second": 6.907, "eval_physics_steps_per_second": 1.727, "step": 500 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.448451560178856e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }