{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8, "eval_steps": 100, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.142920307815075, "epoch": 0.016, "grad_norm": 290.0, "learning_rate": 6.000000000000001e-07, "loss": 42.6658, "mean_token_accuracy": 0.5620782226324081, "num_tokens": 195524.0, "step": 10 }, { "entropy": 1.148210159689188, "epoch": 0.032, "grad_norm": 239.0, "learning_rate": 1.2666666666666669e-06, "loss": 41.9984, "mean_token_accuracy": 0.5613080382347106, "num_tokens": 390903.0, "step": 20 }, { "entropy": 1.1933260083198547, "epoch": 0.048, "grad_norm": 249.0, "learning_rate": 1.9333333333333336e-06, "loss": 40.6208, "mean_token_accuracy": 0.5657517908141017, "num_tokens": 589868.0, "step": 30 }, { "entropy": 1.2957281917333603, "epoch": 0.064, "grad_norm": 139.0, "learning_rate": 2.6e-06, "loss": 37.9032, "mean_token_accuracy": 0.5714796105399728, "num_tokens": 791190.0, "step": 40 }, { "entropy": 1.5075685508549213, "epoch": 0.08, "grad_norm": 94.0, "learning_rate": 3.266666666666667e-06, "loss": 35.7561, "mean_token_accuracy": 0.5766569443047047, "num_tokens": 989860.0, "step": 50 }, { "entropy": 1.7984249681234359, "epoch": 0.096, "grad_norm": 50.75, "learning_rate": 3.9333333333333335e-06, "loss": 33.4379, "mean_token_accuracy": 0.5814697606489062, "num_tokens": 1181777.0, "step": 60 }, { "entropy": 1.8387351341545581, "epoch": 0.112, "grad_norm": 43.0, "learning_rate": 4.600000000000001e-06, "loss": 30.4219, "mean_token_accuracy": 0.5971228444948793, "num_tokens": 1385513.0, "step": 70 }, { "entropy": 1.7275233700871468, "epoch": 0.128, "grad_norm": 33.5, "learning_rate": 5.2666666666666665e-06, "loss": 28.4703, "mean_token_accuracy": 0.6095364252105355, "num_tokens": 1582368.0, "step": 80 }, { "entropy": 1.7214979872107505, "epoch": 0.144, "grad_norm": 27.0, "learning_rate": 5.933333333333335e-06, "loss": 26.677, "mean_token_accuracy": 0.6243448719382286, "num_tokens": 1773764.0, "step": 90 }, { "entropy": 1.6311134904623033, "epoch": 0.16, "grad_norm": 22.0, "learning_rate": 6.600000000000001e-06, "loss": 25.7683, "mean_token_accuracy": 0.6301404371857643, "num_tokens": 1970077.0, "step": 100 }, { "epoch": 0.16, "eval_biology_entropy": 1.5580159120559693, "eval_biology_loss": 1.5081593990325928, "eval_biology_mean_token_accuracy": 0.6457349667549134, "eval_biology_num_tokens": 1970077.0, "eval_biology_runtime": 48.7413, "eval_biology_samples_per_second": 10.258, "eval_biology_steps_per_second": 2.565, "step": 100 }, { "epoch": 0.16, "eval_chemistry_entropy": 1.206756212234497, "eval_chemistry_loss": 1.1218774318695068, "eval_chemistry_mean_token_accuracy": 0.7205783066749573, "eval_chemistry_num_tokens": 1970077.0, "eval_chemistry_runtime": 60.3159, "eval_chemistry_samples_per_second": 8.29, "eval_chemistry_steps_per_second": 2.072, "step": 100 }, { "epoch": 0.16, "eval_math_entropy": 0.9672308325767517, "eval_math_loss": 1.159799337387085, "eval_math_mean_token_accuracy": 0.7189845342636109, "eval_math_num_tokens": 1970077.0, "eval_math_runtime": 61.8237, "eval_math_samples_per_second": 8.088, "eval_math_steps_per_second": 2.022, "step": 100 }, { "epoch": 0.16, "eval_physics_entropy": 1.1670387201309205, "eval_physics_loss": 1.1291608810424805, "eval_physics_mean_token_accuracy": 0.7211072521209717, "eval_physics_num_tokens": 1970077.0, "eval_physics_runtime": 70.4586, "eval_physics_samples_per_second": 7.096, "eval_physics_steps_per_second": 1.774, "step": 100 }, { "entropy": 1.5482715763151647, "epoch": 0.176, "grad_norm": 21.125, "learning_rate": 7.266666666666668e-06, "loss": 24.5868, "mean_token_accuracy": 0.6385629490017891, "num_tokens": 2168354.0, "step": 110 }, { "entropy": 1.5266574397683144, "epoch": 0.192, "grad_norm": 22.875, "learning_rate": 7.933333333333334e-06, "loss": 24.2707, "mean_token_accuracy": 0.6432460084557533, "num_tokens": 2365822.0, "step": 120 }, { "entropy": 1.5192069873213767, "epoch": 0.208, "grad_norm": 20.875, "learning_rate": 8.6e-06, "loss": 24.1355, "mean_token_accuracy": 0.6436416517943144, "num_tokens": 2558762.0, "step": 130 }, { "entropy": 1.4698147468268872, "epoch": 0.224, "grad_norm": 20.125, "learning_rate": 9.266666666666667e-06, "loss": 23.5154, "mean_token_accuracy": 0.6499760080128908, "num_tokens": 2755347.0, "step": 140 }, { "entropy": 1.4506230603903532, "epoch": 0.24, "grad_norm": 19.625, "learning_rate": 9.933333333333334e-06, "loss": 23.2013, "mean_token_accuracy": 0.6523264441639185, "num_tokens": 2947346.0, "step": 150 }, { "entropy": 1.4590953961014748, "epoch": 0.256, "grad_norm": 18.5, "learning_rate": 1.0600000000000002e-05, "loss": 23.3227, "mean_token_accuracy": 0.6508617259562015, "num_tokens": 3139957.0, "step": 160 }, { "entropy": 1.419396448880434, "epoch": 0.272, "grad_norm": 19.75, "learning_rate": 1.1266666666666668e-05, "loss": 22.7352, "mean_token_accuracy": 0.6572458431124687, "num_tokens": 3335951.0, "step": 170 }, { "entropy": 1.4005608204752207, "epoch": 0.288, "grad_norm": 19.75, "learning_rate": 1.1933333333333335e-05, "loss": 22.3969, "mean_token_accuracy": 0.6585959013551473, "num_tokens": 3539731.0, "step": 180 }, { "entropy": 1.391934547200799, "epoch": 0.304, "grad_norm": 18.75, "learning_rate": 1.2600000000000001e-05, "loss": 22.31, "mean_token_accuracy": 0.6621056370437145, "num_tokens": 3733488.0, "step": 190 }, { "entropy": 1.4028674490749835, "epoch": 0.32, "grad_norm": 22.25, "learning_rate": 1.3266666666666668e-05, "loss": 22.5559, "mean_token_accuracy": 0.6576981086283922, "num_tokens": 3920545.0, "step": 200 }, { "epoch": 0.32, "eval_biology_entropy": 1.3209806289672852, "eval_biology_loss": 1.338399887084961, "eval_biology_mean_token_accuracy": 0.6720403518676757, "eval_biology_num_tokens": 3920545.0, "eval_biology_runtime": 48.5853, "eval_biology_samples_per_second": 10.291, "eval_biology_steps_per_second": 2.573, "step": 200 }, { "epoch": 0.32, "eval_chemistry_entropy": 1.0033348879814148, "eval_chemistry_loss": 0.9935092926025391, "eval_chemistry_mean_token_accuracy": 0.7448974308967591, "eval_chemistry_num_tokens": 3920545.0, "eval_chemistry_runtime": 60.24, "eval_chemistry_samples_per_second": 8.3, "eval_chemistry_steps_per_second": 2.075, "step": 200 }, { "epoch": 0.32, "eval_math_entropy": 0.8341804294586181, "eval_math_loss": 1.0635857582092285, "eval_math_mean_token_accuracy": 0.7432106451988221, "eval_math_num_tokens": 3920545.0, "eval_math_runtime": 61.8174, "eval_math_samples_per_second": 8.088, "eval_math_steps_per_second": 2.022, "step": 200 }, { "epoch": 0.32, "eval_physics_entropy": 0.9652358031272888, "eval_physics_loss": 0.9950281977653503, "eval_physics_mean_token_accuracy": 0.7510108857154846, "eval_physics_num_tokens": 3920545.0, "eval_physics_runtime": 70.411, "eval_physics_samples_per_second": 7.101, "eval_physics_steps_per_second": 1.775, "step": 200 }, { "entropy": 1.3548175282776356, "epoch": 0.336, "grad_norm": 19.625, "learning_rate": 1.3933333333333334e-05, "loss": 21.7763, "mean_token_accuracy": 0.6656343434005976, "num_tokens": 4114077.0, "step": 210 }, { "entropy": 1.3656601022928954, "epoch": 0.352, "grad_norm": 20.625, "learning_rate": 1.46e-05, "loss": 22.0972, "mean_token_accuracy": 0.6638848338276148, "num_tokens": 4306949.0, "step": 220 }, { "entropy": 1.3525194190442562, "epoch": 0.368, "grad_norm": 18.125, "learning_rate": 1.5266666666666667e-05, "loss": 21.7293, "mean_token_accuracy": 0.6680811226367951, "num_tokens": 4504001.0, "step": 230 }, { "entropy": 1.3454820621758699, "epoch": 0.384, "grad_norm": 21.25, "learning_rate": 1.5933333333333336e-05, "loss": 21.7032, "mean_token_accuracy": 0.6671383358538151, "num_tokens": 4693812.0, "step": 240 }, { "entropy": 1.3525703553110362, "epoch": 0.4, "grad_norm": 17.5, "learning_rate": 1.66e-05, "loss": 21.7856, "mean_token_accuracy": 0.666401931643486, "num_tokens": 4887094.0, "step": 250 }, { "entropy": 1.351718918606639, "epoch": 0.416, "grad_norm": 19.0, "learning_rate": 1.726666666666667e-05, "loss": 21.9058, "mean_token_accuracy": 0.6651136819273233, "num_tokens": 5085369.0, "step": 260 }, { "entropy": 1.3526419658213853, "epoch": 0.432, "grad_norm": 20.875, "learning_rate": 1.7933333333333333e-05, "loss": 21.7813, "mean_token_accuracy": 0.6668458927422762, "num_tokens": 5271275.0, "step": 270 }, { "entropy": 1.3480545241385697, "epoch": 0.448, "grad_norm": 22.875, "learning_rate": 1.86e-05, "loss": 21.627, "mean_token_accuracy": 0.6677324704825878, "num_tokens": 5460559.0, "step": 280 }, { "entropy": 1.301166184991598, "epoch": 0.464, "grad_norm": 21.25, "learning_rate": 1.926666666666667e-05, "loss": 20.889, "mean_token_accuracy": 0.676617132872343, "num_tokens": 5653809.0, "step": 290 }, { "entropy": 1.318466317281127, "epoch": 0.48, "grad_norm": 17.125, "learning_rate": 1.9933333333333334e-05, "loss": 21.2936, "mean_token_accuracy": 0.6712827417999506, "num_tokens": 5850176.0, "step": 300 }, { "epoch": 0.48, "eval_biology_entropy": 1.2827796216011047, "eval_biology_loss": 1.275201678276062, "eval_biology_mean_token_accuracy": 0.6830832781791687, "eval_biology_num_tokens": 5850176.0, "eval_biology_runtime": 48.4915, "eval_biology_samples_per_second": 10.311, "eval_biology_steps_per_second": 2.578, "step": 300 }, { "epoch": 0.48, "eval_chemistry_entropy": 0.983495129108429, "eval_chemistry_loss": 0.9488818645477295, "eval_chemistry_mean_token_accuracy": 0.7523409638404847, "eval_chemistry_num_tokens": 5850176.0, "eval_chemistry_runtime": 60.1707, "eval_chemistry_samples_per_second": 8.31, "eval_chemistry_steps_per_second": 2.077, "step": 300 }, { "epoch": 0.48, "eval_math_entropy": 0.8216862387657166, "eval_math_loss": 1.0297818183898926, "eval_math_mean_token_accuracy": 0.7488151121139527, "eval_math_num_tokens": 5850176.0, "eval_math_runtime": 61.6905, "eval_math_samples_per_second": 8.105, "eval_math_steps_per_second": 2.026, "step": 300 }, { "epoch": 0.48, "eval_physics_entropy": 0.9433758721351624, "eval_physics_loss": 0.9520999193191528, "eval_physics_mean_token_accuracy": 0.7585058889389038, "eval_physics_num_tokens": 5850176.0, "eval_physics_runtime": 70.301, "eval_physics_samples_per_second": 7.112, "eval_physics_steps_per_second": 1.778, "step": 300 }, { "entropy": 1.2579400472342968, "epoch": 0.496, "grad_norm": 17.75, "learning_rate": 1.9933333333333334e-05, "loss": 20.2011, "mean_token_accuracy": 0.6842056062072516, "num_tokens": 6046503.0, "step": 310 }, { "entropy": 1.3082518883049488, "epoch": 0.512, "grad_norm": 18.125, "learning_rate": 1.985925925925926e-05, "loss": 21.0658, "mean_token_accuracy": 0.6749501373618841, "num_tokens": 6240456.0, "step": 320 }, { "entropy": 1.3003981616348028, "epoch": 0.528, "grad_norm": 18.125, "learning_rate": 1.9785185185185187e-05, "loss": 20.9809, "mean_token_accuracy": 0.6757604543119669, "num_tokens": 6430555.0, "step": 330 }, { "entropy": 1.2986273631453513, "epoch": 0.544, "grad_norm": 17.0, "learning_rate": 1.971111111111111e-05, "loss": 20.8809, "mean_token_accuracy": 0.6782271713018417, "num_tokens": 6626006.0, "step": 340 }, { "entropy": 1.284830729290843, "epoch": 0.56, "grad_norm": 17.25, "learning_rate": 1.963703703703704e-05, "loss": 20.8197, "mean_token_accuracy": 0.6767117112874985, "num_tokens": 6820754.0, "step": 350 }, { "entropy": 1.2683125745505095, "epoch": 0.576, "grad_norm": 17.0, "learning_rate": 1.9562962962962964e-05, "loss": 20.4541, "mean_token_accuracy": 0.6809794403612613, "num_tokens": 7021844.0, "step": 360 }, { "entropy": 1.2863252360373736, "epoch": 0.592, "grad_norm": 18.875, "learning_rate": 1.948888888888889e-05, "loss": 20.8043, "mean_token_accuracy": 0.676701345667243, "num_tokens": 7213951.0, "step": 370 }, { "entropy": 1.2630502216517925, "epoch": 0.608, "grad_norm": 18.75, "learning_rate": 1.9414814814814817e-05, "loss": 20.4041, "mean_token_accuracy": 0.6803740747272968, "num_tokens": 7416773.0, "step": 380 }, { "entropy": 1.2804703898727894, "epoch": 0.624, "grad_norm": 19.25, "learning_rate": 1.9340740740740743e-05, "loss": 20.6218, "mean_token_accuracy": 0.6788272958248853, "num_tokens": 7612843.0, "step": 390 }, { "entropy": 1.2843346055597067, "epoch": 0.64, "grad_norm": 18.0, "learning_rate": 1.926666666666667e-05, "loss": 20.7171, "mean_token_accuracy": 0.6782444745302201, "num_tokens": 7801633.0, "step": 400 }, { "epoch": 0.64, "eval_biology_entropy": 1.226506398677826, "eval_biology_loss": 1.2382104396820068, "eval_biology_mean_token_accuracy": 0.6894095778465271, "eval_biology_num_tokens": 7801633.0, "eval_biology_runtime": 48.5507, "eval_biology_samples_per_second": 10.299, "eval_biology_steps_per_second": 2.575, "step": 400 }, { "epoch": 0.64, "eval_chemistry_entropy": 0.9317227191925049, "eval_chemistry_loss": 0.9207452535629272, "eval_chemistry_mean_token_accuracy": 0.7581370029449462, "eval_chemistry_num_tokens": 7801633.0, "eval_chemistry_runtime": 60.2113, "eval_chemistry_samples_per_second": 8.304, "eval_chemistry_steps_per_second": 2.076, "step": 400 }, { "epoch": 0.64, "eval_math_entropy": 0.7863595089912414, "eval_math_loss": 1.010460376739502, "eval_math_mean_token_accuracy": 0.7535392093658447, "eval_math_num_tokens": 7801633.0, "eval_math_runtime": 61.807, "eval_math_samples_per_second": 8.09, "eval_math_steps_per_second": 2.022, "step": 400 }, { "epoch": 0.64, "eval_physics_entropy": 0.8958085932731629, "eval_physics_loss": 0.9257401823997498, "eval_physics_mean_token_accuracy": 0.7637984156608582, "eval_physics_num_tokens": 7801633.0, "eval_physics_runtime": 70.3663, "eval_physics_samples_per_second": 7.106, "eval_physics_steps_per_second": 1.776, "step": 400 }, { "entropy": 1.278659427165985, "epoch": 0.656, "grad_norm": 18.25, "learning_rate": 1.9192592592592593e-05, "loss": 20.6682, "mean_token_accuracy": 0.6772829819470644, "num_tokens": 7995843.0, "step": 410 }, { "entropy": 1.2931427203118802, "epoch": 0.672, "grad_norm": 18.625, "learning_rate": 1.911851851851852e-05, "loss": 20.8656, "mean_token_accuracy": 0.6753748003393412, "num_tokens": 8183103.0, "step": 420 }, { "entropy": 1.2739692747592926, "epoch": 0.688, "grad_norm": 16.75, "learning_rate": 1.9044444444444446e-05, "loss": 20.5407, "mean_token_accuracy": 0.6812681049108505, "num_tokens": 8385976.0, "step": 430 }, { "entropy": 1.2659825466573238, "epoch": 0.704, "grad_norm": 16.25, "learning_rate": 1.8970370370370372e-05, "loss": 20.4243, "mean_token_accuracy": 0.6820976916700602, "num_tokens": 8578431.0, "step": 440 }, { "entropy": 1.220404140278697, "epoch": 0.72, "grad_norm": 16.75, "learning_rate": 1.8896296296296295e-05, "loss": 19.6546, "mean_token_accuracy": 0.6908745598047972, "num_tokens": 8781342.0, "step": 450 }, { "entropy": 1.2406103231012822, "epoch": 0.736, "grad_norm": 16.75, "learning_rate": 1.8822222222222225e-05, "loss": 19.9745, "mean_token_accuracy": 0.6853331789374352, "num_tokens": 8977918.0, "step": 460 }, { "entropy": 1.2618801843374967, "epoch": 0.752, "grad_norm": 17.125, "learning_rate": 1.874814814814815e-05, "loss": 20.4041, "mean_token_accuracy": 0.6825968738645315, "num_tokens": 9169322.0, "step": 470 }, { "entropy": 1.2232345014810562, "epoch": 0.768, "grad_norm": 19.25, "learning_rate": 1.8674074074074075e-05, "loss": 19.7045, "mean_token_accuracy": 0.6888250291347504, "num_tokens": 9368141.0, "step": 480 }, { "entropy": 1.25159954726696, "epoch": 0.784, "grad_norm": 18.25, "learning_rate": 1.86e-05, "loss": 20.2036, "mean_token_accuracy": 0.6849453710019588, "num_tokens": 9565236.0, "step": 490 }, { "entropy": 1.264250884205103, "epoch": 0.8, "grad_norm": 19.25, "learning_rate": 1.8525925925925928e-05, "loss": 20.5299, "mean_token_accuracy": 0.6811827480792999, "num_tokens": 9761227.0, "step": 500 }, { "epoch": 0.8, "eval_biology_entropy": 1.2163097896575927, "eval_biology_loss": 1.2177292108535767, "eval_biology_mean_token_accuracy": 0.6932459664344788, "eval_biology_num_tokens": 9761227.0, "eval_biology_runtime": 48.5438, "eval_biology_samples_per_second": 10.3, "eval_biology_steps_per_second": 2.575, "step": 500 }, { "epoch": 0.8, "eval_chemistry_entropy": 0.9239063205718994, "eval_chemistry_loss": 0.9047155380249023, "eval_chemistry_mean_token_accuracy": 0.761792631149292, "eval_chemistry_num_tokens": 9761227.0, "eval_chemistry_runtime": 59.9546, "eval_chemistry_samples_per_second": 8.34, "eval_chemistry_steps_per_second": 2.085, "step": 500 }, { "epoch": 0.8, "eval_math_entropy": 0.7864464523792267, "eval_math_loss": 0.9939978122711182, "eval_math_mean_token_accuracy": 0.7574145245552063, "eval_math_num_tokens": 9761227.0, "eval_math_runtime": 61.7812, "eval_math_samples_per_second": 8.093, "eval_math_steps_per_second": 2.023, "step": 500 }, { "epoch": 0.8, "eval_physics_entropy": 0.889360978603363, "eval_physics_loss": 0.9096766710281372, "eval_physics_mean_token_accuracy": 0.7674052910804748, "eval_physics_num_tokens": 9761227.0, "eval_physics_runtime": 70.5356, "eval_physics_samples_per_second": 7.089, "eval_physics_steps_per_second": 1.772, "step": 500 } ], "logging_steps": 10, "max_steps": 3000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0976936565348991e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }