{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4331348132106118, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02707092582566324, "grad_norm": 0.8431992530822754, "learning_rate": 9.910124526258799e-05, "loss": 4.3576, "step": 250 }, { "epoch": 0.05414185165132648, "grad_norm": 0.8061181306838989, "learning_rate": 9.819888106839921e-05, "loss": 4.2775, "step": 500 }, { "epoch": 0.05414185165132648, "eval_loss": 3.5462777614593506, "eval_runtime": 87.734, "eval_samples_per_second": 112.271, "eval_steps_per_second": 7.021, "step": 500 }, { "epoch": 0.08121277747698971, "grad_norm": 0.7875335812568665, "learning_rate": 9.729651687421044e-05, "loss": 4.1927, "step": 750 }, { "epoch": 0.10828370330265295, "grad_norm": 1.0330173969268799, "learning_rate": 9.639415268002166e-05, "loss": 4.1178, "step": 1000 }, { "epoch": 0.10828370330265295, "eval_loss": 3.3765828609466553, "eval_runtime": 87.6227, "eval_samples_per_second": 112.414, "eval_steps_per_second": 7.03, "step": 1000 }, { "epoch": 0.1353546291283162, "grad_norm": 1.041638731956482, "learning_rate": 9.549178848583288e-05, "loss": 4.0201, "step": 1250 }, { "epoch": 0.16242555495397942, "grad_norm": 0.8976061940193176, "learning_rate": 9.458942429164411e-05, "loss": 3.9802, "step": 1500 }, { "epoch": 0.16242555495397942, "eval_loss": 3.2597038745880127, "eval_runtime": 87.5921, "eval_samples_per_second": 112.453, "eval_steps_per_second": 7.033, "step": 1500 }, { "epoch": 0.18949648077964265, "grad_norm": 1.0333911180496216, "learning_rate": 9.368706009745533e-05, "loss": 3.9335, "step": 1750 }, { "epoch": 0.2165674066053059, "grad_norm": 0.9179081320762634, "learning_rate": 9.278469590326656e-05, "loss": 3.8709, "step": 2000 }, { "epoch": 0.2165674066053059, "eval_loss": 3.1846394538879395, "eval_runtime": 87.5421, "eval_samples_per_second": 112.517, "eval_steps_per_second": 7.037, "step": 2000 }, { "epoch": 0.24363833243096913, "grad_norm": 1.0419113636016846, "learning_rate": 9.188233170907778e-05, "loss": 3.8416, "step": 2250 }, { "epoch": 0.2707092582566324, "grad_norm": 0.9652225375175476, "learning_rate": 9.0979967514889e-05, "loss": 3.807, "step": 2500 }, { "epoch": 0.2707092582566324, "eval_loss": 3.119335174560547, "eval_runtime": 87.4769, "eval_samples_per_second": 112.601, "eval_steps_per_second": 7.042, "step": 2500 }, { "epoch": 0.2977801840822956, "grad_norm": 0.8792561888694763, "learning_rate": 9.007760332070024e-05, "loss": 3.7754, "step": 2750 }, { "epoch": 0.32485110990795885, "grad_norm": 0.9625837206840515, "learning_rate": 8.917523912651147e-05, "loss": 3.7471, "step": 3000 }, { "epoch": 0.32485110990795885, "eval_loss": 3.0782463550567627, "eval_runtime": 87.7273, "eval_samples_per_second": 112.28, "eval_steps_per_second": 7.022, "step": 3000 }, { "epoch": 0.3519220357336221, "grad_norm": 1.0289523601531982, "learning_rate": 8.827287493232269e-05, "loss": 3.7326, "step": 3250 }, { "epoch": 0.3789929615592853, "grad_norm": 0.9764179587364197, "learning_rate": 8.737051073813391e-05, "loss": 3.6939, "step": 3500 }, { "epoch": 0.3789929615592853, "eval_loss": 3.052320718765259, "eval_runtime": 87.6344, "eval_samples_per_second": 112.399, "eval_steps_per_second": 7.029, "step": 3500 }, { "epoch": 0.4060638873849486, "grad_norm": 0.9247903227806091, "learning_rate": 8.646814654394514e-05, "loss": 3.6782, "step": 3750 }, { "epoch": 0.4331348132106118, "grad_norm": 0.9769233465194702, "learning_rate": 8.556578234975636e-05, "loss": 3.6654, "step": 4000 }, { "epoch": 0.4331348132106118, "eval_loss": 3.0321156978607178, "eval_runtime": 87.6462, "eval_samples_per_second": 112.384, "eval_steps_per_second": 7.028, "step": 4000 } ], "logging_steps": 250, "max_steps": 27705, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1444216307712000.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }