{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.987202925045704, "eval_steps": 500, "global_step": 204, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.14625228519195613, "grad_norm": 3.582301207210251, "learning_rate": 5e-06, "loss": 0.6264, "step": 10 }, { "epoch": 0.29250457038391225, "grad_norm": 0.8448007859307827, "learning_rate": 5e-06, "loss": 0.5676, "step": 20 }, { "epoch": 0.43875685557586835, "grad_norm": 1.1496603868484656, "learning_rate": 5e-06, "loss": 0.5365, "step": 30 }, { "epoch": 0.5850091407678245, "grad_norm": 0.6501842231450267, "learning_rate": 5e-06, "loss": 0.5288, "step": 40 }, { "epoch": 0.7312614259597806, "grad_norm": 0.6880486176943897, "learning_rate": 5e-06, "loss": 0.5117, "step": 50 }, { "epoch": 0.8775137111517367, "grad_norm": 0.653270772148158, "learning_rate": 5e-06, "loss": 0.5034, "step": 60 }, { "epoch": 0.9945155393053017, "eval_loss": 0.48675259947776794, "eval_runtime": 48.1062, "eval_samples_per_second": 38.249, "eval_steps_per_second": 0.603, "step": 68 }, { "epoch": 1.0255941499085923, "grad_norm": 0.8213019683138277, "learning_rate": 5e-06, "loss": 0.5267, "step": 70 }, { "epoch": 1.1718464351005484, "grad_norm": 0.6316112543570708, "learning_rate": 5e-06, "loss": 0.4427, "step": 80 }, { "epoch": 1.3180987202925045, "grad_norm": 0.5370226018361522, "learning_rate": 5e-06, "loss": 0.4395, "step": 90 }, { "epoch": 1.4643510054844606, "grad_norm": 0.5840998374236711, "learning_rate": 5e-06, "loss": 0.4431, "step": 100 }, { "epoch": 1.610603290676417, "grad_norm": 0.5441743828650639, "learning_rate": 5e-06, "loss": 0.4389, "step": 110 }, { "epoch": 1.7568555758683728, "grad_norm": 0.4999808048028024, "learning_rate": 5e-06, "loss": 0.4379, "step": 120 }, { "epoch": 1.9031078610603291, "grad_norm": 0.611839727056921, "learning_rate": 5e-06, "loss": 0.4325, "step": 130 }, { "epoch": 1.9908592321755028, "eval_loss": 0.46772316098213196, "eval_runtime": 47.4216, "eval_samples_per_second": 38.801, "eval_steps_per_second": 0.612, "step": 136 }, { "epoch": 2.0511882998171846, "grad_norm": 0.8668059984866675, "learning_rate": 5e-06, "loss": 0.4489, "step": 140 }, { "epoch": 2.197440585009141, "grad_norm": 0.6140471225460572, "learning_rate": 5e-06, "loss": 0.3845, "step": 150 }, { "epoch": 2.343692870201097, "grad_norm": 0.6202755918802835, "learning_rate": 5e-06, "loss": 0.3819, "step": 160 }, { "epoch": 2.489945155393053, "grad_norm": 0.6403659423565966, "learning_rate": 5e-06, "loss": 0.3839, "step": 170 }, { "epoch": 2.636197440585009, "grad_norm": 0.604130131999038, "learning_rate": 5e-06, "loss": 0.3867, "step": 180 }, { "epoch": 2.7824497257769654, "grad_norm": 0.592166338429596, "learning_rate": 5e-06, "loss": 0.3815, "step": 190 }, { "epoch": 2.9287020109689212, "grad_norm": 0.47952199046688787, "learning_rate": 5e-06, "loss": 0.3893, "step": 200 }, { "epoch": 2.987202925045704, "eval_loss": 0.46915486454963684, "eval_runtime": 46.4418, "eval_samples_per_second": 39.619, "eval_steps_per_second": 0.624, "step": 204 }, { "epoch": 2.987202925045704, "step": 204, "total_flos": 341498218414080.0, "train_loss": 0.4590076675602034, "train_runtime": 7001.299, "train_samples_per_second": 14.978, "train_steps_per_second": 0.029 } ], "logging_steps": 10, "max_steps": 204, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 341498218414080.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }