{ "best_metric": 0.8568298816680908, "best_model_checkpoint": "mgh6/HTH_nt_MLM/checkpoint-4112", "epoch": 16.0, "eval_steps": 500, "global_step": 4112, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.5239282846450806, "learning_rate": 9.832682291666667e-05, "loss": 1.5981, "step": 257 }, { "epoch": 1.0, "eval_loss": 1.4935803413391113, "eval_runtime": 9.2798, "eval_samples_per_second": 46.553, "eval_steps_per_second": 5.819, "step": 257 }, { "epoch": 2.0, "grad_norm": 0.4561195969581604, "learning_rate": 9.665364583333334e-05, "loss": 1.3936, "step": 514 }, { "epoch": 2.0, "eval_loss": 1.3606078624725342, "eval_runtime": 9.2843, "eval_samples_per_second": 46.53, "eval_steps_per_second": 5.816, "step": 514 }, { "epoch": 3.0, "grad_norm": 0.5290057063102722, "learning_rate": 9.498046875000001e-05, "loss": 1.264, "step": 771 }, { "epoch": 3.0, "eval_loss": 1.278564214706421, "eval_runtime": 9.2918, "eval_samples_per_second": 46.493, "eval_steps_per_second": 5.812, "step": 771 }, { "epoch": 4.0, "grad_norm": 0.4372493624687195, "learning_rate": 9.330729166666667e-05, "loss": 1.1581, "step": 1028 }, { "epoch": 4.0, "eval_loss": 1.1972724199295044, "eval_runtime": 9.2934, "eval_samples_per_second": 46.485, "eval_steps_per_second": 5.811, "step": 1028 }, { "epoch": 5.0, "grad_norm": 0.5823390483856201, "learning_rate": 9.163411458333334e-05, "loss": 1.0762, "step": 1285 }, { "epoch": 5.0, "eval_loss": 1.1470884084701538, "eval_runtime": 9.2848, "eval_samples_per_second": 46.528, "eval_steps_per_second": 5.816, "step": 1285 }, { "epoch": 6.0, "grad_norm": 0.5049512386322021, "learning_rate": 8.996093750000001e-05, "loss": 1.0031, "step": 1542 }, { "epoch": 6.0, "eval_loss": 1.097914695739746, "eval_runtime": 9.2778, "eval_samples_per_second": 46.563, "eval_steps_per_second": 5.82, "step": 1542 }, { "epoch": 7.0, "grad_norm": 0.46150991320610046, "learning_rate": 8.828776041666668e-05, "loss": 0.9425, "step": 1799 }, { "epoch": 7.0, "eval_loss": 1.063740611076355, "eval_runtime": 9.2777, "eval_samples_per_second": 46.563, "eval_steps_per_second": 5.82, "step": 1799 }, { "epoch": 8.0, "grad_norm": 0.5542742013931274, "learning_rate": 8.661458333333333e-05, "loss": 0.8851, "step": 2056 }, { "epoch": 8.0, "eval_loss": 1.0174546241760254, "eval_runtime": 9.2773, "eval_samples_per_second": 46.565, "eval_steps_per_second": 5.821, "step": 2056 }, { "epoch": 9.0, "grad_norm": 0.5170336961746216, "learning_rate": 8.494140625e-05, "loss": 0.832, "step": 2313 }, { "epoch": 9.0, "eval_loss": 0.9869545102119446, "eval_runtime": 9.2784, "eval_samples_per_second": 46.56, "eval_steps_per_second": 5.82, "step": 2313 }, { "epoch": 10.0, "grad_norm": 0.4140383303165436, "learning_rate": 8.326822916666667e-05, "loss": 0.7883, "step": 2570 }, { "epoch": 10.0, "eval_loss": 0.9511386752128601, "eval_runtime": 9.2745, "eval_samples_per_second": 46.579, "eval_steps_per_second": 5.822, "step": 2570 }, { "epoch": 11.0, "grad_norm": 0.5148940682411194, "learning_rate": 8.159505208333334e-05, "loss": 0.7466, "step": 2827 }, { "epoch": 11.0, "eval_loss": 0.9429148435592651, "eval_runtime": 9.2986, "eval_samples_per_second": 46.459, "eval_steps_per_second": 5.807, "step": 2827 }, { "epoch": 12.0, "grad_norm": 0.4817085564136505, "learning_rate": 7.9921875e-05, "loss": 0.7099, "step": 3084 }, { "epoch": 12.0, "eval_loss": 0.9054032564163208, "eval_runtime": 9.2846, "eval_samples_per_second": 46.529, "eval_steps_per_second": 5.816, "step": 3084 }, { "epoch": 13.0, "grad_norm": 0.4569827914237976, "learning_rate": 7.824869791666667e-05, "loss": 0.6726, "step": 3341 }, { "epoch": 13.0, "eval_loss": 0.8854942917823792, "eval_runtime": 9.2942, "eval_samples_per_second": 46.481, "eval_steps_per_second": 5.81, "step": 3341 }, { "epoch": 14.0, "grad_norm": 0.5499886274337769, "learning_rate": 7.657552083333333e-05, "loss": 0.6395, "step": 3598 }, { "epoch": 14.0, "eval_loss": 0.8803820013999939, "eval_runtime": 9.2888, "eval_samples_per_second": 46.508, "eval_steps_per_second": 5.813, "step": 3598 }, { "epoch": 15.0, "grad_norm": 0.5064253211021423, "learning_rate": 7.490234375e-05, "loss": 0.6111, "step": 3855 }, { "epoch": 15.0, "eval_loss": 0.8705362677574158, "eval_runtime": 9.2774, "eval_samples_per_second": 46.565, "eval_steps_per_second": 5.821, "step": 3855 }, { "epoch": 16.0, "grad_norm": 0.5744128227233887, "learning_rate": 7.322916666666667e-05, "loss": 0.5853, "step": 4112 }, { "epoch": 16.0, "eval_loss": 0.8568298816680908, "eval_runtime": 9.2831, "eval_samples_per_second": 46.536, "eval_steps_per_second": 5.817, "step": 4112 } ], "logging_steps": 500, "max_steps": 15360, "num_input_tokens_seen": 0, "num_train_epochs": 60, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 10, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.909258585589678e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }