{
  "best_metric": 0.8568298816680908,
  "best_model_checkpoint": "mgh6/HTH_nt_MLM/checkpoint-4112",
  "epoch": 16.0,
  "eval_steps": 500,
  "global_step": 4112,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "grad_norm": 0.5239282846450806,
      "learning_rate": 9.832682291666667e-05,
      "loss": 1.5981,
      "step": 257
    },
    {
      "epoch": 1.0,
      "eval_loss": 1.4935803413391113,
      "eval_runtime": 9.2798,
      "eval_samples_per_second": 46.553,
      "eval_steps_per_second": 5.819,
      "step": 257
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.4561195969581604,
      "learning_rate": 9.665364583333334e-05,
      "loss": 1.3936,
      "step": 514
    },
    {
      "epoch": 2.0,
      "eval_loss": 1.3606078624725342,
      "eval_runtime": 9.2843,
      "eval_samples_per_second": 46.53,
      "eval_steps_per_second": 5.816,
      "step": 514
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.5290057063102722,
      "learning_rate": 9.498046875000001e-05,
      "loss": 1.264,
      "step": 771
    },
    {
      "epoch": 3.0,
      "eval_loss": 1.278564214706421,
      "eval_runtime": 9.2918,
      "eval_samples_per_second": 46.493,
      "eval_steps_per_second": 5.812,
      "step": 771
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.4372493624687195,
      "learning_rate": 9.330729166666667e-05,
      "loss": 1.1581,
      "step": 1028
    },
    {
      "epoch": 4.0,
      "eval_loss": 1.1972724199295044,
      "eval_runtime": 9.2934,
      "eval_samples_per_second": 46.485,
      "eval_steps_per_second": 5.811,
      "step": 1028
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.5823390483856201,
      "learning_rate": 9.163411458333334e-05,
      "loss": 1.0762,
      "step": 1285
    },
    {
      "epoch": 5.0,
      "eval_loss": 1.1470884084701538,
      "eval_runtime": 9.2848,
      "eval_samples_per_second": 46.528,
      "eval_steps_per_second": 5.816,
      "step": 1285
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.5049512386322021,
      "learning_rate": 8.996093750000001e-05,
      "loss": 1.0031,
      "step": 1542
    },
    {
      "epoch": 6.0,
      "eval_loss": 1.097914695739746,
      "eval_runtime": 9.2778,
      "eval_samples_per_second": 46.563,
      "eval_steps_per_second": 5.82,
      "step": 1542
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.46150991320610046,
      "learning_rate": 8.828776041666668e-05,
      "loss": 0.9425,
      "step": 1799
    },
    {
      "epoch": 7.0,
      "eval_loss": 1.063740611076355,
      "eval_runtime": 9.2777,
      "eval_samples_per_second": 46.563,
      "eval_steps_per_second": 5.82,
      "step": 1799
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.5542742013931274,
      "learning_rate": 8.661458333333333e-05,
      "loss": 0.8851,
      "step": 2056
    },
    {
      "epoch": 8.0,
      "eval_loss": 1.0174546241760254,
      "eval_runtime": 9.2773,
      "eval_samples_per_second": 46.565,
      "eval_steps_per_second": 5.821,
      "step": 2056
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.5170336961746216,
      "learning_rate": 8.494140625e-05,
      "loss": 0.832,
      "step": 2313
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.9869545102119446,
      "eval_runtime": 9.2784,
      "eval_samples_per_second": 46.56,
      "eval_steps_per_second": 5.82,
      "step": 2313
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.4140383303165436,
      "learning_rate": 8.326822916666667e-05,
      "loss": 0.7883,
      "step": 2570
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.9511386752128601,
      "eval_runtime": 9.2745,
      "eval_samples_per_second": 46.579,
      "eval_steps_per_second": 5.822,
      "step": 2570
    },
    {
      "epoch": 11.0,
      "grad_norm": 0.5148940682411194,
      "learning_rate": 8.159505208333334e-05,
      "loss": 0.7466,
      "step": 2827
    },
    {
      "epoch": 11.0,
      "eval_loss": 0.9429148435592651,
      "eval_runtime": 9.2986,
      "eval_samples_per_second": 46.459,
      "eval_steps_per_second": 5.807,
      "step": 2827
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.4817085564136505,
      "learning_rate": 7.9921875e-05,
      "loss": 0.7099,
      "step": 3084
    },
    {
      "epoch": 12.0,
      "eval_loss": 0.9054032564163208,
      "eval_runtime": 9.2846,
      "eval_samples_per_second": 46.529,
      "eval_steps_per_second": 5.816,
      "step": 3084
    },
    {
      "epoch": 13.0,
      "grad_norm": 0.4569827914237976,
      "learning_rate": 7.824869791666667e-05,
      "loss": 0.6726,
      "step": 3341
    },
    {
      "epoch": 13.0,
      "eval_loss": 0.8854942917823792,
      "eval_runtime": 9.2942,
      "eval_samples_per_second": 46.481,
      "eval_steps_per_second": 5.81,
      "step": 3341
    },
    {
      "epoch": 14.0,
      "grad_norm": 0.5499886274337769,
      "learning_rate": 7.657552083333333e-05,
      "loss": 0.6395,
      "step": 3598
    },
    {
      "epoch": 14.0,
      "eval_loss": 0.8803820013999939,
      "eval_runtime": 9.2888,
      "eval_samples_per_second": 46.508,
      "eval_steps_per_second": 5.813,
      "step": 3598
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.5064253211021423,
      "learning_rate": 7.490234375e-05,
      "loss": 0.6111,
      "step": 3855
    },
    {
      "epoch": 15.0,
      "eval_loss": 0.8705362677574158,
      "eval_runtime": 9.2774,
      "eval_samples_per_second": 46.565,
      "eval_steps_per_second": 5.821,
      "step": 3855
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.5744128227233887,
      "learning_rate": 7.322916666666667e-05,
      "loss": 0.5853,
      "step": 4112
    },
    {
      "epoch": 16.0,
      "eval_loss": 0.8568298816680908,
      "eval_runtime": 9.2831,
      "eval_samples_per_second": 46.536,
      "eval_steps_per_second": 5.817,
      "step": 4112
    }
  ],
  "logging_steps": 500,
  "max_steps": 15360,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 60,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 10,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.909258585589678e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}