| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 5.0, | |
| "eval_steps": 500, | |
| "global_step": 80, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.31746031746031744, | |
| "grad_norm": 3.1160410477431384, | |
| "learning_rate": 2e-05, | |
| "loss": 0.743, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.18026351928710938, | |
| "step": 5, | |
| "valid_targets_mean": 4387.2, | |
| "valid_targets_min": 740 | |
| }, | |
| { | |
| "epoch": 0.6349206349206349, | |
| "grad_norm": 0.9623025342413126, | |
| "learning_rate": 3.998096443163716e-05, | |
| "loss": 0.6426, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.15755076706409454, | |
| "step": 10, | |
| "valid_targets_mean": 3791.7, | |
| "valid_targets_min": 695 | |
| }, | |
| { | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 0.5400417708688194, | |
| "learning_rate": 3.931851652578137e-05, | |
| "loss": 0.5922, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.1631372570991516, | |
| "step": 15, | |
| "valid_targets_mean": 4048.3, | |
| "valid_targets_min": 1151 | |
| }, | |
| { | |
| "epoch": 1.253968253968254, | |
| "grad_norm": 0.4346375139896992, | |
| "learning_rate": 3.774021666356444e-05, | |
| "loss": 0.5513, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.13641172647476196, | |
| "step": 20, | |
| "valid_targets_mean": 3890.5, | |
| "valid_targets_min": 1004 | |
| }, | |
| { | |
| "epoch": 1.5714285714285714, | |
| "grad_norm": 0.3583194659422831, | |
| "learning_rate": 3.532088886237956e-05, | |
| "loss": 0.5268, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.157705619931221, | |
| "step": 25, | |
| "valid_targets_mean": 4477.9, | |
| "valid_targets_min": 656 | |
| }, | |
| { | |
| "epoch": 1.8888888888888888, | |
| "grad_norm": 0.2998700734251974, | |
| "learning_rate": 3.217522858017442e-05, | |
| "loss": 0.5194, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.12757813930511475, | |
| "step": 30, | |
| "valid_targets_mean": 4071.6, | |
| "valid_targets_min": 773 | |
| }, | |
| { | |
| "epoch": 2.1904761904761907, | |
| "grad_norm": 0.354322163099566, | |
| "learning_rate": 2.8452365234813992e-05, | |
| "loss": 0.4999, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.1116921454668045, | |
| "step": 35, | |
| "valid_targets_mean": 3261.8, | |
| "valid_targets_min": 695 | |
| }, | |
| { | |
| "epoch": 2.507936507936508, | |
| "grad_norm": 0.2736861113965854, | |
| "learning_rate": 2.4328792278762058e-05, | |
| "loss": 0.476, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.11490871757268906, | |
| "step": 40, | |
| "valid_targets_mean": 3865.1, | |
| "valid_targets_min": 746 | |
| }, | |
| { | |
| "epoch": 2.825396825396825, | |
| "grad_norm": 0.26810971560154995, | |
| "learning_rate": 2e-05, | |
| "loss": 0.4743, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.09590233117341995, | |
| "step": 45, | |
| "valid_targets_mean": 3056.7, | |
| "valid_targets_min": 726 | |
| }, | |
| { | |
| "epoch": 3.126984126984127, | |
| "grad_norm": 0.26045065338871903, | |
| "learning_rate": 1.5671207721237945e-05, | |
| "loss": 0.4803, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.1086278185248375, | |
| "step": 50, | |
| "valid_targets_mean": 3327.7, | |
| "valid_targets_min": 746 | |
| }, | |
| { | |
| "epoch": 3.4444444444444446, | |
| "grad_norm": 0.2646056254599043, | |
| "learning_rate": 1.1547634765186016e-05, | |
| "loss": 0.4741, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.10994982719421387, | |
| "step": 55, | |
| "valid_targets_mean": 3503.4, | |
| "valid_targets_min": 569 | |
| }, | |
| { | |
| "epoch": 3.761904761904762, | |
| "grad_norm": 0.23864309413170337, | |
| "learning_rate": 7.824771419825588e-06, | |
| "loss": 0.4612, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.09653226286172867, | |
| "step": 60, | |
| "valid_targets_mean": 3644.4, | |
| "valid_targets_min": 1042 | |
| }, | |
| { | |
| "epoch": 4.063492063492063, | |
| "grad_norm": 0.2652889879375569, | |
| "learning_rate": 4.679111137620442e-06, | |
| "loss": 0.4606, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.1475735604763031, | |
| "step": 65, | |
| "valid_targets_mean": 4244.4, | |
| "valid_targets_min": 2068 | |
| }, | |
| { | |
| "epoch": 4.380952380952381, | |
| "grad_norm": 0.28736207202393793, | |
| "learning_rate": 2.259783336435566e-06, | |
| "loss": 0.467, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.10715197026729584, | |
| "step": 70, | |
| "valid_targets_mean": 3558.8, | |
| "valid_targets_min": 733 | |
| }, | |
| { | |
| "epoch": 4.698412698412699, | |
| "grad_norm": 0.26387185108781563, | |
| "learning_rate": 6.814834742186361e-07, | |
| "loss": 0.459, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.10606744885444641, | |
| "step": 75, | |
| "valid_targets_mean": 3355.8, | |
| "valid_targets_min": 761 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.2624385775505968, | |
| "learning_rate": 1.9035568362844037e-08, | |
| "loss": 0.4442, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.11343716084957123, | |
| "step": 80, | |
| "valid_targets_mean": 3113.4, | |
| "valid_targets_min": 587 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "loss_nan_ranks": 0, | |
| "loss_rank_avg": 0.11343716084957123, | |
| "step": 80, | |
| "total_flos": 1.3290499486829773e+17, | |
| "train_loss": 0.5170090794563293, | |
| "train_runtime": 1718.7633, | |
| "train_samples_per_second": 2.909, | |
| "train_steps_per_second": 0.047, | |
| "valid_targets_mean": 3113.4, | |
| "valid_targets_min": 587 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 80, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.3290499486829773e+17, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |