{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0,
  "eval_steps": 500,
  "global_step": 10695,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.2,
      "grad_norm": 34.351593017578125,
      "learning_rate": 3.327102803738318e-05,
      "loss": 1.2122,
      "step": 712
    },
    {
      "epoch": 0.4,
      "grad_norm": 10.322906494140625,
      "learning_rate": 4.816103896103896e-05,
      "loss": 0.8692,
      "step": 1424
    },
    {
      "epoch": 0.6,
      "grad_norm": 23.850841522216797,
      "learning_rate": 4.446233766233767e-05,
      "loss": 0.747,
      "step": 2136
    },
    {
      "epoch": 0.8,
      "grad_norm": 26.398853302001953,
      "learning_rate": 4.076363636363636e-05,
      "loss": 0.6957,
      "step": 2848
    },
    {
      "epoch": 1.0,
      "grad_norm": 39.551414489746094,
      "learning_rate": 3.706493506493507e-05,
      "loss": 0.6323,
      "step": 3560
    },
    {
      "epoch": 1.2,
      "grad_norm": 20.38348388671875,
      "learning_rate": 3.3366233766233766e-05,
      "loss": 0.5724,
      "step": 4272
    },
    {
      "epoch": 1.4,
      "grad_norm": 18.263744354248047,
      "learning_rate": 2.9667532467532467e-05,
      "loss": 0.5256,
      "step": 4984
    },
    {
      "epoch": 1.6,
      "grad_norm": 7.960543632507324,
      "learning_rate": 2.596883116883117e-05,
      "loss": 0.522,
      "step": 5696
    },
    {
      "epoch": 1.8,
      "grad_norm": 5.682173252105713,
      "learning_rate": 2.227012987012987e-05,
      "loss": 0.4766,
      "step": 6408
    },
    {
      "epoch": 2.0,
      "grad_norm": 7.151096820831299,
      "learning_rate": 1.8571428571428572e-05,
      "loss": 0.466,
      "step": 7120
    },
    {
      "epoch": 2.2,
      "grad_norm": 48.665584564208984,
      "learning_rate": 1.4872727272727275e-05,
      "loss": 0.3953,
      "step": 7832
    },
    {
      "epoch": 2.4,
      "grad_norm": 3.8058860301971436,
      "learning_rate": 1.1174025974025975e-05,
      "loss": 0.3673,
      "step": 8544
    },
    {
      "epoch": 2.6,
      "grad_norm": 17.926410675048828,
      "learning_rate": 7.475324675324675e-06,
      "loss": 0.3625,
      "step": 9256
    },
    {
      "epoch": 2.8,
      "grad_norm": 25.310287475585938,
      "learning_rate": 3.776623376623377e-06,
      "loss": 0.348,
      "step": 9968
    },
    {
      "epoch": 3.0,
      "grad_norm": 18.61994743347168,
      "learning_rate": 7.792207792207792e-08,
      "loss": 0.3617,
      "step": 10680
    }
  ],
  "logging_steps": 712,
  "max_steps": 10695,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "total_flos": 6.702780767795675e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}