{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 450,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.19,
      "grad_norm": 8.953761100769043,
      "learning_rate": 1.888888888888889e-05,
      "loss": 1.3623,
      "step": 17
    },
    {
      "epoch": 0.38,
      "grad_norm": 10.632478713989258,
      "learning_rate": 3.777777777777778e-05,
      "loss": 1.0225,
      "step": 34
    },
    {
      "epoch": 0.57,
      "grad_norm": 16.316967010498047,
      "learning_rate": 4.925925925925926e-05,
      "loss": 0.6166,
      "step": 51
    },
    {
      "epoch": 0.76,
      "grad_norm": 21.347354888916016,
      "learning_rate": 4.7160493827160495e-05,
      "loss": 0.5825,
      "step": 68
    },
    {
      "epoch": 0.94,
      "grad_norm": 8.19096565246582,
      "learning_rate": 4.506172839506173e-05,
      "loss": 0.4259,
      "step": 85
    },
    {
      "epoch": 1.13,
      "grad_norm": 12.711194038391113,
      "learning_rate": 4.296296296296296e-05,
      "loss": 0.2901,
      "step": 102
    },
    {
      "epoch": 1.32,
      "grad_norm": 18.733543395996094,
      "learning_rate": 4.0864197530864204e-05,
      "loss": 0.2684,
      "step": 119
    },
    {
      "epoch": 1.51,
      "grad_norm": 6.416273593902588,
      "learning_rate": 3.876543209876544e-05,
      "loss": 0.2797,
      "step": 136
    },
    {
      "epoch": 1.7,
      "grad_norm": 4.896417617797852,
      "learning_rate": 3.6666666666666666e-05,
      "loss": 0.3023,
      "step": 153
    },
    {
      "epoch": 1.89,
      "grad_norm": 17.5325984954834,
      "learning_rate": 3.45679012345679e-05,
      "loss": 0.2397,
      "step": 170
    },
    {
      "epoch": 2.08,
      "grad_norm": 9.514878273010254,
      "learning_rate": 3.2469135802469134e-05,
      "loss": 0.2499,
      "step": 187
    },
    {
      "epoch": 2.27,
      "grad_norm": 6.783230304718018,
      "learning_rate": 3.037037037037037e-05,
      "loss": 0.2137,
      "step": 204
    },
    {
      "epoch": 2.46,
      "grad_norm": 9.813042640686035,
      "learning_rate": 2.8271604938271606e-05,
      "loss": 0.1946,
      "step": 221
    },
    {
      "epoch": 2.64,
      "grad_norm": 4.386660099029541,
      "learning_rate": 2.617283950617284e-05,
      "loss": 0.2326,
      "step": 238
    },
    {
      "epoch": 2.83,
      "grad_norm": 13.430244445800781,
      "learning_rate": 2.4074074074074074e-05,
      "loss": 0.1931,
      "step": 255
    },
    {
      "epoch": 3.02,
      "grad_norm": 10.026385307312012,
      "learning_rate": 2.1975308641975308e-05,
      "loss": 0.1584,
      "step": 272
    },
    {
      "epoch": 3.21,
      "grad_norm": 5.96959924697876,
      "learning_rate": 1.9876543209876546e-05,
      "loss": 0.1612,
      "step": 289
    },
    {
      "epoch": 3.4,
      "grad_norm": 8.849898338317871,
      "learning_rate": 1.777777777777778e-05,
      "loss": 0.1843,
      "step": 306
    },
    {
      "epoch": 3.59,
      "grad_norm": 4.787600517272949,
      "learning_rate": 1.5679012345679014e-05,
      "loss": 0.1395,
      "step": 323
    },
    {
      "epoch": 3.78,
      "grad_norm": 5.98640251159668,
      "learning_rate": 1.3580246913580247e-05,
      "loss": 0.1351,
      "step": 340
    },
    {
      "epoch": 3.97,
      "grad_norm": 8.119280815124512,
      "learning_rate": 1.1481481481481482e-05,
      "loss": 0.1626,
      "step": 357
    },
    {
      "epoch": 4.16,
      "grad_norm": 7.947086334228516,
      "learning_rate": 9.382716049382717e-06,
      "loss": 0.1658,
      "step": 374
    },
    {
      "epoch": 4.34,
      "grad_norm": 2.4981260299682617,
      "learning_rate": 7.283950617283951e-06,
      "loss": 0.1147,
      "step": 391
    },
    {
      "epoch": 4.53,
      "grad_norm": 5.954956531524658,
      "learning_rate": 5.185185185185185e-06,
      "loss": 0.1295,
      "step": 408
    },
    {
      "epoch": 4.72,
      "grad_norm": 9.320396423339844,
      "learning_rate": 3.0864197530864196e-06,
      "loss": 0.1309,
      "step": 425
    },
    {
      "epoch": 4.91,
      "grad_norm": 9.413307189941406,
      "learning_rate": 9.876543209876544e-07,
      "loss": 0.095,
      "step": 442
    }
  ],
  "logging_steps": 17,
  "max_steps": 450,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "total_flos": 1.1242852922068992e+18,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}