{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.896,
  "eval_steps": 500,
  "global_step": 45,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.064,
      "grad_norm": 5.85249391104972,
      "learning_rate": 4.000000000000001e-06,
      "loss": 0.8383,
      "step": 1
    },
    {
      "epoch": 0.128,
      "grad_norm": 5.7880200477972314,
      "learning_rate": 8.000000000000001e-06,
      "loss": 0.8521,
      "step": 2
    },
    {
      "epoch": 0.192,
      "grad_norm": 4.637830956527449,
      "learning_rate": 1.2e-05,
      "loss": 0.8444,
      "step": 3
    },
    {
      "epoch": 0.256,
      "grad_norm": 2.174262328740971,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.7812,
      "step": 4
    },
    {
      "epoch": 0.32,
      "grad_norm": 5.057305436302631,
      "learning_rate": 2e-05,
      "loss": 0.7787,
      "step": 5
    },
    {
      "epoch": 0.384,
      "grad_norm": 8.176173201136002,
      "learning_rate": 1.9969173337331283e-05,
      "loss": 0.8424,
      "step": 6
    },
    {
      "epoch": 0.448,
      "grad_norm": 6.401762925359562,
      "learning_rate": 1.9876883405951378e-05,
      "loss": 0.7736,
      "step": 7
    },
    {
      "epoch": 0.512,
      "grad_norm": 3.2961629781668953,
      "learning_rate": 1.9723699203976768e-05,
      "loss": 0.7289,
      "step": 8
    },
    {
      "epoch": 0.576,
      "grad_norm": 2.581895471280243,
      "learning_rate": 1.9510565162951538e-05,
      "loss": 0.7189,
      "step": 9
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.8876971192011507,
      "learning_rate": 1.9238795325112867e-05,
      "loss": 0.6747,
      "step": 10
    },
    {
      "epoch": 0.704,
      "grad_norm": 1.458550787382178,
      "learning_rate": 1.891006524188368e-05,
      "loss": 0.6609,
      "step": 11
    },
    {
      "epoch": 0.768,
      "grad_norm": 1.3017499709559937,
      "learning_rate": 1.8526401643540924e-05,
      "loss": 0.6662,
      "step": 12
    },
    {
      "epoch": 0.832,
      "grad_norm": 1.384898941850018,
      "learning_rate": 1.8090169943749477e-05,
      "loss": 0.6792,
      "step": 13
    },
    {
      "epoch": 0.896,
      "grad_norm": 1.363683022675478,
      "learning_rate": 1.7604059656000313e-05,
      "loss": 0.637,
      "step": 14
    },
    {
      "epoch": 0.96,
      "grad_norm": 1.0440100554501006,
      "learning_rate": 1.7071067811865477e-05,
      "loss": 0.6611,
      "step": 15
    },
    {
      "epoch": 1.032,
      "grad_norm": 1.4642435617268161,
      "learning_rate": 1.6494480483301836e-05,
      "loss": 0.9129,
      "step": 16
    },
    {
      "epoch": 1.096,
      "grad_norm": 1.2688197311643739,
      "learning_rate": 1.5877852522924733e-05,
      "loss": 0.6914,
      "step": 17
    },
    {
      "epoch": 1.16,
      "grad_norm": 0.8143916637114235,
      "learning_rate": 1.5224985647159489e-05,
      "loss": 0.4865,
      "step": 18
    },
    {
      "epoch": 1.224,
      "grad_norm": 0.9165299977448033,
      "learning_rate": 1.4539904997395468e-05,
      "loss": 0.5731,
      "step": 19
    },
    {
      "epoch": 1.288,
      "grad_norm": 0.7360018502732568,
      "learning_rate": 1.3826834323650899e-05,
      "loss": 0.4969,
      "step": 20
    },
    {
      "epoch": 1.3519999999999999,
      "grad_norm": 0.7602013706953981,
      "learning_rate": 1.3090169943749475e-05,
      "loss": 0.572,
      "step": 21
    },
    {
      "epoch": 1.416,
      "grad_norm": 0.6823276613726659,
      "learning_rate": 1.2334453638559057e-05,
      "loss": 0.5332,
      "step": 22
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.6128440387016976,
      "learning_rate": 1.156434465040231e-05,
      "loss": 0.5485,
      "step": 23
    },
    {
      "epoch": 1.544,
      "grad_norm": 0.6379847198707073,
      "learning_rate": 1.0784590957278452e-05,
      "loss": 0.5301,
      "step": 24
    },
    {
      "epoch": 1.608,
      "grad_norm": 0.6063364000158917,
      "learning_rate": 1e-05,
      "loss": 0.56,
      "step": 25
    },
    {
      "epoch": 1.6720000000000002,
      "grad_norm": 0.598214112142404,
      "learning_rate": 9.215409042721553e-06,
      "loss": 0.5712,
      "step": 26
    },
    {
      "epoch": 1.736,
      "grad_norm": 0.598964734762257,
      "learning_rate": 8.43565534959769e-06,
      "loss": 0.506,
      "step": 27
    },
    {
      "epoch": 1.8,
      "grad_norm": 0.564643443704361,
      "learning_rate": 7.66554636144095e-06,
      "loss": 0.5203,
      "step": 28
    },
    {
      "epoch": 1.8639999999999999,
      "grad_norm": 0.547875294216537,
      "learning_rate": 6.909830056250527e-06,
      "loss": 0.4922,
      "step": 29
    },
    {
      "epoch": 1.928,
      "grad_norm": 0.5824606933429736,
      "learning_rate": 6.173165676349103e-06,
      "loss": 0.5583,
      "step": 30
    },
    {
      "epoch": 1.992,
      "grad_norm": 0.8335341090192849,
      "learning_rate": 5.460095002604533e-06,
      "loss": 0.7596,
      "step": 31
    },
    {
      "epoch": 2.064,
      "grad_norm": 0.5850931076101747,
      "learning_rate": 4.775014352840512e-06,
      "loss": 0.4994,
      "step": 32
    },
    {
      "epoch": 2.128,
      "grad_norm": 0.5107787299774194,
      "learning_rate": 4.12214747707527e-06,
      "loss": 0.455,
      "step": 33
    },
    {
      "epoch": 2.192,
      "grad_norm": 0.465974220692486,
      "learning_rate": 3.505519516698165e-06,
      "loss": 0.473,
      "step": 34
    },
    {
      "epoch": 2.2560000000000002,
      "grad_norm": 0.49096442552776787,
      "learning_rate": 2.9289321881345257e-06,
      "loss": 0.4711,
      "step": 35
    },
    {
      "epoch": 2.32,
      "grad_norm": 0.4545396809474353,
      "learning_rate": 2.395940343999691e-06,
      "loss": 0.5029,
      "step": 36
    },
    {
      "epoch": 2.384,
      "grad_norm": 0.4158603803574914,
      "learning_rate": 1.9098300562505266e-06,
      "loss": 0.4538,
      "step": 37
    },
    {
      "epoch": 2.448,
      "grad_norm": 0.5599429573170525,
      "learning_rate": 1.4735983564590784e-06,
      "loss": 0.4522,
      "step": 38
    },
    {
      "epoch": 2.512,
      "grad_norm": 0.4773395982310906,
      "learning_rate": 1.0899347581163222e-06,
      "loss": 0.4694,
      "step": 39
    },
    {
      "epoch": 2.576,
      "grad_norm": 0.4555133299662401,
      "learning_rate": 7.612046748871327e-07,
      "loss": 0.4794,
      "step": 40
    },
    {
      "epoch": 2.64,
      "grad_norm": 0.43925529257171464,
      "learning_rate": 4.894348370484648e-07,
      "loss": 0.4532,
      "step": 41
    },
    {
      "epoch": 2.7039999999999997,
      "grad_norm": 0.40275165109816563,
      "learning_rate": 2.7630079602323447e-07,
      "loss": 0.483,
      "step": 42
    },
    {
      "epoch": 2.768,
      "grad_norm": 0.45666320321239945,
      "learning_rate": 1.231165940486234e-07,
      "loss": 0.4769,
      "step": 43
    },
    {
      "epoch": 2.832,
      "grad_norm": 0.39135454805689784,
      "learning_rate": 3.082666266872036e-08,
      "loss": 0.4274,
      "step": 44
    },
    {
      "epoch": 2.896,
      "grad_norm": 0.4269794519040765,
      "learning_rate": 0.0,
      "loss": 0.4811,
      "step": 45
    },
    {
      "epoch": 2.896,
      "step": 45,
      "total_flos": 5.682962857616998e+16,
      "train_loss": 0.6006159649954902,
      "train_runtime": 3170.2046,
      "train_samples_per_second": 0.943,
      "train_steps_per_second": 0.014
    }
  ],
  "logging_steps": 1.0,
  "max_steps": 45,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.682962857616998e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}