{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.896, "eval_steps": 500, "global_step": 45, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.064, "grad_norm": 5.85249391104972, "learning_rate": 4.000000000000001e-06, "loss": 0.8383, "step": 1 }, { "epoch": 0.128, "grad_norm": 5.7880200477972314, "learning_rate": 8.000000000000001e-06, "loss": 0.8521, "step": 2 }, { "epoch": 0.192, "grad_norm": 4.637830956527449, "learning_rate": 1.2e-05, "loss": 0.8444, "step": 3 }, { "epoch": 0.256, "grad_norm": 2.174262328740971, "learning_rate": 1.6000000000000003e-05, "loss": 0.7812, "step": 4 }, { "epoch": 0.32, "grad_norm": 5.057305436302631, "learning_rate": 2e-05, "loss": 0.7787, "step": 5 }, { "epoch": 0.384, "grad_norm": 8.176173201136002, "learning_rate": 1.9969173337331283e-05, "loss": 0.8424, "step": 6 }, { "epoch": 0.448, "grad_norm": 6.401762925359562, "learning_rate": 1.9876883405951378e-05, "loss": 0.7736, "step": 7 }, { "epoch": 0.512, "grad_norm": 3.2961629781668953, "learning_rate": 1.9723699203976768e-05, "loss": 0.7289, "step": 8 }, { "epoch": 0.576, "grad_norm": 2.581895471280243, "learning_rate": 1.9510565162951538e-05, "loss": 0.7189, "step": 9 }, { "epoch": 0.64, "grad_norm": 1.8876971192011507, "learning_rate": 1.9238795325112867e-05, "loss": 0.6747, "step": 10 }, { "epoch": 0.704, "grad_norm": 1.458550787382178, "learning_rate": 1.891006524188368e-05, "loss": 0.6609, "step": 11 }, { "epoch": 0.768, "grad_norm": 1.3017499709559937, "learning_rate": 1.8526401643540924e-05, "loss": 0.6662, "step": 12 }, { "epoch": 0.832, "grad_norm": 1.384898941850018, "learning_rate": 1.8090169943749477e-05, "loss": 0.6792, "step": 13 }, { "epoch": 0.896, "grad_norm": 1.363683022675478, "learning_rate": 1.7604059656000313e-05, "loss": 0.637, "step": 14 }, { "epoch": 0.96, "grad_norm": 1.0440100554501006, "learning_rate": 1.7071067811865477e-05, "loss": 0.6611, "step": 15 }, { "epoch": 1.032, "grad_norm": 1.4642435617268161, "learning_rate": 1.6494480483301836e-05, "loss": 0.9129, "step": 16 }, { "epoch": 1.096, "grad_norm": 1.2688197311643739, "learning_rate": 1.5877852522924733e-05, "loss": 0.6914, "step": 17 }, { "epoch": 1.16, "grad_norm": 0.8143916637114235, "learning_rate": 1.5224985647159489e-05, "loss": 0.4865, "step": 18 }, { "epoch": 1.224, "grad_norm": 0.9165299977448033, "learning_rate": 1.4539904997395468e-05, "loss": 0.5731, "step": 19 }, { "epoch": 1.288, "grad_norm": 0.7360018502732568, "learning_rate": 1.3826834323650899e-05, "loss": 0.4969, "step": 20 }, { "epoch": 1.3519999999999999, "grad_norm": 0.7602013706953981, "learning_rate": 1.3090169943749475e-05, "loss": 0.572, "step": 21 }, { "epoch": 1.416, "grad_norm": 0.6823276613726659, "learning_rate": 1.2334453638559057e-05, "loss": 0.5332, "step": 22 }, { "epoch": 1.48, "grad_norm": 0.6128440387016976, "learning_rate": 1.156434465040231e-05, "loss": 0.5485, "step": 23 }, { "epoch": 1.544, "grad_norm": 0.6379847198707073, "learning_rate": 1.0784590957278452e-05, "loss": 0.5301, "step": 24 }, { "epoch": 1.608, "grad_norm": 0.6063364000158917, "learning_rate": 1e-05, "loss": 0.56, "step": 25 }, { "epoch": 1.6720000000000002, "grad_norm": 0.598214112142404, "learning_rate": 9.215409042721553e-06, "loss": 0.5712, "step": 26 }, { "epoch": 1.736, "grad_norm": 0.598964734762257, "learning_rate": 8.43565534959769e-06, "loss": 0.506, "step": 27 }, { "epoch": 1.8, "grad_norm": 0.564643443704361, "learning_rate": 7.66554636144095e-06, "loss": 0.5203, "step": 28 }, { "epoch": 1.8639999999999999, "grad_norm": 0.547875294216537, "learning_rate": 6.909830056250527e-06, "loss": 0.4922, "step": 29 }, { "epoch": 1.928, "grad_norm": 0.5824606933429736, "learning_rate": 6.173165676349103e-06, "loss": 0.5583, "step": 30 }, { "epoch": 1.992, "grad_norm": 0.8335341090192849, "learning_rate": 5.460095002604533e-06, "loss": 0.7596, "step": 31 }, { "epoch": 2.064, "grad_norm": 0.5850931076101747, "learning_rate": 4.775014352840512e-06, "loss": 0.4994, "step": 32 }, { "epoch": 2.128, "grad_norm": 0.5107787299774194, "learning_rate": 4.12214747707527e-06, "loss": 0.455, "step": 33 }, { "epoch": 2.192, "grad_norm": 0.465974220692486, "learning_rate": 3.505519516698165e-06, "loss": 0.473, "step": 34 }, { "epoch": 2.2560000000000002, "grad_norm": 0.49096442552776787, "learning_rate": 2.9289321881345257e-06, "loss": 0.4711, "step": 35 }, { "epoch": 2.32, "grad_norm": 0.4545396809474353, "learning_rate": 2.395940343999691e-06, "loss": 0.5029, "step": 36 }, { "epoch": 2.384, "grad_norm": 0.4158603803574914, "learning_rate": 1.9098300562505266e-06, "loss": 0.4538, "step": 37 }, { "epoch": 2.448, "grad_norm": 0.5599429573170525, "learning_rate": 1.4735983564590784e-06, "loss": 0.4522, "step": 38 }, { "epoch": 2.512, "grad_norm": 0.4773395982310906, "learning_rate": 1.0899347581163222e-06, "loss": 0.4694, "step": 39 }, { "epoch": 2.576, "grad_norm": 0.4555133299662401, "learning_rate": 7.612046748871327e-07, "loss": 0.4794, "step": 40 }, { "epoch": 2.64, "grad_norm": 0.43925529257171464, "learning_rate": 4.894348370484648e-07, "loss": 0.4532, "step": 41 }, { "epoch": 2.7039999999999997, "grad_norm": 0.40275165109816563, "learning_rate": 2.7630079602323447e-07, "loss": 0.483, "step": 42 }, { "epoch": 2.768, "grad_norm": 0.45666320321239945, "learning_rate": 1.231165940486234e-07, "loss": 0.4769, "step": 43 }, { "epoch": 2.832, "grad_norm": 0.39135454805689784, "learning_rate": 3.082666266872036e-08, "loss": 0.4274, "step": 44 }, { "epoch": 2.896, "grad_norm": 0.4269794519040765, "learning_rate": 0.0, "loss": 0.4811, "step": 45 }, { "epoch": 2.896, "step": 45, "total_flos": 5.682962857616998e+16, "train_loss": 0.6006159649954902, "train_runtime": 3170.2046, "train_samples_per_second": 0.943, "train_steps_per_second": 0.014 } ], "logging_steps": 1.0, "max_steps": 45, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.682962857616998e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }