{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 6250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 5.986245155334473, "learning_rate": 1.5840000000000002e-06, "loss": 1.9745, "step": 100 }, { "epoch": 0.032, "grad_norm": 5.599625110626221, "learning_rate": 3.1840000000000003e-06, "loss": 1.125, "step": 200 }, { "epoch": 0.048, "grad_norm": 4.610442161560059, "learning_rate": 4.784e-06, "loss": 1.025, "step": 300 }, { "epoch": 0.064, "grad_norm": 5.483785152435303, "learning_rate": 6.384e-06, "loss": 0.9741, "step": 400 }, { "epoch": 0.08, "grad_norm": 4.7527313232421875, "learning_rate": 7.984e-06, "loss": 0.9544, "step": 500 }, { "epoch": 0.096, "grad_norm": 4.325509548187256, "learning_rate": 9.584000000000002e-06, "loss": 0.9134, "step": 600 }, { "epoch": 0.112, "grad_norm": 4.1613898277282715, "learning_rate": 9.995730310237113e-06, "loss": 0.8952, "step": 700 }, { "epoch": 0.128, "grad_norm": 5.373358726501465, "learning_rate": 9.976408726659296e-06, "loss": 0.8752, "step": 800 }, { "epoch": 0.144, "grad_norm": 5.119192600250244, "learning_rate": 9.941568353618064e-06, "loss": 0.8654, "step": 900 }, { "epoch": 0.16, "grad_norm": 3.383653402328491, "learning_rate": 9.891317839828527e-06, "loss": 0.845, "step": 1000 }, { "epoch": 0.176, "grad_norm": 3.9472246170043945, "learning_rate": 9.825813890092639e-06, "loss": 0.8178, "step": 1100 }, { "epoch": 0.192, "grad_norm": 3.940248489379883, "learning_rate": 9.745260776619698e-06, "loss": 0.8142, "step": 1200 }, { "epoch": 0.208, "grad_norm": 3.9536900520324707, "learning_rate": 9.649909702009265e-06, "loss": 0.8028, "step": 1300 }, { "epoch": 0.224, "grad_norm": 4.985702037811279, "learning_rate": 9.54005801588298e-06, "loss": 0.795, "step": 1400 }, { "epoch": 0.24, "grad_norm": 5.107704162597656, "learning_rate": 9.416048287608195e-06, "loss": 0.7805, "step": 1500 }, { "epoch": 0.256, "grad_norm": 3.63683819770813, "learning_rate": 9.27826723800513e-06, "loss": 0.7734, "step": 1600 }, { "epoch": 0.272, "grad_norm": 5.008887767791748, "learning_rate": 9.127144533368956e-06, "loss": 0.7681, "step": 1700 }, { "epoch": 0.288, "grad_norm": 4.119167327880859, "learning_rate": 8.963151445567642e-06, "loss": 0.7479, "step": 1800 }, { "epoch": 0.304, "grad_norm": 4.297443866729736, "learning_rate": 8.786799382394e-06, "loss": 0.7478, "step": 1900 }, { "epoch": 0.32, "grad_norm": 4.435776710510254, "learning_rate": 8.598638292755e-06, "loss": 0.7389, "step": 2000 }, { "epoch": 0.336, "grad_norm": 3.9187700748443604, "learning_rate": 8.399254951671681e-06, "loss": 0.7226, "step": 2100 }, { "epoch": 0.352, "grad_norm": 3.695847988128662, "learning_rate": 8.18927113043791e-06, "loss": 0.7138, "step": 2200 }, { "epoch": 0.368, "grad_norm": 4.540302753448486, "learning_rate": 7.969341657644236e-06, "loss": 0.7126, "step": 2300 }, { "epoch": 0.384, "grad_norm": 4.550365447998047, "learning_rate": 7.740152377113493e-06, "loss": 0.7063, "step": 2400 }, { "epoch": 0.4, "grad_norm": 3.8343756198883057, "learning_rate": 7.5024180091162976e-06, "loss": 0.6911, "step": 2500 }, { "epoch": 0.416, "grad_norm": 4.098830223083496, "learning_rate": 7.256879921536164e-06, "loss": 0.6991, "step": 2600 }, { "epoch": 0.432, "grad_norm": 3.9230875968933105, "learning_rate": 7.004303817934775e-06, "loss": 0.6848, "step": 2700 }, { "epoch": 0.448, "grad_norm": 4.32880163192749, "learning_rate": 6.745477349727154e-06, "loss": 0.6643, "step": 2800 }, { "epoch": 0.464, "grad_norm": 4.100039482116699, "learning_rate": 6.481207659913062e-06, "loss": 0.6791, "step": 2900 }, { "epoch": 0.48, "grad_norm": 4.455363750457764, "learning_rate": 6.212318866024449e-06, "loss": 0.6568, "step": 3000 }, { "epoch": 0.496, "grad_norm": 3.64780330657959, "learning_rate": 5.939649490138305e-06, "loss": 0.6609, "step": 3100 }, { "epoch": 0.512, "grad_norm": 4.5561418533325195, "learning_rate": 5.664049843969348e-06, "loss": 0.6598, "step": 3200 }, { "epoch": 0.528, "grad_norm": 4.1221747398376465, "learning_rate": 5.386379377197056e-06, "loss": 0.6499, "step": 3300 }, { "epoch": 0.544, "grad_norm": 4.2754106521606445, "learning_rate": 5.107503997296225e-06, "loss": 0.6534, "step": 3400 }, { "epoch": 0.56, "grad_norm": 3.418328285217285, "learning_rate": 4.8282933692290665e-06, "loss": 0.6511, "step": 3500 }, { "epoch": 0.576, "grad_norm": 4.334653854370117, "learning_rate": 4.549618203419684e-06, "loss": 0.6388, "step": 3600 }, { "epoch": 0.592, "grad_norm": 3.9164488315582275, "learning_rate": 4.272347540468327e-06, "loss": 0.6327, "step": 3700 }, { "epoch": 0.608, "grad_norm": 4.421480655670166, "learning_rate": 3.997346041072912e-06, "loss": 0.6378, "step": 3800 }, { "epoch": 0.624, "grad_norm": 4.170716762542725, "learning_rate": 3.725471289609174e-06, "loss": 0.6336, "step": 3900 }, { "epoch": 0.64, "grad_norm": 5.108222961425781, "learning_rate": 3.457571119778104e-06, "loss": 0.613, "step": 4000 }, { "epoch": 0.656, "grad_norm": 4.666893005371094, "learning_rate": 3.1944809706606123e-06, "loss": 0.6106, "step": 4100 }, { "epoch": 0.672, "grad_norm": 4.099169731140137, "learning_rate": 2.9370212814244436e-06, "loss": 0.5947, "step": 4200 }, { "epoch": 0.688, "grad_norm": 3.848003625869751, "learning_rate": 2.6859949328079005e-06, "loss": 0.5981, "step": 4300 }, { "epoch": 0.704, "grad_norm": 4.583881378173828, "learning_rate": 2.4421847433590466e-06, "loss": 0.6008, "step": 4400 }, { "epoch": 0.72, "grad_norm": 4.723909378051758, "learning_rate": 2.2063510282382517e-06, "loss": 0.5932, "step": 4500 }, { "epoch": 0.736, "grad_norm": 4.5427045822143555, "learning_rate": 1.979229228196942e-06, "loss": 0.5972, "step": 4600 }, { "epoch": 0.752, "grad_norm": 5.033416748046875, "learning_rate": 1.761527616126475e-06, "loss": 0.5964, "step": 4700 }, { "epoch": 0.768, "grad_norm": 5.443480491638184, "learning_rate": 1.5539250883292078e-06, "loss": 0.589, "step": 4800 }, { "epoch": 0.784, "grad_norm": 4.804011821746826, "learning_rate": 1.3570690473996483e-06, "loss": 0.5812, "step": 4900 }, { "epoch": 0.8, "grad_norm": 5.263527870178223, "learning_rate": 1.1715733833178178e-06, "loss": 0.5747, "step": 5000 }, { "epoch": 0.816, "grad_norm": 5.167060852050781, "learning_rate": 9.98016559050765e-07, "loss": 0.5652, "step": 5100 }, { "epoch": 0.832, "grad_norm": 5.249851226806641, "learning_rate": 8.369398066322049e-07, "loss": 0.5808, "step": 5200 }, { "epoch": 0.848, "grad_norm": 5.08359956741333, "learning_rate": 6.888454393457817e-07, "loss": 0.5656, "step": 5300 }, { "epoch": 0.864, "grad_norm": 4.25005578994751, "learning_rate": 5.541952852753341e-07, "loss": 0.5745, "step": 5400 }, { "epoch": 0.88, "grad_norm": 4.67172908782959, "learning_rate": 4.334092471071194e-07, "loss": 0.5695, "step": 5500 }, { "epoch": 0.896, "grad_norm": 3.975259780883789, "learning_rate": 3.268639926751943e-07, "loss": 0.5632, "step": 5600 }, { "epoch": 0.912, "grad_norm": 4.989463806152344, "learning_rate": 2.3489178033345994e-07, "loss": 0.5807, "step": 5700 }, { "epoch": 0.928, "grad_norm": 4.873440742492676, "learning_rate": 1.5777942281740789e-07, "loss": 0.5645, "step": 5800 }, { "epoch": 0.944, "grad_norm": 4.344705581665039, "learning_rate": 9.576739282673886e-08, "loss": 0.5681, "step": 5900 }, { "epoch": 0.96, "grad_norm": 4.521224498748779, "learning_rate": 4.9049073118072057e-08, "loss": 0.5692, "step": 6000 }, { "epoch": 0.976, "grad_norm": 4.890485763549805, "learning_rate": 1.7770153446302618e-08, "loss": 0.5742, "step": 6100 }, { "epoch": 0.992, "grad_norm": 5.1736626625061035, "learning_rate": 2.0281762352331034e-09, "loss": 0.5735, "step": 6200 } ], "logging_steps": 100, "max_steps": 6250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2582174620450816.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }