{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.848, "eval_steps": 500, "global_step": 620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.16, "grad_norm": 53.243560791015625, "learning_rate": 2.580645161290323e-06, "loss": 14.2026, "step": 10 }, { "epoch": 0.32, "grad_norm": 18.361902236938477, "learning_rate": 5.806451612903226e-06, "loss": 6.1329, "step": 20 }, { "epoch": 0.48, "grad_norm": 19.430877685546875, "learning_rate": 9.03225806451613e-06, "loss": 4.058, "step": 30 }, { "epoch": 0.64, "grad_norm": 15.267464637756348, "learning_rate": 1.2258064516129034e-05, "loss": 3.097, "step": 40 }, { "epoch": 0.8, "grad_norm": 9.403186798095703, "learning_rate": 1.5483870967741936e-05, "loss": 2.8274, "step": 50 }, { "epoch": 0.96, "grad_norm": 6.261369228363037, "learning_rate": 1.870967741935484e-05, "loss": 2.4626, "step": 60 }, { "epoch": 1.112, "grad_norm": 5.149415493011475, "learning_rate": 1.999429490929718e-05, "loss": 2.1131, "step": 70 }, { "epoch": 1.272, "grad_norm": 4.141679286956787, "learning_rate": 1.9959454037227215e-05, "loss": 2.0099, "step": 80 }, { "epoch": 1.432, "grad_norm": 4.4521870613098145, "learning_rate": 1.989305206325792e-05, "loss": 1.8846, "step": 90 }, { "epoch": 1.592, "grad_norm": 6.675071716308594, "learning_rate": 1.9795299412524948e-05, "loss": 1.8903, "step": 100 }, { "epoch": 1.752, "grad_norm": 4.7035231590271, "learning_rate": 1.9666505859174462e-05, "loss": 1.7584, "step": 110 }, { "epoch": 1.912, "grad_norm": 6.597540855407715, "learning_rate": 1.9507079544701583e-05, "loss": 1.7185, "step": 120 }, { "epoch": 2.064, "grad_norm": 4.8695759773254395, "learning_rate": 1.9317525684566686e-05, "loss": 1.5598, "step": 130 }, { "epoch": 2.224, "grad_norm": 3.69114351272583, "learning_rate": 1.9098444967188308e-05, "loss": 1.6693, "step": 140 }, { "epoch": 2.384, "grad_norm": 3.190814733505249, "learning_rate": 1.8850531650386154e-05, "loss": 1.5646, "step": 150 }, { "epoch": 2.544, "grad_norm": 3.4081127643585205, "learning_rate": 1.857457136130651e-05, "loss": 1.4366, "step": 160 }, { "epoch": 2.7039999999999997, "grad_norm": 2.457369327545166, "learning_rate": 1.827143860680199e-05, "loss": 1.4132, "step": 170 }, { "epoch": 2.864, "grad_norm": 2.4000205993652344, "learning_rate": 1.7942094002155122e-05, "loss": 1.3689, "step": 180 }, { "epoch": 3.016, "grad_norm": 2.56905198097229, "learning_rate": 1.758758122692791e-05, "loss": 1.2682, "step": 190 }, { "epoch": 3.176, "grad_norm": 2.089385986328125, "learning_rate": 1.7209023717584013e-05, "loss": 1.2686, "step": 200 }, { "epoch": 3.336, "grad_norm": 2.0517237186431885, "learning_rate": 1.6807621107364613e-05, "loss": 1.2347, "step": 210 }, { "epoch": 3.496, "grad_norm": 2.282109260559082, "learning_rate": 1.6384645424699835e-05, "loss": 1.2462, "step": 220 }, { "epoch": 3.656, "grad_norm": 2.201382637023926, "learning_rate": 1.594143706220273e-05, "loss": 1.2536, "step": 230 }, { "epoch": 3.816, "grad_norm": 1.6903492212295532, "learning_rate": 1.5479400529019987e-05, "loss": 1.2723, "step": 240 }, { "epoch": 3.976, "grad_norm": 2.2920656204223633, "learning_rate": 1.5000000000000002e-05, "loss": 1.2036, "step": 250 }, { "epoch": 4.128, "grad_norm": 2.610243320465088, "learning_rate": 1.4504754675782731e-05, "loss": 1.0922, "step": 260 }, { "epoch": 4.288, "grad_norm": 2.494142770767212, "learning_rate": 1.3995233968515105e-05, "loss": 1.0985, "step": 270 }, { "epoch": 4.448, "grad_norm": 1.8717281818389893, "learning_rate": 1.3473052528448203e-05, "loss": 1.1152, "step": 280 }, { "epoch": 4.608, "grad_norm": 2.3531064987182617, "learning_rate": 1.2939865127176771e-05, "loss": 1.1175, "step": 290 }, { "epoch": 4.768, "grad_norm": 2.1707255840301514, "learning_rate": 1.2397361413735785e-05, "loss": 1.119, "step": 300 }, { "epoch": 4.928, "grad_norm": 2.2494473457336426, "learning_rate": 1.1847260560171895e-05, "loss": 1.1175, "step": 310 }, { "epoch": 5.08, "grad_norm": 2.7846431732177734, "learning_rate": 1.1291305813557616e-05, "loss": 1.0205, "step": 320 }, { "epoch": 5.24, "grad_norm": 2.8679280281066895, "learning_rate": 1.0731258971712762e-05, "loss": 1.0235, "step": 330 }, { "epoch": 5.4, "grad_norm": 2.4637234210968018, "learning_rate": 1.0168894800139311e-05, "loss": 1.0103, "step": 340 }, { "epoch": 5.5600000000000005, "grad_norm": 2.420814275741577, "learning_rate": 9.605995407862248e-06, "loss": 0.999, "step": 350 }, { "epoch": 5.72, "grad_norm": 2.7816896438598633, "learning_rate": 9.04434459999902e-06, "loss": 0.9866, "step": 360 }, { "epoch": 5.88, "grad_norm": 3.1777126789093018, "learning_rate": 8.485722224954237e-06, "loss": 1.0309, "step": 370 }, { "epoch": 6.032, "grad_norm": 2.992114782333374, "learning_rate": 7.93189853415293e-06, "loss": 0.9126, "step": 380 }, { "epoch": 6.192, "grad_norm": 2.742065906524658, "learning_rate": 7.384628572186334e-06, "loss": 0.9194, "step": 390 }, { "epoch": 6.352, "grad_norm": 2.3192052841186523, "learning_rate": 6.845646615147445e-06, "loss": 0.907, "step": 400 }, { "epoch": 6.5120000000000005, "grad_norm": 3.0966978073120117, "learning_rate": 6.31666067478113e-06, "loss": 0.9, "step": 410 }, { "epoch": 6.672, "grad_norm": 2.693567991256714, "learning_rate": 5.799347085864851e-06, "loss": 0.8984, "step": 420 }, { "epoch": 6.832, "grad_norm": 2.535724639892578, "learning_rate": 5.295345193972445e-06, "loss": 0.8658, "step": 430 }, { "epoch": 6.992, "grad_norm": 2.9776270389556885, "learning_rate": 4.8062521604551245e-06, "loss": 0.9213, "step": 440 }, { "epoch": 7.144, "grad_norm": 2.5464305877685547, "learning_rate": 4.333617901102592e-06, "loss": 0.7661, "step": 450 }, { "epoch": 7.304, "grad_norm": 2.318953037261963, "learning_rate": 3.878940174523371e-06, "loss": 0.791, "step": 460 }, { "epoch": 7.464, "grad_norm": 3.2819600105285645, "learning_rate": 3.4436598358091577e-06, "loss": 0.8174, "step": 470 }, { "epoch": 7.624, "grad_norm": 2.2935101985931396, "learning_rate": 3.0291562705240107e-06, "loss": 0.8107, "step": 480 }, { "epoch": 7.784, "grad_norm": 3.059847354888916, "learning_rate": 2.6367430234880286e-06, "loss": 0.8199, "step": 490 }, { "epoch": 7.944, "grad_norm": 2.7650320529937744, "learning_rate": 2.2676636362076075e-06, "loss": 0.819, "step": 500 }, { "epoch": 8.096, "grad_norm": 3.0527632236480713, "learning_rate": 1.9230877061433505e-06, "loss": 0.7425, "step": 510 }, { "epoch": 8.256, "grad_norm": 2.6537036895751953, "learning_rate": 1.60410718030361e-06, "loss": 0.731, "step": 520 }, { "epoch": 8.416, "grad_norm": 2.6983063220977783, "learning_rate": 1.3117328949091634e-06, "loss": 0.7354, "step": 530 }, { "epoch": 8.576, "grad_norm": 2.4979090690612793, "learning_rate": 1.0468913720946084e-06, "loss": 0.7442, "step": 540 }, { "epoch": 8.736, "grad_norm": 2.457855463027954, "learning_rate": 8.10421883797694e-07, "loss": 0.7333, "step": 550 }, { "epoch": 8.896, "grad_norm": 2.766944646835327, "learning_rate": 6.030737921409169e-07, "loss": 0.7399, "step": 560 }, { "epoch": 9.048, "grad_norm": 2.2459168434143066, "learning_rate": 4.2550417473364524e-07, "loss": 0.6996, "step": 570 }, { "epoch": 9.208, "grad_norm": 2.7171614170074463, "learning_rate": 2.7827574242009434e-07, "loss": 0.7126, "step": 580 }, { "epoch": 9.368, "grad_norm": 2.6220312118530273, "learning_rate": 1.6185505607171027e-07, "loss": 0.6958, "step": 590 }, { "epoch": 9.528, "grad_norm": 2.0688819885253906, "learning_rate": 7.661104807487607e-08, "loss": 0.7154, "step": 600 }, { "epoch": 9.688, "grad_norm": 2.2390174865722656, "learning_rate": 2.2813853199292745e-08, "loss": 0.7047, "step": 610 }, { "epoch": 9.848, "grad_norm": 2.2094919681549072, "learning_rate": 6.339525519594159e-10, "loss": 0.7091, "step": 620 }, { "epoch": 9.848, "step": 620, "total_flos": 2.117785669819433e+17, "train_loss": 1.5235166057463616, "train_runtime": 30234.6635, "train_samples_per_second": 0.661, "train_steps_per_second": 0.021 } ], "logging_steps": 10, "max_steps": 620, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.117785669819433e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }