{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.848,
  "eval_steps": 500,
  "global_step": 620,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.16,
      "grad_norm": 53.243560791015625,
      "learning_rate": 2.580645161290323e-06,
      "loss": 14.2026,
      "step": 10
    },
    {
      "epoch": 0.32,
      "grad_norm": 18.361902236938477,
      "learning_rate": 5.806451612903226e-06,
      "loss": 6.1329,
      "step": 20
    },
    {
      "epoch": 0.48,
      "grad_norm": 19.430877685546875,
      "learning_rate": 9.03225806451613e-06,
      "loss": 4.058,
      "step": 30
    },
    {
      "epoch": 0.64,
      "grad_norm": 15.267464637756348,
      "learning_rate": 1.2258064516129034e-05,
      "loss": 3.097,
      "step": 40
    },
    {
      "epoch": 0.8,
      "grad_norm": 9.403186798095703,
      "learning_rate": 1.5483870967741936e-05,
      "loss": 2.8274,
      "step": 50
    },
    {
      "epoch": 0.96,
      "grad_norm": 6.261369228363037,
      "learning_rate": 1.870967741935484e-05,
      "loss": 2.4626,
      "step": 60
    },
    {
      "epoch": 1.112,
      "grad_norm": 5.149415493011475,
      "learning_rate": 1.999429490929718e-05,
      "loss": 2.1131,
      "step": 70
    },
    {
      "epoch": 1.272,
      "grad_norm": 4.141679286956787,
      "learning_rate": 1.9959454037227215e-05,
      "loss": 2.0099,
      "step": 80
    },
    {
      "epoch": 1.432,
      "grad_norm": 4.4521870613098145,
      "learning_rate": 1.989305206325792e-05,
      "loss": 1.8846,
      "step": 90
    },
    {
      "epoch": 1.592,
      "grad_norm": 6.675071716308594,
      "learning_rate": 1.9795299412524948e-05,
      "loss": 1.8903,
      "step": 100
    },
    {
      "epoch": 1.752,
      "grad_norm": 4.7035231590271,
      "learning_rate": 1.9666505859174462e-05,
      "loss": 1.7584,
      "step": 110
    },
    {
      "epoch": 1.912,
      "grad_norm": 6.597540855407715,
      "learning_rate": 1.9507079544701583e-05,
      "loss": 1.7185,
      "step": 120
    },
    {
      "epoch": 2.064,
      "grad_norm": 4.8695759773254395,
      "learning_rate": 1.9317525684566686e-05,
      "loss": 1.5598,
      "step": 130
    },
    {
      "epoch": 2.224,
      "grad_norm": 3.69114351272583,
      "learning_rate": 1.9098444967188308e-05,
      "loss": 1.6693,
      "step": 140
    },
    {
      "epoch": 2.384,
      "grad_norm": 3.190814733505249,
      "learning_rate": 1.8850531650386154e-05,
      "loss": 1.5646,
      "step": 150
    },
    {
      "epoch": 2.544,
      "grad_norm": 3.4081127643585205,
      "learning_rate": 1.857457136130651e-05,
      "loss": 1.4366,
      "step": 160
    },
    {
      "epoch": 2.7039999999999997,
      "grad_norm": 2.457369327545166,
      "learning_rate": 1.827143860680199e-05,
      "loss": 1.4132,
      "step": 170
    },
    {
      "epoch": 2.864,
      "grad_norm": 2.4000205993652344,
      "learning_rate": 1.7942094002155122e-05,
      "loss": 1.3689,
      "step": 180
    },
    {
      "epoch": 3.016,
      "grad_norm": 2.56905198097229,
      "learning_rate": 1.758758122692791e-05,
      "loss": 1.2682,
      "step": 190
    },
    {
      "epoch": 3.176,
      "grad_norm": 2.089385986328125,
      "learning_rate": 1.7209023717584013e-05,
      "loss": 1.2686,
      "step": 200
    },
    {
      "epoch": 3.336,
      "grad_norm": 2.0517237186431885,
      "learning_rate": 1.6807621107364613e-05,
      "loss": 1.2347,
      "step": 210
    },
    {
      "epoch": 3.496,
      "grad_norm": 2.282109260559082,
      "learning_rate": 1.6384645424699835e-05,
      "loss": 1.2462,
      "step": 220
    },
    {
      "epoch": 3.656,
      "grad_norm": 2.201382637023926,
      "learning_rate": 1.594143706220273e-05,
      "loss": 1.2536,
      "step": 230
    },
    {
      "epoch": 3.816,
      "grad_norm": 1.6903492212295532,
      "learning_rate": 1.5479400529019987e-05,
      "loss": 1.2723,
      "step": 240
    },
    {
      "epoch": 3.976,
      "grad_norm": 2.2920656204223633,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 1.2036,
      "step": 250
    },
    {
      "epoch": 4.128,
      "grad_norm": 2.610243320465088,
      "learning_rate": 1.4504754675782731e-05,
      "loss": 1.0922,
      "step": 260
    },
    {
      "epoch": 4.288,
      "grad_norm": 2.494142770767212,
      "learning_rate": 1.3995233968515105e-05,
      "loss": 1.0985,
      "step": 270
    },
    {
      "epoch": 4.448,
      "grad_norm": 1.8717281818389893,
      "learning_rate": 1.3473052528448203e-05,
      "loss": 1.1152,
      "step": 280
    },
    {
      "epoch": 4.608,
      "grad_norm": 2.3531064987182617,
      "learning_rate": 1.2939865127176771e-05,
      "loss": 1.1175,
      "step": 290
    },
    {
      "epoch": 4.768,
      "grad_norm": 2.1707255840301514,
      "learning_rate": 1.2397361413735785e-05,
      "loss": 1.119,
      "step": 300
    },
    {
      "epoch": 4.928,
      "grad_norm": 2.2494473457336426,
      "learning_rate": 1.1847260560171895e-05,
      "loss": 1.1175,
      "step": 310
    },
    {
      "epoch": 5.08,
      "grad_norm": 2.7846431732177734,
      "learning_rate": 1.1291305813557616e-05,
      "loss": 1.0205,
      "step": 320
    },
    {
      "epoch": 5.24,
      "grad_norm": 2.8679280281066895,
      "learning_rate": 1.0731258971712762e-05,
      "loss": 1.0235,
      "step": 330
    },
    {
      "epoch": 5.4,
      "grad_norm": 2.4637234210968018,
      "learning_rate": 1.0168894800139311e-05,
      "loss": 1.0103,
      "step": 340
    },
    {
      "epoch": 5.5600000000000005,
      "grad_norm": 2.420814275741577,
      "learning_rate": 9.605995407862248e-06,
      "loss": 0.999,
      "step": 350
    },
    {
      "epoch": 5.72,
      "grad_norm": 2.7816896438598633,
      "learning_rate": 9.04434459999902e-06,
      "loss": 0.9866,
      "step": 360
    },
    {
      "epoch": 5.88,
      "grad_norm": 3.1777126789093018,
      "learning_rate": 8.485722224954237e-06,
      "loss": 1.0309,
      "step": 370
    },
    {
      "epoch": 6.032,
      "grad_norm": 2.992114782333374,
      "learning_rate": 7.93189853415293e-06,
      "loss": 0.9126,
      "step": 380
    },
    {
      "epoch": 6.192,
      "grad_norm": 2.742065906524658,
      "learning_rate": 7.384628572186334e-06,
      "loss": 0.9194,
      "step": 390
    },
    {
      "epoch": 6.352,
      "grad_norm": 2.3192052841186523,
      "learning_rate": 6.845646615147445e-06,
      "loss": 0.907,
      "step": 400
    },
    {
      "epoch": 6.5120000000000005,
      "grad_norm": 3.0966978073120117,
      "learning_rate": 6.31666067478113e-06,
      "loss": 0.9,
      "step": 410
    },
    {
      "epoch": 6.672,
      "grad_norm": 2.693567991256714,
      "learning_rate": 5.799347085864851e-06,
      "loss": 0.8984,
      "step": 420
    },
    {
      "epoch": 6.832,
      "grad_norm": 2.535724639892578,
      "learning_rate": 5.295345193972445e-06,
      "loss": 0.8658,
      "step": 430
    },
    {
      "epoch": 6.992,
      "grad_norm": 2.9776270389556885,
      "learning_rate": 4.8062521604551245e-06,
      "loss": 0.9213,
      "step": 440
    },
    {
      "epoch": 7.144,
      "grad_norm": 2.5464305877685547,
      "learning_rate": 4.333617901102592e-06,
      "loss": 0.7661,
      "step": 450
    },
    {
      "epoch": 7.304,
      "grad_norm": 2.318953037261963,
      "learning_rate": 3.878940174523371e-06,
      "loss": 0.791,
      "step": 460
    },
    {
      "epoch": 7.464,
      "grad_norm": 3.2819600105285645,
      "learning_rate": 3.4436598358091577e-06,
      "loss": 0.8174,
      "step": 470
    },
    {
      "epoch": 7.624,
      "grad_norm": 2.2935101985931396,
      "learning_rate": 3.0291562705240107e-06,
      "loss": 0.8107,
      "step": 480
    },
    {
      "epoch": 7.784,
      "grad_norm": 3.059847354888916,
      "learning_rate": 2.6367430234880286e-06,
      "loss": 0.8199,
      "step": 490
    },
    {
      "epoch": 7.944,
      "grad_norm": 2.7650320529937744,
      "learning_rate": 2.2676636362076075e-06,
      "loss": 0.819,
      "step": 500
    },
    {
      "epoch": 8.096,
      "grad_norm": 3.0527632236480713,
      "learning_rate": 1.9230877061433505e-06,
      "loss": 0.7425,
      "step": 510
    },
    {
      "epoch": 8.256,
      "grad_norm": 2.6537036895751953,
      "learning_rate": 1.60410718030361e-06,
      "loss": 0.731,
      "step": 520
    },
    {
      "epoch": 8.416,
      "grad_norm": 2.6983063220977783,
      "learning_rate": 1.3117328949091634e-06,
      "loss": 0.7354,
      "step": 530
    },
    {
      "epoch": 8.576,
      "grad_norm": 2.4979090690612793,
      "learning_rate": 1.0468913720946084e-06,
      "loss": 0.7442,
      "step": 540
    },
    {
      "epoch": 8.736,
      "grad_norm": 2.457855463027954,
      "learning_rate": 8.10421883797694e-07,
      "loss": 0.7333,
      "step": 550
    },
    {
      "epoch": 8.896,
      "grad_norm": 2.766944646835327,
      "learning_rate": 6.030737921409169e-07,
      "loss": 0.7399,
      "step": 560
    },
    {
      "epoch": 9.048,
      "grad_norm": 2.2459168434143066,
      "learning_rate": 4.2550417473364524e-07,
      "loss": 0.6996,
      "step": 570
    },
    {
      "epoch": 9.208,
      "grad_norm": 2.7171614170074463,
      "learning_rate": 2.7827574242009434e-07,
      "loss": 0.7126,
      "step": 580
    },
    {
      "epoch": 9.368,
      "grad_norm": 2.6220312118530273,
      "learning_rate": 1.6185505607171027e-07,
      "loss": 0.6958,
      "step": 590
    },
    {
      "epoch": 9.528,
      "grad_norm": 2.0688819885253906,
      "learning_rate": 7.661104807487607e-08,
      "loss": 0.7154,
      "step": 600
    },
    {
      "epoch": 9.688,
      "grad_norm": 2.2390174865722656,
      "learning_rate": 2.2813853199292745e-08,
      "loss": 0.7047,
      "step": 610
    },
    {
      "epoch": 9.848,
      "grad_norm": 2.2094919681549072,
      "learning_rate": 6.339525519594159e-10,
      "loss": 0.7091,
      "step": 620
    }
  ],
  "logging_steps": 10,
  "max_steps": 620,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.117785669819433e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
|
|