{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.848,
  "eval_steps": 500,
  "global_step": 620,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.16,
      "grad_norm": 53.243560791015625,
      "learning_rate": 2.580645161290323e-06,
      "loss": 14.2026,
      "step": 10
    },
    {
      "epoch": 0.32,
      "grad_norm": 18.361902236938477,
      "learning_rate": 5.806451612903226e-06,
      "loss": 6.1329,
      "step": 20
    },
    {
      "epoch": 0.48,
      "grad_norm": 19.430877685546875,
      "learning_rate": 9.03225806451613e-06,
      "loss": 4.058,
      "step": 30
    },
    {
      "epoch": 0.64,
      "grad_norm": 15.267464637756348,
      "learning_rate": 1.2258064516129034e-05,
      "loss": 3.097,
      "step": 40
    },
    {
      "epoch": 0.8,
      "grad_norm": 9.403186798095703,
      "learning_rate": 1.5483870967741936e-05,
      "loss": 2.8274,
      "step": 50
    },
    {
      "epoch": 0.96,
      "grad_norm": 6.261369228363037,
      "learning_rate": 1.870967741935484e-05,
      "loss": 2.4626,
      "step": 60
    },
    {
      "epoch": 1.112,
      "grad_norm": 5.149415493011475,
      "learning_rate": 1.999429490929718e-05,
      "loss": 2.1131,
      "step": 70
    },
    {
      "epoch": 1.272,
      "grad_norm": 4.141679286956787,
      "learning_rate": 1.9959454037227215e-05,
      "loss": 2.0099,
      "step": 80
    },
    {
      "epoch": 1.432,
      "grad_norm": 4.4521870613098145,
      "learning_rate": 1.989305206325792e-05,
      "loss": 1.8846,
      "step": 90
    },
    {
      "epoch": 1.592,
      "grad_norm": 6.675071716308594,
      "learning_rate": 1.9795299412524948e-05,
      "loss": 1.8903,
      "step": 100
    },
    {
      "epoch": 1.752,
      "grad_norm": 4.7035231590271,
      "learning_rate": 1.9666505859174462e-05,
      "loss": 1.7584,
      "step": 110
    },
    {
      "epoch": 1.912,
      "grad_norm": 6.597540855407715,
      "learning_rate": 1.9507079544701583e-05,
      "loss": 1.7185,
      "step": 120
    },
    {
      "epoch": 2.064,
      "grad_norm": 4.8695759773254395,
      "learning_rate": 1.9317525684566686e-05,
      "loss": 1.5598,
      "step": 130
    },
    {
      "epoch": 2.224,
      "grad_norm": 3.69114351272583,
      "learning_rate": 1.9098444967188308e-05,
      "loss": 1.6693,
      "step": 140
    },
    {
      "epoch": 2.384,
      "grad_norm": 3.190814733505249,
      "learning_rate": 1.8850531650386154e-05,
      "loss": 1.5646,
      "step": 150
    },
    {
      "epoch": 2.544,
      "grad_norm": 3.4081127643585205,
      "learning_rate": 1.857457136130651e-05,
      "loss": 1.4366,
      "step": 160
    },
    {
      "epoch": 2.7039999999999997,
      "grad_norm": 2.457369327545166,
      "learning_rate": 1.827143860680199e-05,
      "loss": 1.4132,
      "step": 170
    },
    {
      "epoch": 2.864,
      "grad_norm": 2.4000205993652344,
      "learning_rate": 1.7942094002155122e-05,
      "loss": 1.3689,
      "step": 180
    },
    {
      "epoch": 3.016,
      "grad_norm": 2.56905198097229,
      "learning_rate": 1.758758122692791e-05,
      "loss": 1.2682,
      "step": 190
    },
    {
      "epoch": 3.176,
      "grad_norm": 2.089385986328125,
      "learning_rate": 1.7209023717584013e-05,
      "loss": 1.2686,
      "step": 200
    },
    {
      "epoch": 3.336,
      "grad_norm": 2.0517237186431885,
      "learning_rate": 1.6807621107364613e-05,
      "loss": 1.2347,
      "step": 210
    },
    {
      "epoch": 3.496,
      "grad_norm": 2.282109260559082,
      "learning_rate": 1.6384645424699835e-05,
      "loss": 1.2462,
      "step": 220
    },
    {
      "epoch": 3.656,
      "grad_norm": 2.201382637023926,
      "learning_rate": 1.594143706220273e-05,
      "loss": 1.2536,
      "step": 230
    },
    {
      "epoch": 3.816,
      "grad_norm": 1.6903492212295532,
      "learning_rate": 1.5479400529019987e-05,
      "loss": 1.2723,
      "step": 240
    },
    {
      "epoch": 3.976,
      "grad_norm": 2.2920656204223633,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 1.2036,
      "step": 250
    },
    {
      "epoch": 4.128,
      "grad_norm": 2.610243320465088,
      "learning_rate": 1.4504754675782731e-05,
      "loss": 1.0922,
      "step": 260
    },
    {
      "epoch": 4.288,
      "grad_norm": 2.494142770767212,
      "learning_rate": 1.3995233968515105e-05,
      "loss": 1.0985,
      "step": 270
    },
    {
      "epoch": 4.448,
      "grad_norm": 1.8717281818389893,
      "learning_rate": 1.3473052528448203e-05,
      "loss": 1.1152,
      "step": 280
    },
    {
      "epoch": 4.608,
      "grad_norm": 2.3531064987182617,
      "learning_rate": 1.2939865127176771e-05,
      "loss": 1.1175,
      "step": 290
    },
    {
      "epoch": 4.768,
      "grad_norm": 2.1707255840301514,
      "learning_rate": 1.2397361413735785e-05,
      "loss": 1.119,
      "step": 300
    },
    {
      "epoch": 4.928,
      "grad_norm": 2.2494473457336426,
      "learning_rate": 1.1847260560171895e-05,
      "loss": 1.1175,
      "step": 310
    },
    {
      "epoch": 5.08,
      "grad_norm": 2.7846431732177734,
      "learning_rate": 1.1291305813557616e-05,
      "loss": 1.0205,
      "step": 320
    },
    {
      "epoch": 5.24,
      "grad_norm": 2.8679280281066895,
      "learning_rate": 1.0731258971712762e-05,
      "loss": 1.0235,
      "step": 330
    },
    {
      "epoch": 5.4,
      "grad_norm": 2.4637234210968018,
      "learning_rate": 1.0168894800139311e-05,
      "loss": 1.0103,
      "step": 340
    },
    {
      "epoch": 5.5600000000000005,
      "grad_norm": 2.420814275741577,
      "learning_rate": 9.605995407862248e-06,
      "loss": 0.999,
      "step": 350
    },
    {
      "epoch": 5.72,
      "grad_norm": 2.7816896438598633,
      "learning_rate": 9.04434459999902e-06,
      "loss": 0.9866,
      "step": 360
    },
    {
      "epoch": 5.88,
      "grad_norm": 3.1777126789093018,
      "learning_rate": 8.485722224954237e-06,
      "loss": 1.0309,
      "step": 370
    },
    {
      "epoch": 6.032,
      "grad_norm": 2.992114782333374,
      "learning_rate": 7.93189853415293e-06,
      "loss": 0.9126,
      "step": 380
    },
    {
      "epoch": 6.192,
      "grad_norm": 2.742065906524658,
      "learning_rate": 7.384628572186334e-06,
      "loss": 0.9194,
      "step": 390
    },
    {
      "epoch": 6.352,
      "grad_norm": 2.3192052841186523,
      "learning_rate": 6.845646615147445e-06,
      "loss": 0.907,
      "step": 400
    },
    {
      "epoch": 6.5120000000000005,
      "grad_norm": 3.0966978073120117,
      "learning_rate": 6.31666067478113e-06,
      "loss": 0.9,
      "step": 410
    },
    {
      "epoch": 6.672,
      "grad_norm": 2.693567991256714,
      "learning_rate": 5.799347085864851e-06,
      "loss": 0.8984,
      "step": 420
    },
    {
      "epoch": 6.832,
      "grad_norm": 2.535724639892578,
      "learning_rate": 5.295345193972445e-06,
      "loss": 0.8658,
      "step": 430
    },
    {
      "epoch": 6.992,
      "grad_norm": 2.9776270389556885,
      "learning_rate": 4.8062521604551245e-06,
      "loss": 0.9213,
      "step": 440
    },
    {
      "epoch": 7.144,
      "grad_norm": 2.5464305877685547,
      "learning_rate": 4.333617901102592e-06,
      "loss": 0.7661,
      "step": 450
    },
    {
      "epoch": 7.304,
      "grad_norm": 2.318953037261963,
      "learning_rate": 3.878940174523371e-06,
      "loss": 0.791,
      "step": 460
    },
    {
      "epoch": 7.464,
      "grad_norm": 3.2819600105285645,
      "learning_rate": 3.4436598358091577e-06,
      "loss": 0.8174,
      "step": 470
    },
    {
      "epoch": 7.624,
      "grad_norm": 2.2935101985931396,
      "learning_rate": 3.0291562705240107e-06,
      "loss": 0.8107,
      "step": 480
    },
    {
      "epoch": 7.784,
      "grad_norm": 3.059847354888916,
      "learning_rate": 2.6367430234880286e-06,
      "loss": 0.8199,
      "step": 490
    },
    {
      "epoch": 7.944,
      "grad_norm": 2.7650320529937744,
      "learning_rate": 2.2676636362076075e-06,
      "loss": 0.819,
      "step": 500
    },
    {
      "epoch": 8.096,
      "grad_norm": 3.0527632236480713,
      "learning_rate": 1.9230877061433505e-06,
      "loss": 0.7425,
      "step": 510
    },
    {
      "epoch": 8.256,
      "grad_norm": 2.6537036895751953,
      "learning_rate": 1.60410718030361e-06,
      "loss": 0.731,
      "step": 520
    },
    {
      "epoch": 8.416,
      "grad_norm": 2.6983063220977783,
      "learning_rate": 1.3117328949091634e-06,
      "loss": 0.7354,
      "step": 530
    },
    {
      "epoch": 8.576,
      "grad_norm": 2.4979090690612793,
      "learning_rate": 1.0468913720946084e-06,
      "loss": 0.7442,
      "step": 540
    },
    {
      "epoch": 8.736,
      "grad_norm": 2.457855463027954,
      "learning_rate": 8.10421883797694e-07,
      "loss": 0.7333,
      "step": 550
    },
    {
      "epoch": 8.896,
      "grad_norm": 2.766944646835327,
      "learning_rate": 6.030737921409169e-07,
      "loss": 0.7399,
      "step": 560
    },
    {
      "epoch": 9.048,
      "grad_norm": 2.2459168434143066,
      "learning_rate": 4.2550417473364524e-07,
      "loss": 0.6996,
      "step": 570
    },
    {
      "epoch": 9.208,
      "grad_norm": 2.7171614170074463,
      "learning_rate": 2.7827574242009434e-07,
      "loss": 0.7126,
      "step": 580
    },
    {
      "epoch": 9.368,
      "grad_norm": 2.6220312118530273,
      "learning_rate": 1.6185505607171027e-07,
      "loss": 0.6958,
      "step": 590
    },
    {
      "epoch": 9.528,
      "grad_norm": 2.0688819885253906,
      "learning_rate": 7.661104807487607e-08,
      "loss": 0.7154,
      "step": 600
    },
    {
      "epoch": 9.688,
      "grad_norm": 2.2390174865722656,
      "learning_rate": 2.2813853199292745e-08,
      "loss": 0.7047,
      "step": 610
    },
    {
      "epoch": 9.848,
      "grad_norm": 2.2094919681549072,
      "learning_rate": 6.339525519594159e-10,
      "loss": 0.7091,
      "step": 620
    },
    {
      "epoch": 9.848,
      "step": 620,
      "total_flos": 2.117785669819433e+17,
      "train_loss": 1.5235166057463616,
      "train_runtime": 30234.6635,
      "train_samples_per_second": 0.661,
      "train_steps_per_second": 0.021
    }
  ],
  "logging_steps": 10,
  "max_steps": 620,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.117785669819433e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}