{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06493506493506493, "grad_norm": 2.600665284354585, "learning_rate": 1.25e-05, "loss": 1.6075, "step": 5 }, { "epoch": 0.12987012987012986, "grad_norm": 2.3704504908479493, "learning_rate": 2.5e-05, "loss": 1.4792, "step": 10 }, { "epoch": 0.19480519480519481, "grad_norm": 1.2779795738968422, "learning_rate": 3.7500000000000003e-05, "loss": 1.4115, "step": 15 }, { "epoch": 0.2597402597402597, "grad_norm": 1.267787561996043, "learning_rate": 5e-05, "loss": 1.4018, "step": 20 }, { "epoch": 0.3246753246753247, "grad_norm": 0.9468473252185556, "learning_rate": 4.9979167589800175e-05, "loss": 1.3367, "step": 25 }, { "epoch": 0.38961038961038963, "grad_norm": 0.9893253972313399, "learning_rate": 4.991670893602868e-05, "loss": 1.3354, "step": 30 }, { "epoch": 0.45454545454545453, "grad_norm": 1.0881069970614872, "learning_rate": 4.9812739697734024e-05, "loss": 1.3007, "step": 35 }, { "epoch": 0.5194805194805194, "grad_norm": 1.0194980858529235, "learning_rate": 4.9667452402011365e-05, "loss": 1.2934, "step": 40 }, { "epoch": 0.5844155844155844, "grad_norm": 0.9223444885083613, "learning_rate": 4.94811160874866e-05, "loss": 1.3045, "step": 45 }, { "epoch": 0.6493506493506493, "grad_norm": 0.923065081102753, "learning_rate": 4.925407580611875e-05, "loss": 1.2753, "step": 50 }, { "epoch": 0.7142857142857143, "grad_norm": 1.0718722181581042, "learning_rate": 4.898675198424325e-05, "loss": 1.2756, "step": 55 }, { "epoch": 0.7792207792207793, "grad_norm": 1.052542976749598, "learning_rate": 4.867963964403906e-05, "loss": 1.2911, "step": 60 }, { "epoch": 0.8441558441558441, "grad_norm": 0.8128323214762116, "learning_rate": 4.833330748686162e-05, "loss": 1.2779, "step": 65 }, { "epoch": 0.9090909090909091, "grad_norm": 0.8676636174920215, "learning_rate": 4.794839684013882e-05, "loss": 1.2848, "step": 70 }, { "epoch": 0.974025974025974, "grad_norm": 1.042065531092415, "learning_rate": 4.7525620469780227e-05, "loss": 1.2567, "step": 75 }, { "epoch": 1.0389610389610389, "grad_norm": 0.972909624095194, "learning_rate": 4.7065761260298747e-05, "loss": 1.1721, "step": 80 }, { "epoch": 1.103896103896104, "grad_norm": 1.032654503817413, "learning_rate": 4.6569670765088703e-05, "loss": 1.0906, "step": 85 }, { "epoch": 1.1688311688311688, "grad_norm": 1.0351195035386815, "learning_rate": 4.603826762954497e-05, "loss": 1.1172, "step": 90 }, { "epoch": 1.2337662337662338, "grad_norm": 0.96162795934013, "learning_rate": 4.5472535889943214e-05, "loss": 1.0908, "step": 95 }, { "epoch": 1.2987012987012987, "grad_norm": 0.8351069694923322, "learning_rate": 4.487352315123119e-05, "loss": 1.1043, "step": 100 }, { "epoch": 1.3636363636363638, "grad_norm": 1.0009708201516048, "learning_rate": 4.424233864710562e-05, "loss": 1.0782, "step": 105 }, { "epoch": 1.4285714285714286, "grad_norm": 0.8512417675384085, "learning_rate": 4.3580151185966625e-05, "loss": 1.0982, "step": 110 }, { "epoch": 1.4935064935064934, "grad_norm": 1.0103004383513716, "learning_rate": 4.288818698655374e-05, "loss": 1.0942, "step": 115 }, { "epoch": 1.5584415584415585, "grad_norm": 0.8573478770737569, "learning_rate": 4.216772740727103e-05, "loss": 1.0897, "step": 120 }, { "epoch": 1.6233766233766234, "grad_norm": 0.781876249476916, "learning_rate": 4.142010657340632e-05, "loss": 1.0763, "step": 125 }, { "epoch": 1.6883116883116882, "grad_norm": 0.9579078710565259, "learning_rate": 4.064670890663829e-05, "loss": 1.1087, "step": 130 }, { "epoch": 1.7532467532467533, "grad_norm": 0.7939190983394226, "learning_rate": 3.9848966561406185e-05, "loss": 1.0746, "step": 135 }, { "epoch": 1.8181818181818183, "grad_norm": 1.0342389011696922, "learning_rate": 3.902835677288954e-05, "loss": 1.0677, "step": 140 }, { "epoch": 1.883116883116883, "grad_norm": 0.7587581170618278, "learning_rate": 3.818639912150864e-05, "loss": 1.0916, "step": 145 }, { "epoch": 1.948051948051948, "grad_norm": 0.7978087410640827, "learning_rate": 3.7324652719011446e-05, "loss": 1.0789, "step": 150 }, { "epoch": 2.012987012987013, "grad_norm": 1.1656631051781983, "learning_rate": 3.644471332135751e-05, "loss": 1.0629, "step": 155 }, { "epoch": 2.0779220779220777, "grad_norm": 1.0903515201097143, "learning_rate": 3.554821037374533e-05, "loss": 0.8856, "step": 160 }, { "epoch": 2.142857142857143, "grad_norm": 0.7595990161921076, "learning_rate": 3.463680399325489e-05, "loss": 0.8855, "step": 165 }, { "epoch": 2.207792207792208, "grad_norm": 0.7774624026109681, "learning_rate": 3.371218189469306e-05, "loss": 0.8659, "step": 170 }, { "epoch": 2.2727272727272725, "grad_norm": 0.7461072755462708, "learning_rate": 3.277605626533422e-05, "loss": 0.8772, "step": 175 }, { "epoch": 2.3376623376623376, "grad_norm": 0.7432368434791046, "learning_rate": 3.183016059434367e-05, "loss": 0.8525, "step": 180 }, { "epoch": 2.4025974025974026, "grad_norm": 0.7590366429814798, "learning_rate": 3.0876246462754685e-05, "loss": 0.8742, "step": 185 }, { "epoch": 2.4675324675324677, "grad_norm": 0.7247155849317677, "learning_rate": 2.9916080299943672e-05, "loss": 0.9026, "step": 190 }, { "epoch": 2.5324675324675323, "grad_norm": 0.7310534548444702, "learning_rate": 2.8951440112609623e-05, "loss": 0.8883, "step": 195 }, { "epoch": 2.5974025974025974, "grad_norm": 0.7129005263727524, "learning_rate": 2.7984112192315004e-05, "loss": 0.8736, "step": 200 }, { "epoch": 2.6623376623376624, "grad_norm": 0.7437690616597665, "learning_rate": 2.7015887807685002e-05, "loss": 0.8981, "step": 205 }, { "epoch": 2.7272727272727275, "grad_norm": 0.7385255276801924, "learning_rate": 2.604855988739039e-05, "loss": 0.8895, "step": 210 }, { "epoch": 2.792207792207792, "grad_norm": 0.7144722424573203, "learning_rate": 2.5083919700056337e-05, "loss": 0.8744, "step": 215 }, { "epoch": 2.857142857142857, "grad_norm": 0.7010146967901848, "learning_rate": 2.412375353724532e-05, "loss": 0.896, "step": 220 }, { "epoch": 2.9220779220779223, "grad_norm": 0.7504002323519033, "learning_rate": 2.316983940565633e-05, "loss": 0.8868, "step": 225 }, { "epoch": 2.987012987012987, "grad_norm": 0.7865383039242797, "learning_rate": 2.2223943734665787e-05, "loss": 0.8745, "step": 230 }, { "epoch": 3.051948051948052, "grad_norm": 1.0159335857839107, "learning_rate": 2.128781810530695e-05, "loss": 0.7511, "step": 235 }, { "epoch": 3.116883116883117, "grad_norm": 1.2002393858292175, "learning_rate": 2.0363196006745117e-05, "loss": 0.6809, "step": 240 }, { "epoch": 3.1818181818181817, "grad_norm": 0.7878609155092301, "learning_rate": 1.9451789626254672e-05, "loss": 0.6976, "step": 245 }, { "epoch": 3.2467532467532467, "grad_norm": 0.8016037192685418, "learning_rate": 1.8555286678642496e-05, "loss": 0.6747, "step": 250 }, { "epoch": 3.311688311688312, "grad_norm": 0.7587512144299318, "learning_rate": 1.7675347280988562e-05, "loss": 0.6964, "step": 255 }, { "epoch": 3.3766233766233764, "grad_norm": 0.7356037246911953, "learning_rate": 1.6813600878491376e-05, "loss": 0.6865, "step": 260 }, { "epoch": 3.4415584415584415, "grad_norm": 0.7631434364636613, "learning_rate": 1.597164322711047e-05, "loss": 0.6909, "step": 265 }, { "epoch": 3.5064935064935066, "grad_norm": 0.7131404046904652, "learning_rate": 1.5151033438593826e-05, "loss": 0.6934, "step": 270 }, { "epoch": 3.571428571428571, "grad_norm": 0.711551265262125, "learning_rate": 1.4353291093361709e-05, "loss": 0.6973, "step": 275 }, { "epoch": 3.6363636363636362, "grad_norm": 0.705346898473505, "learning_rate": 1.3579893426593681e-05, "loss": 0.68, "step": 280 }, { "epoch": 3.7012987012987013, "grad_norm": 0.7118443145611075, "learning_rate": 1.2832272592728966e-05, "loss": 0.6904, "step": 285 }, { "epoch": 3.7662337662337664, "grad_norm": 0.7427739908839855, "learning_rate": 1.211181301344627e-05, "loss": 0.6793, "step": 290 }, { "epoch": 3.8311688311688314, "grad_norm": 0.6999298394629496, "learning_rate": 1.141984881403338e-05, "loss": 0.6876, "step": 295 }, { "epoch": 3.896103896103896, "grad_norm": 0.7120263608160156, "learning_rate": 1.0757661352894394e-05, "loss": 0.6893, "step": 300 }, { "epoch": 3.961038961038961, "grad_norm": 0.7144317661772965, "learning_rate": 1.0126476848768805e-05, "loss": 0.6789, "step": 305 }, { "epoch": 4.025974025974026, "grad_norm": 1.063865329311315, "learning_rate": 9.527464110056795e-06, "loss": 0.6269, "step": 310 }, { "epoch": 4.090909090909091, "grad_norm": 1.2451272279185113, "learning_rate": 8.961732370455032e-06, "loss": 0.5587, "step": 315 }, { "epoch": 4.1558441558441555, "grad_norm": 0.8874131926210624, "learning_rate": 8.430329234911305e-06, "loss": 0.5358, "step": 320 }, { "epoch": 4.220779220779221, "grad_norm": 0.8464865813880585, "learning_rate": 7.934238739701252e-06, "loss": 0.5414, "step": 325 }, { "epoch": 4.285714285714286, "grad_norm": 0.8425143948713799, "learning_rate": 7.4743795302197754e-06, "loss": 0.5494, "step": 330 }, { "epoch": 4.35064935064935, "grad_norm": 0.7956269077359945, "learning_rate": 7.051603159861185e-06, "loss": 0.5434, "step": 335 }, { "epoch": 4.415584415584416, "grad_norm": 0.7869754406789251, "learning_rate": 6.66669251313838e-06, "loss": 0.5545, "step": 340 }, { "epoch": 4.48051948051948, "grad_norm": 0.7360290448864131, "learning_rate": 6.320360355960941e-06, "loss": 0.5292, "step": 345 }, { "epoch": 4.545454545454545, "grad_norm": 0.7623311010292727, "learning_rate": 6.013248015756759e-06, "loss": 0.547, "step": 350 }, { "epoch": 4.6103896103896105, "grad_norm": 0.7371275230206066, "learning_rate": 5.745924193881257e-06, "loss": 0.5501, "step": 355 }, { "epoch": 4.675324675324675, "grad_norm": 0.7480895176553981, "learning_rate": 5.518883912513413e-06, "loss": 0.5357, "step": 360 }, { "epoch": 4.740259740259741, "grad_norm": 0.7362818934957748, "learning_rate": 5.332547597988636e-06, "loss": 0.5451, "step": 365 }, { "epoch": 4.805194805194805, "grad_norm": 0.7678966619355423, "learning_rate": 5.1872603022659765e-06, "loss": 0.5297, "step": 370 }, { "epoch": 4.87012987012987, "grad_norm": 0.7535710804010259, "learning_rate": 5.083291063971324e-06, "loss": 0.5356, "step": 375 }, { "epoch": 4.935064935064935, "grad_norm": 0.7578922717450597, "learning_rate": 5.020832410199826e-06, "loss": 0.5347, "step": 380 }, { "epoch": 5.0, "grad_norm": 0.7276287792812202, "learning_rate": 5e-06, "loss": 0.5505, "step": 385 }, { "epoch": 5.0, "step": 385, "total_flos": 43871077662720.0, "train_loss": 0.9090226414915803, "train_runtime": 2127.6426, "train_samples_per_second": 23.091, "train_steps_per_second": 0.181 } ], "logging_steps": 5, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 43871077662720.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }