{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025, "grad_norm": 5.625, "learning_rate": 8.572335756456368e-06, "loss": 0.2415, "step": 5 }, { "epoch": 0.05, "grad_norm": 3.140625, "learning_rate": 1.9287755452026826e-05, "loss": 0.1367, "step": 10 }, { "epoch": 0.075, "grad_norm": 2.859375, "learning_rate": 3.000317514759729e-05, "loss": 0.1025, "step": 15 }, { "epoch": 0.1, "grad_norm": 2.21875, "learning_rate": 4.071859484316775e-05, "loss": 0.0911, "step": 20 }, { "epoch": 0.125, "grad_norm": 1.6953125, "learning_rate": 5.143401453873821e-05, "loss": 0.0852, "step": 25 }, { "epoch": 0.15, "grad_norm": 3.765625, "learning_rate": 6.214943423430867e-05, "loss": 0.0845, "step": 30 }, { "epoch": 0.175, "grad_norm": 0.80078125, "learning_rate": 7.286485392987913e-05, "loss": 0.0843, "step": 35 }, { "epoch": 0.2, "grad_norm": 1.515625, "learning_rate": 7.500098100637213e-05, "loss": 0.0861, "step": 40 }, { "epoch": 0.225, "grad_norm": 0.82421875, "learning_rate": 7.497272464974502e-05, "loss": 0.0792, "step": 45 }, { "epoch": 0.25, "grad_norm": 0.78515625, "learning_rate": 7.492275581730845e-05, "loss": 0.0796, "step": 50 }, { "epoch": 0.275, "grad_norm": 0.68359375, "learning_rate": 7.485111312922398e-05, "loss": 0.077, "step": 55 }, { "epoch": 0.3, "grad_norm": 0.56640625, "learning_rate": 7.475785195705139e-05, "loss": 0.0763, "step": 60 }, { "epoch": 0.325, "grad_norm": 0.69921875, "learning_rate": 7.464304438095277e-05, "loss": 0.0754, "step": 65 }, { "epoch": 0.35, "grad_norm": 0.609375, "learning_rate": 7.450677913398279e-05, "loss": 0.0749, "step": 70 }, { "epoch": 0.375, "grad_norm": 0.46484375, "learning_rate": 7.434916153350836e-05, "loss": 0.0744, "step": 75 }, { "epoch": 0.4, "grad_norm": 0.81640625, "learning_rate": 7.417031339981032e-05, "loss": 0.0784, "step": 80 }, { "epoch": 0.425, "grad_norm": 0.57421875, "learning_rate": 7.397037296193046e-05, "loss": 0.0773, "step": 85 }, { "epoch": 0.45, "grad_norm": 0.55078125, "learning_rate": 7.374949475083626e-05, "loss": 0.0744, "step": 90 }, { "epoch": 0.475, "grad_norm": 0.53125, "learning_rate": 7.350784947998634e-05, "loss": 0.0736, "step": 95 }, { "epoch": 0.5, "grad_norm": 0.51953125, "learning_rate": 7.324562391338845e-05, "loss": 0.0729, "step": 100 }, { "epoch": 0.525, "grad_norm": 0.578125, "learning_rate": 7.296302072125253e-05, "loss": 0.0735, "step": 105 }, { "epoch": 0.55, "grad_norm": 0.75, "learning_rate": 7.266025832334978e-05, "loss": 0.0738, "step": 110 }, { "epoch": 0.575, "grad_norm": 0.5390625, "learning_rate": 7.233757072019941e-05, "loss": 0.0735, "step": 115 }, { "epoch": 0.6, "grad_norm": 0.59375, "learning_rate": 7.19952073122131e-05, "loss": 0.0715, "step": 120 }, { "epoch": 0.625, "grad_norm": 0.671875, "learning_rate": 7.163343270693716e-05, "loss": 0.0696, "step": 125 }, { "epoch": 0.65, "grad_norm": 0.5, "learning_rate": 7.125252651454133e-05, "loss": 0.0697, "step": 130 }, { "epoch": 0.675, "grad_norm": 0.66015625, "learning_rate": 7.085278313171226e-05, "loss": 0.0697, "step": 135 }, { "epoch": 0.7, "grad_norm": 0.56640625, "learning_rate": 7.043451151411875e-05, "loss": 0.0677, "step": 140 }, { "epoch": 0.725, "grad_norm": 0.53125, "learning_rate": 6.999803493762452e-05, "loss": 0.0711, "step": 145 }, { "epoch": 0.75, "grad_norm": 0.55078125, "learning_rate": 6.954369074843315e-05, "loss": 0.0712, "step": 150 }, { "epoch": 0.775, "grad_norm": 0.515625, "learning_rate": 6.907183010235823e-05, "loss": 0.0715, "step": 155 }, { "epoch": 0.8, "grad_norm": 0.453125, "learning_rate": 6.858281769342038e-05, "loss": 0.0715, "step": 160 }, { "epoch": 0.825, "grad_norm": 0.482421875, "learning_rate": 6.807703147198056e-05, "loss": 0.0699, "step": 165 }, { "epoch": 0.85, "grad_norm": 0.50390625, "learning_rate": 6.755486235262808e-05, "loss": 0.0686, "step": 170 }, { "epoch": 0.875, "grad_norm": 0.515625, "learning_rate": 6.701671391204843e-05, "loss": 0.0681, "step": 175 }, { "epoch": 0.9, "grad_norm": 0.66796875, "learning_rate": 6.646300207710494e-05, "loss": 0.0683, "step": 180 }, { "epoch": 0.925, "grad_norm": 0.44921875, "learning_rate": 6.589415480337521e-05, "loss": 0.0663, "step": 185 }, { "epoch": 0.95, "grad_norm": 0.412109375, "learning_rate": 6.531061174439061e-05, "loss": 0.0685, "step": 190 }, { "epoch": 0.975, "grad_norm": 0.38671875, "learning_rate": 6.471282391183463e-05, "loss": 0.0662, "step": 195 }, { "epoch": 1.0, "grad_norm": 0.42578125, "learning_rate": 6.410125332696272e-05, "loss": 0.0675, "step": 200 }, { "epoch": 1.0, "eval_loss": 0.06526587158441544, "eval_runtime": 1.0744, "eval_samples_per_second": 22.339, "eval_steps_per_second": 22.339, "step": 200 }, { "epoch": 1.025, "grad_norm": 0.453125, "learning_rate": 6.347637266351305e-05, "loss": 0.0581, "step": 205 }, { "epoch": 1.05, "grad_norm": 0.5078125, "learning_rate": 6.2838664882384e-05, "loss": 0.057, "step": 210 }, { "epoch": 1.075, "grad_norm": 0.44921875, "learning_rate": 6.218862285836094e-05, "loss": 0.0587, "step": 215 }, { "epoch": 1.1, "grad_norm": 0.447265625, "learning_rate": 6.152674899918066e-05, "loss": 0.0558, "step": 220 }, { "epoch": 1.125, "grad_norm": 0.53125, "learning_rate": 6.085355485722805e-05, "loss": 0.0579, "step": 225 }, { "epoch": 1.15, "grad_norm": 0.43359375, "learning_rate": 6.016956073416482e-05, "loss": 0.059, "step": 230 }, { "epoch": 1.175, "grad_norm": 0.61328125, "learning_rate": 5.9475295278796255e-05, "loss": 0.0587, "step": 235 }, { "epoch": 1.2, "grad_norm": 0.380859375, "learning_rate": 5.877129507848637e-05, "loss": 0.0593, "step": 240 }, { "epoch": 1.225, "grad_norm": 0.50390625, "learning_rate": 5.805810424443765e-05, "loss": 0.0584, "step": 245 }, { "epoch": 1.25, "grad_norm": 0.484375, "learning_rate": 5.733627399115563e-05, "loss": 0.0592, "step": 250 }, { "epoch": 1.275, "grad_norm": 0.5078125, "learning_rate": 5.66063622104235e-05, "loss": 0.057, "step": 255 }, { "epoch": 1.3, "grad_norm": 0.48046875, "learning_rate": 5.586893304011584e-05, "loss": 0.0573, "step": 260 }, { "epoch": 1.325, "grad_norm": 0.486328125, "learning_rate": 5.512455642818499e-05, "loss": 0.0583, "step": 265 }, { "epoch": 1.35, "grad_norm": 0.4453125, "learning_rate": 5.4373807692156764e-05, "loss": 0.058, "step": 270 }, { "epoch": 1.375, "grad_norm": 0.421875, "learning_rate": 5.36172670744762e-05, "loss": 0.0575, "step": 275 }, { "epoch": 1.4, "grad_norm": 0.369140625, "learning_rate": 5.285551929404679e-05, "loss": 0.0551, "step": 280 }, { "epoch": 1.425, "grad_norm": 0.412109375, "learning_rate": 5.208915309431016e-05, "loss": 0.0573, "step": 285 }, { "epoch": 1.45, "grad_norm": 0.453125, "learning_rate": 5.13187607882149e-05, "loss": 0.059, "step": 290 }, { "epoch": 1.475, "grad_norm": 0.35546875, "learning_rate": 5.0544937800426944e-05, "loss": 0.0579, "step": 295 }, { "epoch": 1.5, "grad_norm": 0.484375, "learning_rate": 4.976828220713459e-05, "loss": 0.0562, "step": 300 }, { "epoch": 1.525, "grad_norm": 0.443359375, "learning_rate": 4.8989394273804545e-05, "loss": 0.057, "step": 305 }, { "epoch": 1.55, "grad_norm": 0.36328125, "learning_rate": 4.820887599124563e-05, "loss": 0.0572, "step": 310 }, { "epoch": 1.575, "grad_norm": 0.41015625, "learning_rate": 4.7427330610339186e-05, "loss": 0.0562, "step": 315 }, { "epoch": 1.6, "grad_norm": 0.416015625, "learning_rate": 4.66453621757954e-05, "loss": 0.0561, "step": 320 }, { "epoch": 1.625, "grad_norm": 0.373046875, "learning_rate": 4.5863575059296397e-05, "loss": 0.0574, "step": 325 }, { "epoch": 1.65, "grad_norm": 0.44921875, "learning_rate": 4.508257349238615e-05, "loss": 0.059, "step": 330 }, { "epoch": 1.675, "grad_norm": 0.46484375, "learning_rate": 4.4302961099469247e-05, "loss": 0.0559, "step": 335 }, { "epoch": 1.7, "grad_norm": 0.369140625, "learning_rate": 4.352534043127848e-05, "loss": 0.0567, "step": 340 }, { "epoch": 1.725, "grad_norm": 0.46484375, "learning_rate": 4.275031249917243e-05, "loss": 0.0548, "step": 345 }, { "epoch": 1.75, "grad_norm": 0.396484375, "learning_rate": 4.197847631062287e-05, "loss": 0.0583, "step": 350 }, { "epoch": 1.775, "grad_norm": 0.4296875, "learning_rate": 4.121042840625079e-05, "loss": 0.0548, "step": 355 }, { "epoch": 1.8, "grad_norm": 0.33984375, "learning_rate": 4.044676239876911e-05, "loss": 0.055, "step": 360 }, { "epoch": 1.825, "grad_norm": 0.365234375, "learning_rate": 3.9688068514188365e-05, "loss": 0.0549, "step": 365 }, { "epoch": 1.85, "grad_norm": 0.486328125, "learning_rate": 3.893493313563978e-05, "loss": 0.0552, "step": 370 }, { "epoch": 1.875, "grad_norm": 0.37109375, "learning_rate": 3.8187938350168616e-05, "loss": 0.0554, "step": 375 }, { "epoch": 1.9, "grad_norm": 0.59375, "learning_rate": 3.744766149884778e-05, "loss": 0.0563, "step": 380 }, { "epoch": 1.925, "grad_norm": 0.396484375, "learning_rate": 3.671467473055956e-05, "loss": 0.0552, "step": 385 }, { "epoch": 1.95, "grad_norm": 0.361328125, "learning_rate": 3.598954455979035e-05, "loss": 0.0555, "step": 390 }, { "epoch": 1.975, "grad_norm": 0.37890625, "learning_rate": 3.52728314287801e-05, "loss": 0.0565, "step": 395 }, { "epoch": 2.0, "grad_norm": 0.390625, "learning_rate": 3.4565089274364856e-05, "loss": 0.0541, "step": 400 }, { "epoch": 2.0, "eval_loss": 0.06434744596481323, "eval_runtime": 0.9755, "eval_samples_per_second": 24.602, "eval_steps_per_second": 24.602, "step": 400 } ], "logging_steps": 5, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.46279712014336e+17, "train_batch_size": 140, "trial_name": null, "trial_params": null }