| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 400, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.025, |
| "grad_norm": 5.625, |
| "learning_rate": 8.572335756456368e-06, |
| "loss": 0.2415, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.05, |
| "grad_norm": 3.140625, |
| "learning_rate": 1.9287755452026826e-05, |
| "loss": 0.1367, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.075, |
| "grad_norm": 2.859375, |
| "learning_rate": 3.000317514759729e-05, |
| "loss": 0.1025, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.1, |
| "grad_norm": 2.21875, |
| "learning_rate": 4.071859484316775e-05, |
| "loss": 0.0911, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.125, |
| "grad_norm": 1.6953125, |
| "learning_rate": 5.143401453873821e-05, |
| "loss": 0.0852, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.15, |
| "grad_norm": 3.765625, |
| "learning_rate": 6.214943423430867e-05, |
| "loss": 0.0845, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.175, |
| "grad_norm": 0.80078125, |
| "learning_rate": 7.286485392987913e-05, |
| "loss": 0.0843, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 1.515625, |
| "learning_rate": 7.500098100637213e-05, |
| "loss": 0.0861, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.225, |
| "grad_norm": 0.82421875, |
| "learning_rate": 7.497272464974502e-05, |
| "loss": 0.0792, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.25, |
| "grad_norm": 0.78515625, |
| "learning_rate": 7.492275581730845e-05, |
| "loss": 0.0796, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.275, |
| "grad_norm": 0.68359375, |
| "learning_rate": 7.485111312922398e-05, |
| "loss": 0.077, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.3, |
| "grad_norm": 0.56640625, |
| "learning_rate": 7.475785195705139e-05, |
| "loss": 0.0763, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.325, |
| "grad_norm": 0.69921875, |
| "learning_rate": 7.464304438095277e-05, |
| "loss": 0.0754, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.35, |
| "grad_norm": 0.609375, |
| "learning_rate": 7.450677913398279e-05, |
| "loss": 0.0749, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.375, |
| "grad_norm": 0.46484375, |
| "learning_rate": 7.434916153350836e-05, |
| "loss": 0.0744, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 0.81640625, |
| "learning_rate": 7.417031339981032e-05, |
| "loss": 0.0784, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.425, |
| "grad_norm": 0.57421875, |
| "learning_rate": 7.397037296193046e-05, |
| "loss": 0.0773, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.45, |
| "grad_norm": 0.55078125, |
| "learning_rate": 7.374949475083626e-05, |
| "loss": 0.0744, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.475, |
| "grad_norm": 0.53125, |
| "learning_rate": 7.350784947998634e-05, |
| "loss": 0.0736, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.5, |
| "grad_norm": 0.51953125, |
| "learning_rate": 7.324562391338845e-05, |
| "loss": 0.0729, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.525, |
| "grad_norm": 0.578125, |
| "learning_rate": 7.296302072125253e-05, |
| "loss": 0.0735, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.55, |
| "grad_norm": 0.75, |
| "learning_rate": 7.266025832334978e-05, |
| "loss": 0.0738, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.575, |
| "grad_norm": 0.5390625, |
| "learning_rate": 7.233757072019941e-05, |
| "loss": 0.0735, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 0.59375, |
| "learning_rate": 7.19952073122131e-05, |
| "loss": 0.0715, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 0.671875, |
| "learning_rate": 7.163343270693716e-05, |
| "loss": 0.0696, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.65, |
| "grad_norm": 0.5, |
| "learning_rate": 7.125252651454133e-05, |
| "loss": 0.0697, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.675, |
| "grad_norm": 0.66015625, |
| "learning_rate": 7.085278313171226e-05, |
| "loss": 0.0697, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.7, |
| "grad_norm": 0.56640625, |
| "learning_rate": 7.043451151411875e-05, |
| "loss": 0.0677, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.725, |
| "grad_norm": 0.53125, |
| "learning_rate": 6.999803493762452e-05, |
| "loss": 0.0711, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.75, |
| "grad_norm": 0.55078125, |
| "learning_rate": 6.954369074843315e-05, |
| "loss": 0.0712, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.775, |
| "grad_norm": 0.515625, |
| "learning_rate": 6.907183010235823e-05, |
| "loss": 0.0715, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 0.453125, |
| "learning_rate": 6.858281769342038e-05, |
| "loss": 0.0715, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.825, |
| "grad_norm": 0.482421875, |
| "learning_rate": 6.807703147198056e-05, |
| "loss": 0.0699, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.85, |
| "grad_norm": 0.50390625, |
| "learning_rate": 6.755486235262808e-05, |
| "loss": 0.0686, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.875, |
| "grad_norm": 0.515625, |
| "learning_rate": 6.701671391204843e-05, |
| "loss": 0.0681, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.9, |
| "grad_norm": 0.66796875, |
| "learning_rate": 6.646300207710494e-05, |
| "loss": 0.0683, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.925, |
| "grad_norm": 0.44921875, |
| "learning_rate": 6.589415480337521e-05, |
| "loss": 0.0663, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.95, |
| "grad_norm": 0.412109375, |
| "learning_rate": 6.531061174439061e-05, |
| "loss": 0.0685, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.975, |
| "grad_norm": 0.38671875, |
| "learning_rate": 6.471282391183463e-05, |
| "loss": 0.0662, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 0.42578125, |
| "learning_rate": 6.410125332696272e-05, |
| "loss": 0.0675, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.06526587158441544, |
| "eval_runtime": 1.0744, |
| "eval_samples_per_second": 22.339, |
| "eval_steps_per_second": 22.339, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.025, |
| "grad_norm": 0.453125, |
| "learning_rate": 6.347637266351305e-05, |
| "loss": 0.0581, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.05, |
| "grad_norm": 0.5078125, |
| "learning_rate": 6.2838664882384e-05, |
| "loss": 0.057, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.075, |
| "grad_norm": 0.44921875, |
| "learning_rate": 6.218862285836094e-05, |
| "loss": 0.0587, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.1, |
| "grad_norm": 0.447265625, |
| "learning_rate": 6.152674899918066e-05, |
| "loss": 0.0558, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.125, |
| "grad_norm": 0.53125, |
| "learning_rate": 6.085355485722805e-05, |
| "loss": 0.0579, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.15, |
| "grad_norm": 0.43359375, |
| "learning_rate": 6.016956073416482e-05, |
| "loss": 0.059, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.175, |
| "grad_norm": 0.61328125, |
| "learning_rate": 5.9475295278796255e-05, |
| "loss": 0.0587, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 0.380859375, |
| "learning_rate": 5.877129507848637e-05, |
| "loss": 0.0593, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.225, |
| "grad_norm": 0.50390625, |
| "learning_rate": 5.805810424443765e-05, |
| "loss": 0.0584, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.484375, |
| "learning_rate": 5.733627399115563e-05, |
| "loss": 0.0592, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.275, |
| "grad_norm": 0.5078125, |
| "learning_rate": 5.66063622104235e-05, |
| "loss": 0.057, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.3, |
| "grad_norm": 0.48046875, |
| "learning_rate": 5.586893304011584e-05, |
| "loss": 0.0573, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.325, |
| "grad_norm": 0.486328125, |
| "learning_rate": 5.512455642818499e-05, |
| "loss": 0.0583, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.35, |
| "grad_norm": 0.4453125, |
| "learning_rate": 5.4373807692156764e-05, |
| "loss": 0.058, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.375, |
| "grad_norm": 0.421875, |
| "learning_rate": 5.36172670744762e-05, |
| "loss": 0.0575, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 0.369140625, |
| "learning_rate": 5.285551929404679e-05, |
| "loss": 0.0551, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.425, |
| "grad_norm": 0.412109375, |
| "learning_rate": 5.208915309431016e-05, |
| "loss": 0.0573, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.45, |
| "grad_norm": 0.453125, |
| "learning_rate": 5.13187607882149e-05, |
| "loss": 0.059, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.475, |
| "grad_norm": 0.35546875, |
| "learning_rate": 5.0544937800426944e-05, |
| "loss": 0.0579, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.5, |
| "grad_norm": 0.484375, |
| "learning_rate": 4.976828220713459e-05, |
| "loss": 0.0562, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.525, |
| "grad_norm": 0.443359375, |
| "learning_rate": 4.8989394273804545e-05, |
| "loss": 0.057, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.55, |
| "grad_norm": 0.36328125, |
| "learning_rate": 4.820887599124563e-05, |
| "loss": 0.0572, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.575, |
| "grad_norm": 0.41015625, |
| "learning_rate": 4.7427330610339186e-05, |
| "loss": 0.0562, |
| "step": 315 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 0.416015625, |
| "learning_rate": 4.66453621757954e-05, |
| "loss": 0.0561, |
| "step": 320 |
| }, |
| { |
| "epoch": 1.625, |
| "grad_norm": 0.373046875, |
| "learning_rate": 4.5863575059296397e-05, |
| "loss": 0.0574, |
| "step": 325 |
| }, |
| { |
| "epoch": 1.65, |
| "grad_norm": 0.44921875, |
| "learning_rate": 4.508257349238615e-05, |
| "loss": 0.059, |
| "step": 330 |
| }, |
| { |
| "epoch": 1.675, |
| "grad_norm": 0.46484375, |
| "learning_rate": 4.4302961099469247e-05, |
| "loss": 0.0559, |
| "step": 335 |
| }, |
| { |
| "epoch": 1.7, |
| "grad_norm": 0.369140625, |
| "learning_rate": 4.352534043127848e-05, |
| "loss": 0.0567, |
| "step": 340 |
| }, |
| { |
| "epoch": 1.725, |
| "grad_norm": 0.46484375, |
| "learning_rate": 4.275031249917243e-05, |
| "loss": 0.0548, |
| "step": 345 |
| }, |
| { |
| "epoch": 1.75, |
| "grad_norm": 0.396484375, |
| "learning_rate": 4.197847631062287e-05, |
| "loss": 0.0583, |
| "step": 350 |
| }, |
| { |
| "epoch": 1.775, |
| "grad_norm": 0.4296875, |
| "learning_rate": 4.121042840625079e-05, |
| "loss": 0.0548, |
| "step": 355 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 0.33984375, |
| "learning_rate": 4.044676239876911e-05, |
| "loss": 0.055, |
| "step": 360 |
| }, |
| { |
| "epoch": 1.825, |
| "grad_norm": 0.365234375, |
| "learning_rate": 3.9688068514188365e-05, |
| "loss": 0.0549, |
| "step": 365 |
| }, |
| { |
| "epoch": 1.85, |
| "grad_norm": 0.486328125, |
| "learning_rate": 3.893493313563978e-05, |
| "loss": 0.0552, |
| "step": 370 |
| }, |
| { |
| "epoch": 1.875, |
| "grad_norm": 0.37109375, |
| "learning_rate": 3.8187938350168616e-05, |
| "loss": 0.0554, |
| "step": 375 |
| }, |
| { |
| "epoch": 1.9, |
| "grad_norm": 0.59375, |
| "learning_rate": 3.744766149884778e-05, |
| "loss": 0.0563, |
| "step": 380 |
| }, |
| { |
| "epoch": 1.925, |
| "grad_norm": 0.396484375, |
| "learning_rate": 3.671467473055956e-05, |
| "loss": 0.0552, |
| "step": 385 |
| }, |
| { |
| "epoch": 1.95, |
| "grad_norm": 0.361328125, |
| "learning_rate": 3.598954455979035e-05, |
| "loss": 0.0555, |
| "step": 390 |
| }, |
| { |
| "epoch": 1.975, |
| "grad_norm": 0.37890625, |
| "learning_rate": 3.52728314287801e-05, |
| "loss": 0.0565, |
| "step": 395 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 0.390625, |
| "learning_rate": 3.4565089274364856e-05, |
| "loss": 0.0541, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.06434744596481323, |
| "eval_runtime": 0.9755, |
| "eval_samples_per_second": 24.602, |
| "eval_steps_per_second": 24.602, |
| "step": 400 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 600, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 2.46279712014336e+17, |
| "train_batch_size": 140, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|