| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 100, |
| "global_step": 3000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.07, |
| "grad_norm": 3.5600266456604004, |
| "learning_rate": 1.9333333333333333e-05, |
| "loss": 1.6248, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.07, |
| "eval_loss": 1.5200175046920776, |
| "eval_runtime": 39.9101, |
| "eval_samples_per_second": 25.056, |
| "eval_steps_per_second": 3.132, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.13, |
| "grad_norm": 2.83909273147583, |
| "learning_rate": 1.866666666666667e-05, |
| "loss": 1.5193, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.13, |
| "eval_loss": 1.5256463289260864, |
| "eval_runtime": 39.9255, |
| "eval_samples_per_second": 25.047, |
| "eval_steps_per_second": 3.131, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.2, |
| "grad_norm": 2.1943609714508057, |
| "learning_rate": 1.8e-05, |
| "loss": 1.4986, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.2, |
| "eval_loss": 1.5103366374969482, |
| "eval_runtime": 39.9391, |
| "eval_samples_per_second": 25.038, |
| "eval_steps_per_second": 3.13, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.27, |
| "grad_norm": 3.015270709991455, |
| "learning_rate": 1.7333333333333336e-05, |
| "loss": 1.4868, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.27, |
| "eval_loss": 1.4908276796340942, |
| "eval_runtime": 39.9824, |
| "eval_samples_per_second": 25.011, |
| "eval_steps_per_second": 3.126, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.33, |
| "grad_norm": 3.107252836227417, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 1.4795, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.33, |
| "eval_loss": 1.459694743156433, |
| "eval_runtime": 39.9779, |
| "eval_samples_per_second": 25.014, |
| "eval_steps_per_second": 3.127, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.4, |
| "grad_norm": 3.612938642501831, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.4927, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.4, |
| "eval_loss": 1.496130347251892, |
| "eval_runtime": 39.9928, |
| "eval_samples_per_second": 25.004, |
| "eval_steps_per_second": 3.126, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.47, |
| "grad_norm": 2.127037763595581, |
| "learning_rate": 1.5333333333333334e-05, |
| "loss": 1.4549, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.47, |
| "eval_loss": 1.4545286893844604, |
| "eval_runtime": 39.9652, |
| "eval_samples_per_second": 25.022, |
| "eval_steps_per_second": 3.128, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.53, |
| "grad_norm": 2.8459372520446777, |
| "learning_rate": 1.4666666666666666e-05, |
| "loss": 1.4764, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.53, |
| "eval_loss": 1.472076177597046, |
| "eval_runtime": 39.984, |
| "eval_samples_per_second": 25.01, |
| "eval_steps_per_second": 3.126, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.6, |
| "grad_norm": 2.973015546798706, |
| "learning_rate": 1.4e-05, |
| "loss": 1.4576, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.6, |
| "eval_loss": 1.4507238864898682, |
| "eval_runtime": 39.9652, |
| "eval_samples_per_second": 25.022, |
| "eval_steps_per_second": 3.128, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.67, |
| "grad_norm": 3.1685054302215576, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 1.4547, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.67, |
| "eval_loss": 1.4341034889221191, |
| "eval_runtime": 39.9594, |
| "eval_samples_per_second": 25.025, |
| "eval_steps_per_second": 3.128, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.73, |
| "grad_norm": 2.5751569271087646, |
| "learning_rate": 1.2666666666666667e-05, |
| "loss": 1.4304, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.73, |
| "eval_loss": 1.4542651176452637, |
| "eval_runtime": 39.9141, |
| "eval_samples_per_second": 25.054, |
| "eval_steps_per_second": 3.132, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.8, |
| "grad_norm": 3.4009389877319336, |
| "learning_rate": 1.2e-05, |
| "loss": 1.4094, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.8, |
| "eval_loss": 1.4490467309951782, |
| "eval_runtime": 39.8559, |
| "eval_samples_per_second": 25.09, |
| "eval_steps_per_second": 3.136, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.87, |
| "grad_norm": 3.2304933071136475, |
| "learning_rate": 1.1333333333333334e-05, |
| "loss": 1.4144, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.87, |
| "eval_loss": 1.411392092704773, |
| "eval_runtime": 39.8705, |
| "eval_samples_per_second": 25.081, |
| "eval_steps_per_second": 3.135, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.93, |
| "grad_norm": 2.266749858856201, |
| "learning_rate": 1.0666666666666667e-05, |
| "loss": 1.4275, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.93, |
| "eval_loss": 1.4107253551483154, |
| "eval_runtime": 39.8875, |
| "eval_samples_per_second": 25.071, |
| "eval_steps_per_second": 3.134, |
| "step": 1400 |
| }, |
| { |
| "epoch": 1.0, |
| "grad_norm": 2.8319966793060303, |
| "learning_rate": 1e-05, |
| "loss": 1.4112, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 1.4000786542892456, |
| "eval_runtime": 39.906, |
| "eval_samples_per_second": 25.059, |
| "eval_steps_per_second": 3.132, |
| "step": 1500 |
| }, |
| { |
| "epoch": 1.07, |
| "grad_norm": 3.5198869705200195, |
| "learning_rate": 9.333333333333334e-06, |
| "loss": 1.3564, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.07, |
| "eval_loss": 1.402136206626892, |
| "eval_runtime": 39.9894, |
| "eval_samples_per_second": 25.007, |
| "eval_steps_per_second": 3.126, |
| "step": 1600 |
| }, |
| { |
| "epoch": 1.13, |
| "grad_norm": 3.098515510559082, |
| "learning_rate": 8.666666666666668e-06, |
| "loss": 1.3579, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.13, |
| "eval_loss": 1.4018568992614746, |
| "eval_runtime": 39.9664, |
| "eval_samples_per_second": 25.021, |
| "eval_steps_per_second": 3.128, |
| "step": 1700 |
| }, |
| { |
| "epoch": 1.2, |
| "grad_norm": 2.2909343242645264, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.3538, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.2, |
| "eval_loss": 1.3881511688232422, |
| "eval_runtime": 39.9759, |
| "eval_samples_per_second": 25.015, |
| "eval_steps_per_second": 3.127, |
| "step": 1800 |
| }, |
| { |
| "epoch": 1.27, |
| "grad_norm": 3.0837056636810303, |
| "learning_rate": 7.333333333333333e-06, |
| "loss": 1.3425, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.27, |
| "eval_loss": 1.3771216869354248, |
| "eval_runtime": 39.9844, |
| "eval_samples_per_second": 25.01, |
| "eval_steps_per_second": 3.126, |
| "step": 1900 |
| }, |
| { |
| "epoch": 1.33, |
| "grad_norm": 2.912759780883789, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 1.3369, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.33, |
| "eval_loss": 1.3808449506759644, |
| "eval_runtime": 39.9353, |
| "eval_samples_per_second": 25.04, |
| "eval_steps_per_second": 3.13, |
| "step": 2000 |
| }, |
| { |
| "epoch": 1.4, |
| "grad_norm": 4.19409704208374, |
| "learning_rate": 6.006666666666667e-06, |
| "loss": 1.3237, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.4, |
| "eval_loss": 1.3673079013824463, |
| "eval_runtime": 39.8846, |
| "eval_samples_per_second": 25.072, |
| "eval_steps_per_second": 3.134, |
| "step": 2100 |
| }, |
| { |
| "epoch": 1.47, |
| "grad_norm": 3.5358569622039795, |
| "learning_rate": 5.3400000000000005e-06, |
| "loss": 1.3182, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.47, |
| "eval_loss": 1.414589524269104, |
| "eval_runtime": 39.8676, |
| "eval_samples_per_second": 25.083, |
| "eval_steps_per_second": 3.135, |
| "step": 2200 |
| }, |
| { |
| "epoch": 1.53, |
| "grad_norm": 3.819859743118286, |
| "learning_rate": 4.673333333333333e-06, |
| "loss": 1.3162, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.53, |
| "eval_loss": 1.4112467765808105, |
| "eval_runtime": 39.885, |
| "eval_samples_per_second": 25.072, |
| "eval_steps_per_second": 3.134, |
| "step": 2300 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 2.633864402770996, |
| "learning_rate": 4.006666666666667e-06, |
| "loss": 1.3305, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.6, |
| "eval_loss": 1.3370815515518188, |
| "eval_runtime": 39.8724, |
| "eval_samples_per_second": 25.08, |
| "eval_steps_per_second": 3.135, |
| "step": 2400 |
| }, |
| { |
| "epoch": 1.67, |
| "grad_norm": 2.9776573181152344, |
| "learning_rate": 3.3400000000000006e-06, |
| "loss": 1.3137, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.67, |
| "eval_loss": 1.3839720487594604, |
| "eval_runtime": 39.8922, |
| "eval_samples_per_second": 25.068, |
| "eval_steps_per_second": 3.133, |
| "step": 2500 |
| }, |
| { |
| "epoch": 1.73, |
| "grad_norm": 3.757050037384033, |
| "learning_rate": 2.6733333333333333e-06, |
| "loss": 1.2883, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.73, |
| "eval_loss": 1.3844949007034302, |
| "eval_runtime": 39.9563, |
| "eval_samples_per_second": 25.027, |
| "eval_steps_per_second": 3.128, |
| "step": 2600 |
| }, |
| { |
| "epoch": 1.8, |
| "grad_norm": 2.7019670009613037, |
| "learning_rate": 2.006666666666667e-06, |
| "loss": 1.2819, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.8, |
| "eval_loss": 1.3609910011291504, |
| "eval_runtime": 39.9383, |
| "eval_samples_per_second": 25.039, |
| "eval_steps_per_second": 3.13, |
| "step": 2700 |
| }, |
| { |
| "epoch": 1.87, |
| "grad_norm": 3.4379355907440186, |
| "learning_rate": 1.34e-06, |
| "loss": 1.3003, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.87, |
| "eval_loss": 1.3372739553451538, |
| "eval_runtime": 39.9408, |
| "eval_samples_per_second": 25.037, |
| "eval_steps_per_second": 3.13, |
| "step": 2800 |
| }, |
| { |
| "epoch": 1.93, |
| "grad_norm": 3.176496744155884, |
| "learning_rate": 6.733333333333334e-07, |
| "loss": 1.2928, |
| "step": 2900 |
| }, |
| { |
| "epoch": 1.93, |
| "eval_loss": 1.3514271974563599, |
| "eval_runtime": 39.9216, |
| "eval_samples_per_second": 25.049, |
| "eval_steps_per_second": 3.131, |
| "step": 2900 |
| }, |
| { |
| "epoch": 2.0, |
| "grad_norm": 2.788795232772827, |
| "learning_rate": 6.666666666666667e-09, |
| "loss": 1.2878, |
| "step": 3000 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 1.3470672369003296, |
| "eval_runtime": 39.9358, |
| "eval_samples_per_second": 25.04, |
| "eval_steps_per_second": 3.13, |
| "step": 3000 |
| } |
| ], |
| "logging_steps": 100, |
| "max_steps": 3000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 2, |
| "save_steps": 1500, |
| "total_flos": 2.83206569951232e+17, |
| "train_batch_size": 8, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|