| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 10.0, | |
| "eval_steps": 500, | |
| "global_step": 3110, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.3215434083601286, | |
| "grad_norm": 2206121.5, | |
| "learning_rate": 5.807073954983923e-06, | |
| "loss": 0.7552, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.6430868167202572, | |
| "grad_norm": 2082399.375, | |
| "learning_rate": 5.6141479099678455e-06, | |
| "loss": 0.6123, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.9646302250803859, | |
| "grad_norm": 2188612.25, | |
| "learning_rate": 5.421221864951768e-06, | |
| "loss": 0.5863, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 0.8456913828849792, | |
| "eval_runtime": 119.4823, | |
| "eval_samples_per_second": 25.811, | |
| "eval_steps_per_second": 3.231, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 1.2861736334405145, | |
| "grad_norm": 2229254.75, | |
| "learning_rate": 5.228295819935691e-06, | |
| "loss": 0.5724, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.607717041800643, | |
| "grad_norm": 1792601.625, | |
| "learning_rate": 5.035369774919614e-06, | |
| "loss": 0.5073, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.9292604501607717, | |
| "grad_norm": 1635809.75, | |
| "learning_rate": 4.842443729903537e-06, | |
| "loss": 0.546, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "eval_loss": 0.840407133102417, | |
| "eval_runtime": 119.5409, | |
| "eval_samples_per_second": 25.799, | |
| "eval_steps_per_second": 3.229, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 2.2508038585209005, | |
| "grad_norm": 2690947.75, | |
| "learning_rate": 4.6495176848874605e-06, | |
| "loss": 0.4998, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 2.572347266881029, | |
| "grad_norm": 2749350.25, | |
| "learning_rate": 4.456591639871383e-06, | |
| "loss": 0.4927, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 2.8938906752411575, | |
| "grad_norm": 3980070.25, | |
| "learning_rate": 4.263665594855306e-06, | |
| "loss": 0.486, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "eval_loss": 0.9221391081809998, | |
| "eval_runtime": 119.2209, | |
| "eval_samples_per_second": 25.868, | |
| "eval_steps_per_second": 3.238, | |
| "step": 933 | |
| }, | |
| { | |
| "epoch": 3.215434083601286, | |
| "grad_norm": 2559816.5, | |
| "learning_rate": 4.0707395498392284e-06, | |
| "loss": 0.4714, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 3.536977491961415, | |
| "grad_norm": 1433354.625, | |
| "learning_rate": 3.877813504823151e-06, | |
| "loss": 0.4734, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 3.8585209003215435, | |
| "grad_norm": 2509238.5, | |
| "learning_rate": 3.6848874598070737e-06, | |
| "loss": 0.4622, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 4.0, | |
| "eval_loss": 0.9068471193313599, | |
| "eval_runtime": 119.3472, | |
| "eval_samples_per_second": 25.841, | |
| "eval_steps_per_second": 3.234, | |
| "step": 1244 | |
| }, | |
| { | |
| "epoch": 4.180064308681672, | |
| "grad_norm": 1468095.75, | |
| "learning_rate": 3.491961414790997e-06, | |
| "loss": 0.4211, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 4.501607717041801, | |
| "grad_norm": 3453460.25, | |
| "learning_rate": 3.2990353697749195e-06, | |
| "loss": 0.4318, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 4.823151125401929, | |
| "grad_norm": 2839102.25, | |
| "learning_rate": 3.106109324758843e-06, | |
| "loss": 0.403, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "eval_loss": 0.9916093945503235, | |
| "eval_runtime": 118.6886, | |
| "eval_samples_per_second": 25.984, | |
| "eval_steps_per_second": 3.252, | |
| "step": 1555 | |
| }, | |
| { | |
| "epoch": 5.144694533762058, | |
| "grad_norm": 1706044.25, | |
| "learning_rate": 2.9131832797427652e-06, | |
| "loss": 0.4149, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 5.466237942122186, | |
| "grad_norm": 1450221.875, | |
| "learning_rate": 2.7202572347266883e-06, | |
| "loss": 0.4114, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 5.787781350482315, | |
| "grad_norm": 2638404.25, | |
| "learning_rate": 2.527331189710611e-06, | |
| "loss": 0.4016, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "eval_loss": 0.991452693939209, | |
| "eval_runtime": 119.0397, | |
| "eval_samples_per_second": 25.907, | |
| "eval_steps_per_second": 3.243, | |
| "step": 1866 | |
| }, | |
| { | |
| "epoch": 6.109324758842444, | |
| "grad_norm": 2121613.25, | |
| "learning_rate": 2.3344051446945336e-06, | |
| "loss": 0.3984, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 6.430868167202572, | |
| "grad_norm": 6534975.0, | |
| "learning_rate": 2.1414790996784567e-06, | |
| "loss": 0.3788, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 6.752411575562701, | |
| "grad_norm": 2598338.5, | |
| "learning_rate": 1.9485530546623794e-06, | |
| "loss": 0.3718, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 7.0, | |
| "eval_loss": 0.9927557110786438, | |
| "eval_runtime": 118.6036, | |
| "eval_samples_per_second": 26.003, | |
| "eval_steps_per_second": 3.255, | |
| "step": 2177 | |
| }, | |
| { | |
| "epoch": 7.07395498392283, | |
| "grad_norm": 1306477.625, | |
| "learning_rate": 1.7556270096463025e-06, | |
| "loss": 0.3777, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 7.395498392282958, | |
| "grad_norm": 4472440.0, | |
| "learning_rate": 1.5627009646302251e-06, | |
| "loss": 0.3696, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 7.717041800643087, | |
| "grad_norm": 2177569.75, | |
| "learning_rate": 1.369774919614148e-06, | |
| "loss": 0.3683, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 8.0, | |
| "eval_loss": 0.9982658624649048, | |
| "eval_runtime": 119.853, | |
| "eval_samples_per_second": 25.732, | |
| "eval_steps_per_second": 3.221, | |
| "step": 2488 | |
| }, | |
| { | |
| "epoch": 8.038585209003216, | |
| "grad_norm": 2502705.75, | |
| "learning_rate": 1.1768488745980709e-06, | |
| "loss": 0.3577, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 8.360128617363344, | |
| "grad_norm": 1114851.875, | |
| "learning_rate": 9.839228295819935e-07, | |
| "loss": 0.357, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 8.681672025723472, | |
| "grad_norm": 1848325.0, | |
| "learning_rate": 7.909967845659164e-07, | |
| "loss": 0.3572, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 9.0, | |
| "eval_loss": 1.01869797706604, | |
| "eval_runtime": 119.1193, | |
| "eval_samples_per_second": 25.89, | |
| "eval_steps_per_second": 3.24, | |
| "step": 2799 | |
| }, | |
| { | |
| "epoch": 9.003215434083602, | |
| "grad_norm": 2616566.5, | |
| "learning_rate": 5.980707395498393e-07, | |
| "loss": 0.3513, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 9.32475884244373, | |
| "grad_norm": 1535270.375, | |
| "learning_rate": 4.051446945337621e-07, | |
| "loss": 0.3412, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 9.646302250803858, | |
| "grad_norm": 1695442.875, | |
| "learning_rate": 2.1221864951768489e-07, | |
| "loss": 0.365, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 9.967845659163988, | |
| "grad_norm": 2609392.5, | |
| "learning_rate": 1.929260450160772e-08, | |
| "loss": 0.3473, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 10.0, | |
| "eval_loss": 1.024717092514038, | |
| "eval_runtime": 119.192, | |
| "eval_samples_per_second": 25.874, | |
| "eval_steps_per_second": 3.238, | |
| "step": 3110 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 3110, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 10, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 9.25876937799168e+16, | |
| "train_batch_size": 32, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |