{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 3110, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3215434083601286, "grad_norm": 2206121.5, "learning_rate": 5.807073954983923e-06, "loss": 0.7552, "step": 100 }, { "epoch": 0.6430868167202572, "grad_norm": 2082399.375, "learning_rate": 5.6141479099678455e-06, "loss": 0.6123, "step": 200 }, { "epoch": 0.9646302250803859, "grad_norm": 2188612.25, "learning_rate": 5.421221864951768e-06, "loss": 0.5863, "step": 300 }, { "epoch": 1.0, "eval_loss": 0.8456913828849792, "eval_runtime": 119.4823, "eval_samples_per_second": 25.811, "eval_steps_per_second": 3.231, "step": 311 }, { "epoch": 1.2861736334405145, "grad_norm": 2229254.75, "learning_rate": 5.228295819935691e-06, "loss": 0.5724, "step": 400 }, { "epoch": 1.607717041800643, "grad_norm": 1792601.625, "learning_rate": 5.035369774919614e-06, "loss": 0.5073, "step": 500 }, { "epoch": 1.9292604501607717, "grad_norm": 1635809.75, "learning_rate": 4.842443729903537e-06, "loss": 0.546, "step": 600 }, { "epoch": 2.0, "eval_loss": 0.840407133102417, "eval_runtime": 119.5409, "eval_samples_per_second": 25.799, "eval_steps_per_second": 3.229, "step": 622 }, { "epoch": 2.2508038585209005, "grad_norm": 2690947.75, "learning_rate": 4.6495176848874605e-06, "loss": 0.4998, "step": 700 }, { "epoch": 2.572347266881029, "grad_norm": 2749350.25, "learning_rate": 4.456591639871383e-06, "loss": 0.4927, "step": 800 }, { "epoch": 2.8938906752411575, "grad_norm": 3980070.25, "learning_rate": 4.263665594855306e-06, "loss": 0.486, "step": 900 }, { "epoch": 3.0, "eval_loss": 0.9221391081809998, "eval_runtime": 119.2209, "eval_samples_per_second": 25.868, "eval_steps_per_second": 3.238, "step": 933 }, { "epoch": 3.215434083601286, "grad_norm": 2559816.5, "learning_rate": 4.0707395498392284e-06, "loss": 0.4714, "step": 1000 }, { "epoch": 3.536977491961415, "grad_norm": 1433354.625, "learning_rate": 3.877813504823151e-06, "loss": 0.4734, "step": 1100 }, { "epoch": 3.8585209003215435, "grad_norm": 2509238.5, "learning_rate": 3.6848874598070737e-06, "loss": 0.4622, "step": 1200 }, { "epoch": 4.0, "eval_loss": 0.9068471193313599, "eval_runtime": 119.3472, "eval_samples_per_second": 25.841, "eval_steps_per_second": 3.234, "step": 1244 }, { "epoch": 4.180064308681672, "grad_norm": 1468095.75, "learning_rate": 3.491961414790997e-06, "loss": 0.4211, "step": 1300 }, { "epoch": 4.501607717041801, "grad_norm": 3453460.25, "learning_rate": 3.2990353697749195e-06, "loss": 0.4318, "step": 1400 }, { "epoch": 4.823151125401929, "grad_norm": 2839102.25, "learning_rate": 3.106109324758843e-06, "loss": 0.403, "step": 1500 }, { "epoch": 5.0, "eval_loss": 0.9916093945503235, "eval_runtime": 118.6886, "eval_samples_per_second": 25.984, "eval_steps_per_second": 3.252, "step": 1555 }, { "epoch": 5.144694533762058, "grad_norm": 1706044.25, "learning_rate": 2.9131832797427652e-06, "loss": 0.4149, "step": 1600 }, { "epoch": 5.466237942122186, "grad_norm": 1450221.875, "learning_rate": 2.7202572347266883e-06, "loss": 0.4114, "step": 1700 }, { "epoch": 5.787781350482315, "grad_norm": 2638404.25, "learning_rate": 2.527331189710611e-06, "loss": 0.4016, "step": 1800 }, { "epoch": 6.0, "eval_loss": 0.991452693939209, "eval_runtime": 119.0397, "eval_samples_per_second": 25.907, "eval_steps_per_second": 3.243, "step": 1866 }, { "epoch": 6.109324758842444, "grad_norm": 2121613.25, "learning_rate": 2.3344051446945336e-06, "loss": 0.3984, "step": 1900 }, { "epoch": 6.430868167202572, "grad_norm": 6534975.0, "learning_rate": 2.1414790996784567e-06, "loss": 0.3788, "step": 2000 }, { "epoch": 6.752411575562701, "grad_norm": 2598338.5, "learning_rate": 1.9485530546623794e-06, "loss": 0.3718, "step": 2100 }, { "epoch": 7.0, "eval_loss": 0.9927557110786438, "eval_runtime": 118.6036, "eval_samples_per_second": 26.003, "eval_steps_per_second": 3.255, "step": 2177 }, { "epoch": 7.07395498392283, "grad_norm": 1306477.625, "learning_rate": 1.7556270096463025e-06, "loss": 0.3777, "step": 2200 }, { "epoch": 7.395498392282958, "grad_norm": 4472440.0, "learning_rate": 1.5627009646302251e-06, "loss": 0.3696, "step": 2300 }, { "epoch": 7.717041800643087, "grad_norm": 2177569.75, "learning_rate": 1.369774919614148e-06, "loss": 0.3683, "step": 2400 }, { "epoch": 8.0, "eval_loss": 0.9982658624649048, "eval_runtime": 119.853, "eval_samples_per_second": 25.732, "eval_steps_per_second": 3.221, "step": 2488 }, { "epoch": 8.038585209003216, "grad_norm": 2502705.75, "learning_rate": 1.1768488745980709e-06, "loss": 0.3577, "step": 2500 }, { "epoch": 8.360128617363344, "grad_norm": 1114851.875, "learning_rate": 9.839228295819935e-07, "loss": 0.357, "step": 2600 }, { "epoch": 8.681672025723472, "grad_norm": 1848325.0, "learning_rate": 7.909967845659164e-07, "loss": 0.3572, "step": 2700 }, { "epoch": 9.0, "eval_loss": 1.01869797706604, "eval_runtime": 119.1193, "eval_samples_per_second": 25.89, "eval_steps_per_second": 3.24, "step": 2799 }, { "epoch": 9.003215434083602, "grad_norm": 2616566.5, "learning_rate": 5.980707395498393e-07, "loss": 0.3513, "step": 2800 }, { "epoch": 9.32475884244373, "grad_norm": 1535270.375, "learning_rate": 4.051446945337621e-07, "loss": 0.3412, "step": 2900 }, { "epoch": 9.646302250803858, "grad_norm": 1695442.875, "learning_rate": 2.1221864951768489e-07, "loss": 0.365, "step": 3000 }, { "epoch": 9.967845659163988, "grad_norm": 2609392.5, "learning_rate": 1.929260450160772e-08, "loss": 0.3473, "step": 3100 }, { "epoch": 10.0, "eval_loss": 1.024717092514038, "eval_runtime": 119.192, "eval_samples_per_second": 25.874, "eval_steps_per_second": 3.238, "step": 3110 } ], "logging_steps": 100, "max_steps": 3110, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.25876937799168e+16, "train_batch_size": 32, "trial_name": null, "trial_params": null }