{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 9.0, "eval_steps": 500, "global_step": 288, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.32, "grad_norm": 0.5078026652336121, "learning_rate": 0.0001, "loss": 1.0979, "step": 10 }, { "epoch": 0.64, "grad_norm": 0.24573734402656555, "learning_rate": 9.968335515358916e-05, "loss": 0.6078, "step": 20 }, { "epoch": 0.96, "grad_norm": 0.28030624985694885, "learning_rate": 9.873743117270691e-05, "loss": 0.4051, "step": 30 }, { "epoch": 1.0, "eval_accuracy": 0.8977756524449263, "eval_loss": 0.43045690655708313, "eval_runtime": 25.9378, "eval_samples_per_second": 14.843, "eval_steps_per_second": 1.889, "step": 32 }, { "epoch": 1.256, "grad_norm": 0.24669107794761658, "learning_rate": 9.717420893549902e-05, "loss": 0.4398, "step": 40 }, { "epoch": 1.576, "grad_norm": 0.2326413094997406, "learning_rate": 9.501348789257373e-05, "loss": 0.3765, "step": 50 }, { "epoch": 1.896, "grad_norm": 0.2204989194869995, "learning_rate": 9.2282635291242e-05, "loss": 0.2886, "step": 60 }, { "epoch": 2.0, "eval_accuracy": 0.920555460186586, "eval_loss": 0.323241263628006, "eval_runtime": 25.9139, "eval_samples_per_second": 14.857, "eval_steps_per_second": 1.891, "step": 64 }, { "epoch": 2.192, "grad_norm": 0.2965347468852997, "learning_rate": 8.90162395476046e-05, "loss": 0.2927, "step": 70 }, { "epoch": 2.512, "grad_norm": 0.3402569890022278, "learning_rate": 8.525567215680398e-05, "loss": 0.3082, "step": 80 }, { "epoch": 2.832, "grad_norm": 0.29415449500083923, "learning_rate": 8.104856369019524e-05, "loss": 0.2289, "step": 90 }, { "epoch": 3.0, "eval_accuracy": 0.9322851101896489, "eval_loss": 0.27416735887527466, "eval_runtime": 25.8596, "eval_samples_per_second": 14.888, "eval_steps_per_second": 1.895, "step": 96 }, { "epoch": 3.128, "grad_norm": 0.30058005452156067, "learning_rate": 7.644820051634812e-05, "loss": 0.2103, "step": 100 }, { "epoch": 3.448, "grad_norm": 0.39383992552757263, "learning_rate": 7.15128498868873e-05, "loss": 0.2362, "step": 110 }, { "epoch": 3.768, "grad_norm": 0.32451319694519043, "learning_rate": 6.630502193549474e-05, "loss": 0.1925, "step": 120 }, { "epoch": 4.0, "eval_accuracy": 0.938676979897959, "eval_loss": 0.25138482451438904, "eval_runtime": 25.9363, "eval_samples_per_second": 14.844, "eval_steps_per_second": 1.889, "step": 128 }, { "epoch": 4.064, "grad_norm": 0.375893235206604, "learning_rate": 6.0890677937442574e-05, "loss": 0.1471, "step": 130 }, { "epoch": 4.384, "grad_norm": 0.3606163263320923, "learning_rate": 5.5338394857677945e-05, "loss": 0.1876, "step": 140 }, { "epoch": 4.704, "grad_norm": 0.40445348620414734, "learning_rate": 4.971849676912172e-05, "loss": 0.1488, "step": 150 }, { "epoch": 5.0, "grad_norm": 0.4769314229488373, "learning_rate": 4.410216414245771e-05, "loss": 0.1079, "step": 160 }, { "epoch": 5.0, "eval_accuracy": 0.9420305087488926, "eval_loss": 0.2455526441335678, "eval_runtime": 25.9468, "eval_samples_per_second": 14.838, "eval_steps_per_second": 1.888, "step": 160 }, { "epoch": 5.32, "grad_norm": 0.4198707044124603, "learning_rate": 3.856053228896442e-05, "loss": 0.1534, "step": 170 }, { "epoch": 5.64, "grad_norm": 0.3887736201286316, "learning_rate": 3.316379037532644e-05, "loss": 0.1151, "step": 180 }, { "epoch": 5.96, "grad_norm": 0.37902015447616577, "learning_rate": 2.798029242211828e-05, "loss": 0.0968, "step": 190 }, { "epoch": 6.0, "eval_accuracy": 0.945372723288016, "eval_loss": 0.24095402657985687, "eval_runtime": 25.9698, "eval_samples_per_second": 14.825, "eval_steps_per_second": 1.887, "step": 192 }, { "epoch": 6.256, "grad_norm": 0.4507382810115814, "learning_rate": 2.3075691545870558e-05, "loss": 0.1142, "step": 200 }, { "epoch": 6.576, "grad_norm": 0.39560869336128235, "learning_rate": 1.8512108410229878e-05, "loss": 0.0908, "step": 210 }, { "epoch": 6.896, "grad_norm": 0.3712189793586731, "learning_rate": 1.434734441843899e-05, "loss": 0.0835, "step": 220 }, { "epoch": 7.0, "eval_accuracy": 0.9465976771312703, "eval_loss": 0.24637635052204132, "eval_runtime": 25.9064, "eval_samples_per_second": 14.861, "eval_steps_per_second": 1.891, "step": 224 }, { "epoch": 7.192, "grad_norm": 0.36213552951812744, "learning_rate": 1.063414961267859e-05, "loss": 0.0841, "step": 230 }, { "epoch": 7.5120000000000005, "grad_norm": 0.4722885191440582, "learning_rate": 7.41955455290726e-06, "loss": 0.0863, "step": 240 }, { "epoch": 7.832, "grad_norm": 0.29437902569770813, "learning_rate": 4.744274637483936e-06, "loss": 0.0716, "step": 250 }, { "epoch": 8.0, "eval_accuracy": 0.9472423896041875, "eval_loss": 0.2516001760959625, "eval_runtime": 25.92, "eval_samples_per_second": 14.853, "eval_steps_per_second": 1.89, "step": 256 }, { "epoch": 8.128, "grad_norm": 0.39367932081222534, "learning_rate": 2.6421944103256657e-06, "loss": 0.0666, "step": 260 }, { "epoch": 8.448, "grad_norm": 0.31784766912460327, "learning_rate": 1.1399383862592927e-06, "loss": 0.0814, "step": 270 }, { "epoch": 8.768, "grad_norm": 0.28355342149734497, "learning_rate": 2.5653383040524227e-07, "loss": 0.0611, "step": 280 }, { "epoch": 9.0, "eval_accuracy": 0.947486792929819, "eval_loss": 0.2522543668746948, "eval_runtime": 25.9049, "eval_samples_per_second": 14.862, "eval_steps_per_second": 1.892, "step": 288 }, { "epoch": 9.0, "step": 288, "total_flos": 3.686068977139712e+17, "train_loss": 0.22300153877586126, "train_runtime": 2410.3131, "train_samples_per_second": 3.734, "train_steps_per_second": 0.119 } ], "logging_steps": 10, "max_steps": 288, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.686068977139712e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }