{ "best_metric": null, "best_model_checkpoint": null, "epoch": 50.0, "eval_steps": 500, "global_step": 11700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "eval_loss": 0.19447623193264008, "eval_runtime": 20.72, "eval_samples_per_second": 159.315, "eval_steps_per_second": 2.027, "step": 234 }, { "epoch": 2.0, "eval_loss": 0.19150546193122864, "eval_runtime": 20.6464, "eval_samples_per_second": 159.882, "eval_steps_per_second": 2.034, "step": 468 }, { "epoch": 2.1367521367521367, "grad_norm": 0.12204297631978989, "learning_rate": 0.0019145299145299146, "loss": 0.2699, "step": 500 }, { "epoch": 3.0, "eval_loss": 0.19507834315299988, "eval_runtime": 20.5681, "eval_samples_per_second": 160.491, "eval_steps_per_second": 2.042, "step": 702 }, { "epoch": 4.0, "eval_loss": 0.19967834651470184, "eval_runtime": 20.6125, "eval_samples_per_second": 160.146, "eval_steps_per_second": 2.038, "step": 936 }, { "epoch": 4.273504273504273, "grad_norm": 0.11764347553253174, "learning_rate": 0.001829059829059829, "loss": 0.1529, "step": 1000 }, { "epoch": 5.0, "eval_loss": 0.20491881668567657, "eval_runtime": 20.5999, "eval_samples_per_second": 160.243, "eval_steps_per_second": 2.039, "step": 1170 }, { "epoch": 6.0, "eval_loss": 0.2080976963043213, "eval_runtime": 20.6197, "eval_samples_per_second": 160.09, "eval_steps_per_second": 2.037, "step": 1404 }, { "epoch": 6.410256410256411, "grad_norm": 0.11261386424303055, "learning_rate": 0.0017435897435897436, "loss": 0.1202, "step": 1500 }, { "epoch": 7.0, "eval_loss": 0.21815814077854156, "eval_runtime": 20.8925, "eval_samples_per_second": 157.999, "eval_steps_per_second": 2.01, "step": 1638 }, { "epoch": 8.0, "eval_loss": 0.22400857508182526, "eval_runtime": 20.7258, "eval_samples_per_second": 159.27, "eval_steps_per_second": 2.026, "step": 1872 }, { "epoch": 8.547008547008547, "grad_norm": 0.11188158392906189, "learning_rate": 0.0016581196581196582, "loss": 0.0974, "step": 2000 }, { "epoch": 9.0, "eval_loss": 0.2344365417957306, "eval_runtime": 20.7076, "eval_samples_per_second": 159.41, "eval_steps_per_second": 2.028, "step": 2106 }, { "epoch": 10.0, "eval_loss": 0.24287687242031097, "eval_runtime": 20.6198, "eval_samples_per_second": 160.089, "eval_steps_per_second": 2.037, "step": 2340 }, { "epoch": 10.683760683760683, "grad_norm": 0.13455568253993988, "learning_rate": 0.0015726495726495727, "loss": 0.0777, "step": 2500 }, { "epoch": 11.0, "eval_loss": 0.24874372780323029, "eval_runtime": 20.6558, "eval_samples_per_second": 159.81, "eval_steps_per_second": 2.033, "step": 2574 }, { "epoch": 12.0, "eval_loss": 0.25835666060447693, "eval_runtime": 20.6234, "eval_samples_per_second": 160.061, "eval_steps_per_second": 2.037, "step": 2808 }, { "epoch": 12.820512820512821, "grad_norm": 0.12756307423114777, "learning_rate": 0.0014871794871794872, "loss": 0.0626, "step": 3000 }, { "epoch": 13.0, "eval_loss": 0.26079022884368896, "eval_runtime": 20.6639, "eval_samples_per_second": 159.747, "eval_steps_per_second": 2.033, "step": 3042 }, { "epoch": 14.0, "eval_loss": 0.27642908692359924, "eval_runtime": 20.65, "eval_samples_per_second": 159.855, "eval_steps_per_second": 2.034, "step": 3276 }, { "epoch": 14.957264957264957, "grad_norm": 0.11947252601385117, "learning_rate": 0.0014017094017094018, "loss": 0.051, "step": 3500 }, { "epoch": 15.0, "eval_loss": 0.2845572829246521, "eval_runtime": 20.7218, "eval_samples_per_second": 159.301, "eval_steps_per_second": 2.027, "step": 3510 }, { "epoch": 16.0, "eval_loss": 0.29512420296669006, "eval_runtime": 20.7037, "eval_samples_per_second": 159.44, "eval_steps_per_second": 2.029, "step": 3744 }, { "epoch": 17.0, "eval_loss": 0.3032258450984955, "eval_runtime": 20.6711, "eval_samples_per_second": 159.692, "eval_steps_per_second": 2.032, "step": 3978 }, { "epoch": 17.094017094017094, "grad_norm": 0.12442992627620697, "learning_rate": 0.0013162393162393163, "loss": 0.0412, "step": 4000 }, { "epoch": 18.0, "eval_loss": 0.3091732859611511, "eval_runtime": 21.0511, "eval_samples_per_second": 156.809, "eval_steps_per_second": 1.995, "step": 4212 }, { "epoch": 19.0, "eval_loss": 0.3136807978153229, "eval_runtime": 20.7571, "eval_samples_per_second": 159.03, "eval_steps_per_second": 2.023, "step": 4446 }, { "epoch": 19.23076923076923, "grad_norm": 0.13041317462921143, "learning_rate": 0.0012307692307692308, "loss": 0.0343, "step": 4500 }, { "epoch": 20.0, "eval_loss": 0.32218611240386963, "eval_runtime": 20.6267, "eval_samples_per_second": 160.035, "eval_steps_per_second": 2.036, "step": 4680 }, { "epoch": 21.0, "eval_loss": 0.3306692838668823, "eval_runtime": 20.6869, "eval_samples_per_second": 159.569, "eval_steps_per_second": 2.03, "step": 4914 }, { "epoch": 21.367521367521366, "grad_norm": 0.12146243453025818, "learning_rate": 0.0011452991452991453, "loss": 0.0287, "step": 5000 }, { "epoch": 22.0, "eval_loss": 0.3365094065666199, "eval_runtime": 21.008, "eval_samples_per_second": 157.131, "eval_steps_per_second": 1.999, "step": 5148 }, { "epoch": 23.0, "eval_loss": 0.3390742838382721, "eval_runtime": 20.7183, "eval_samples_per_second": 159.328, "eval_steps_per_second": 2.027, "step": 5382 }, { "epoch": 23.504273504273506, "grad_norm": 0.12700985372066498, "learning_rate": 0.0010598290598290599, "loss": 0.0242, "step": 5500 }, { "epoch": 24.0, "eval_loss": 0.34803667664527893, "eval_runtime": 20.7103, "eval_samples_per_second": 159.389, "eval_steps_per_second": 2.028, "step": 5616 }, { "epoch": 25.0, "eval_loss": 0.35619282722473145, "eval_runtime": 20.7173, "eval_samples_per_second": 159.336, "eval_steps_per_second": 2.027, "step": 5850 }, { "epoch": 25.641025641025642, "grad_norm": 0.09610779583454132, "learning_rate": 0.0009743589743589744, "loss": 0.0208, "step": 6000 }, { "epoch": 26.0, "eval_loss": 0.3632526993751526, "eval_runtime": 20.6691, "eval_samples_per_second": 159.707, "eval_steps_per_second": 2.032, "step": 6084 }, { "epoch": 27.0, "eval_loss": 0.3678593337535858, "eval_runtime": 20.7048, "eval_samples_per_second": 159.431, "eval_steps_per_second": 2.029, "step": 6318 }, { "epoch": 27.77777777777778, "grad_norm": 0.0948660671710968, "learning_rate": 0.0008888888888888888, "loss": 0.018, "step": 6500 }, { "epoch": 28.0, "eval_loss": 0.37015798687934875, "eval_runtime": 20.6758, "eval_samples_per_second": 159.655, "eval_steps_per_second": 2.031, "step": 6552 }, { "epoch": 29.0, "eval_loss": 0.3752131164073944, "eval_runtime": 20.6693, "eval_samples_per_second": 159.705, "eval_steps_per_second": 2.032, "step": 6786 }, { "epoch": 29.914529914529915, "grad_norm": 0.10043003410100937, "learning_rate": 0.0008034188034188035, "loss": 0.0156, "step": 7000 }, { "epoch": 30.0, "eval_loss": 0.3819948136806488, "eval_runtime": 20.9533, "eval_samples_per_second": 157.541, "eval_steps_per_second": 2.004, "step": 7020 }, { "epoch": 31.0, "eval_loss": 0.38426485657691956, "eval_runtime": 20.6265, "eval_samples_per_second": 160.037, "eval_steps_per_second": 2.036, "step": 7254 }, { "epoch": 32.0, "eval_loss": 0.38617241382598877, "eval_runtime": 20.6361, "eval_samples_per_second": 159.962, "eval_steps_per_second": 2.035, "step": 7488 }, { "epoch": 32.05128205128205, "grad_norm": 0.0730343759059906, "learning_rate": 0.000717948717948718, "loss": 0.0136, "step": 7500 }, { "epoch": 33.0, "eval_loss": 0.3947625756263733, "eval_runtime": 20.8904, "eval_samples_per_second": 158.015, "eval_steps_per_second": 2.01, "step": 7722 }, { "epoch": 34.0, "eval_loss": 0.40124306082725525, "eval_runtime": 20.7007, "eval_samples_per_second": 159.463, "eval_steps_per_second": 2.029, "step": 7956 }, { "epoch": 34.18803418803419, "grad_norm": 0.075799860060215, "learning_rate": 0.0006324786324786324, "loss": 0.0117, "step": 8000 }, { "epoch": 35.0, "eval_loss": 0.405514657497406, "eval_runtime": 20.5467, "eval_samples_per_second": 160.659, "eval_steps_per_second": 2.044, "step": 8190 }, { "epoch": 36.0, "eval_loss": 0.4083278179168701, "eval_runtime": 20.5842, "eval_samples_per_second": 160.365, "eval_steps_per_second": 2.04, "step": 8424 }, { "epoch": 36.324786324786324, "grad_norm": 0.0853864997625351, "learning_rate": 0.000547008547008547, "loss": 0.0103, "step": 8500 }, { "epoch": 37.0, "eval_loss": 0.41271889209747314, "eval_runtime": 20.5448, "eval_samples_per_second": 160.673, "eval_steps_per_second": 2.044, "step": 8658 }, { "epoch": 38.0, "eval_loss": 0.41810309886932373, "eval_runtime": 20.479, "eval_samples_per_second": 161.19, "eval_steps_per_second": 2.051, "step": 8892 }, { "epoch": 38.46153846153846, "grad_norm": 0.10202949494123459, "learning_rate": 0.0004615384615384616, "loss": 0.0089, "step": 9000 }, { "epoch": 39.0, "eval_loss": 0.4218575954437256, "eval_runtime": 20.5194, "eval_samples_per_second": 160.872, "eval_steps_per_second": 2.047, "step": 9126 }, { "epoch": 40.0, "eval_loss": 0.4203444719314575, "eval_runtime": 20.4962, "eval_samples_per_second": 161.054, "eval_steps_per_second": 2.049, "step": 9360 }, { "epoch": 40.598290598290596, "grad_norm": 0.07695678621530533, "learning_rate": 0.00037606837606837606, "loss": 0.008, "step": 9500 }, { "epoch": 41.0, "eval_loss": 0.4281730353832245, "eval_runtime": 20.4864, "eval_samples_per_second": 161.131, "eval_steps_per_second": 2.05, "step": 9594 }, { "epoch": 42.0, "eval_loss": 0.4289074242115021, "eval_runtime": 20.5054, "eval_samples_per_second": 160.982, "eval_steps_per_second": 2.048, "step": 9828 }, { "epoch": 42.73504273504273, "grad_norm": 0.07477525621652603, "learning_rate": 0.00029059829059829064, "loss": 0.0071, "step": 10000 }, { "epoch": 43.0, "eval_loss": 0.43153491616249084, "eval_runtime": 20.4996, "eval_samples_per_second": 161.027, "eval_steps_per_second": 2.049, "step": 10062 }, { "epoch": 44.0, "eval_loss": 0.43742790818214417, "eval_runtime": 20.5391, "eval_samples_per_second": 160.718, "eval_steps_per_second": 2.045, "step": 10296 }, { "epoch": 44.87179487179487, "grad_norm": 0.07834827154874802, "learning_rate": 0.00020512820512820512, "loss": 0.0061, "step": 10500 }, { "epoch": 45.0, "eval_loss": 0.4394199550151825, "eval_runtime": 20.8641, "eval_samples_per_second": 158.214, "eval_steps_per_second": 2.013, "step": 10530 }, { "epoch": 46.0, "eval_loss": 0.4417046308517456, "eval_runtime": 20.4699, "eval_samples_per_second": 161.261, "eval_steps_per_second": 2.052, "step": 10764 }, { "epoch": 47.0, "eval_loss": 0.44045427441596985, "eval_runtime": 20.5918, "eval_samples_per_second": 160.306, "eval_steps_per_second": 2.04, "step": 10998 }, { "epoch": 47.00854700854701, "grad_norm": 0.05676428973674774, "learning_rate": 0.00011965811965811966, "loss": 0.0056, "step": 11000 }, { "epoch": 48.0, "eval_loss": 0.44194263219833374, "eval_runtime": 20.541, "eval_samples_per_second": 160.703, "eval_steps_per_second": 2.045, "step": 11232 }, { "epoch": 49.0, "eval_loss": 0.44418105483055115, "eval_runtime": 20.5518, "eval_samples_per_second": 160.619, "eval_steps_per_second": 2.044, "step": 11466 }, { "epoch": 49.14529914529915, "grad_norm": 0.06708718836307526, "learning_rate": 3.418803418803419e-05, "loss": 0.005, "step": 11500 } ], "logging_steps": 500, "max_steps": 11700, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.32824428232704e+16, "train_batch_size": 80, "trial_name": null, "trial_params": null }