{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 50.0,
  "eval_steps": 500,
  "global_step": 11700,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "eval_loss": 0.19447623193264008,
      "eval_runtime": 20.72,
      "eval_samples_per_second": 159.315,
      "eval_steps_per_second": 2.027,
      "step": 234
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.19150546193122864,
      "eval_runtime": 20.6464,
      "eval_samples_per_second": 159.882,
      "eval_steps_per_second": 2.034,
      "step": 468
    },
    {
      "epoch": 2.1367521367521367,
      "grad_norm": 0.12204297631978989,
      "learning_rate": 0.0019145299145299146,
      "loss": 0.2699,
      "step": 500
    },
    {
      "epoch": 3.0,
      "eval_loss": 0.19507834315299988,
      "eval_runtime": 20.5681,
      "eval_samples_per_second": 160.491,
      "eval_steps_per_second": 2.042,
      "step": 702
    },
    {
      "epoch": 4.0,
      "eval_loss": 0.19967834651470184,
      "eval_runtime": 20.6125,
      "eval_samples_per_second": 160.146,
      "eval_steps_per_second": 2.038,
      "step": 936
    },
    {
      "epoch": 4.273504273504273,
      "grad_norm": 0.11764347553253174,
      "learning_rate": 0.001829059829059829,
      "loss": 0.1529,
      "step": 1000
    },
    {
      "epoch": 5.0,
      "eval_loss": 0.20491881668567657,
      "eval_runtime": 20.5999,
      "eval_samples_per_second": 160.243,
      "eval_steps_per_second": 2.039,
      "step": 1170
    },
    {
      "epoch": 6.0,
      "eval_loss": 0.2080976963043213,
      "eval_runtime": 20.6197,
      "eval_samples_per_second": 160.09,
      "eval_steps_per_second": 2.037,
      "step": 1404
    },
    {
      "epoch": 6.410256410256411,
      "grad_norm": 0.11261386424303055,
      "learning_rate": 0.0017435897435897436,
      "loss": 0.1202,
      "step": 1500
    },
    {
      "epoch": 7.0,
      "eval_loss": 0.21815814077854156,
      "eval_runtime": 20.8925,
      "eval_samples_per_second": 157.999,
      "eval_steps_per_second": 2.01,
      "step": 1638
    },
    {
      "epoch": 8.0,
      "eval_loss": 0.22400857508182526,
      "eval_runtime": 20.7258,
      "eval_samples_per_second": 159.27,
      "eval_steps_per_second": 2.026,
      "step": 1872
    },
    {
      "epoch": 8.547008547008547,
      "grad_norm": 0.11188158392906189,
      "learning_rate": 0.0016581196581196582,
      "loss": 0.0974,
      "step": 2000
    },
    {
      "epoch": 9.0,
      "eval_loss": 0.2344365417957306,
      "eval_runtime": 20.7076,
      "eval_samples_per_second": 159.41,
      "eval_steps_per_second": 2.028,
      "step": 2106
    },
    {
      "epoch": 10.0,
      "eval_loss": 0.24287687242031097,
      "eval_runtime": 20.6198,
      "eval_samples_per_second": 160.089,
      "eval_steps_per_second": 2.037,
      "step": 2340
    },
    {
      "epoch": 10.683760683760683,
      "grad_norm": 0.13455568253993988,
      "learning_rate": 0.0015726495726495727,
      "loss": 0.0777,
      "step": 2500
    },
    {
      "epoch": 11.0,
      "eval_loss": 0.24874372780323029,
      "eval_runtime": 20.6558,
      "eval_samples_per_second": 159.81,
      "eval_steps_per_second": 2.033,
      "step": 2574
    },
    {
      "epoch": 12.0,
      "eval_loss": 0.25835666060447693,
      "eval_runtime": 20.6234,
      "eval_samples_per_second": 160.061,
      "eval_steps_per_second": 2.037,
      "step": 2808
    },
    {
      "epoch": 12.820512820512821,
      "grad_norm": 0.12756307423114777,
      "learning_rate": 0.0014871794871794872,
      "loss": 0.0626,
      "step": 3000
    },
    {
      "epoch": 13.0,
      "eval_loss": 0.26079022884368896,
      "eval_runtime": 20.6639,
      "eval_samples_per_second": 159.747,
      "eval_steps_per_second": 2.033,
      "step": 3042
    },
    {
      "epoch": 14.0,
      "eval_loss": 0.27642908692359924,
      "eval_runtime": 20.65,
      "eval_samples_per_second": 159.855,
      "eval_steps_per_second": 2.034,
      "step": 3276
    },
    {
      "epoch": 14.957264957264957,
      "grad_norm": 0.11947252601385117,
      "learning_rate": 0.0014017094017094018,
      "loss": 0.051,
      "step": 3500
    },
    {
      "epoch": 15.0,
      "eval_loss": 0.2845572829246521,
      "eval_runtime": 20.7218,
      "eval_samples_per_second": 159.301,
      "eval_steps_per_second": 2.027,
      "step": 3510
    },
    {
      "epoch": 16.0,
      "eval_loss": 0.29512420296669006,
      "eval_runtime": 20.7037,
      "eval_samples_per_second": 159.44,
      "eval_steps_per_second": 2.029,
      "step": 3744
    },
    {
      "epoch": 17.0,
      "eval_loss": 0.3032258450984955,
      "eval_runtime": 20.6711,
      "eval_samples_per_second": 159.692,
      "eval_steps_per_second": 2.032,
      "step": 3978
    },
    {
      "epoch": 17.094017094017094,
      "grad_norm": 0.12442992627620697,
      "learning_rate": 0.0013162393162393163,
      "loss": 0.0412,
      "step": 4000
    },
    {
      "epoch": 18.0,
      "eval_loss": 0.3091732859611511,
      "eval_runtime": 21.0511,
      "eval_samples_per_second": 156.809,
      "eval_steps_per_second": 1.995,
      "step": 4212
    },
    {
      "epoch": 19.0,
      "eval_loss": 0.3136807978153229,
      "eval_runtime": 20.7571,
      "eval_samples_per_second": 159.03,
      "eval_steps_per_second": 2.023,
      "step": 4446
    },
    {
      "epoch": 19.23076923076923,
      "grad_norm": 0.13041317462921143,
      "learning_rate": 0.0012307692307692308,
      "loss": 0.0343,
      "step": 4500
    },
    {
      "epoch": 20.0,
      "eval_loss": 0.32218611240386963,
      "eval_runtime": 20.6267,
      "eval_samples_per_second": 160.035,
      "eval_steps_per_second": 2.036,
      "step": 4680
    },
    {
      "epoch": 21.0,
      "eval_loss": 0.3306692838668823,
      "eval_runtime": 20.6869,
      "eval_samples_per_second": 159.569,
      "eval_steps_per_second": 2.03,
      "step": 4914
    },
    {
      "epoch": 21.367521367521366,
      "grad_norm": 0.12146243453025818,
      "learning_rate": 0.0011452991452991453,
      "loss": 0.0287,
      "step": 5000
    },
    {
      "epoch": 22.0,
      "eval_loss": 0.3365094065666199,
      "eval_runtime": 21.008,
      "eval_samples_per_second": 157.131,
      "eval_steps_per_second": 1.999,
      "step": 5148
    },
    {
      "epoch": 23.0,
      "eval_loss": 0.3390742838382721,
      "eval_runtime": 20.7183,
      "eval_samples_per_second": 159.328,
      "eval_steps_per_second": 2.027,
      "step": 5382
    },
    {
      "epoch": 23.504273504273506,
      "grad_norm": 0.12700985372066498,
      "learning_rate": 0.0010598290598290599,
      "loss": 0.0242,
      "step": 5500
    },
    {
      "epoch": 24.0,
      "eval_loss": 0.34803667664527893,
      "eval_runtime": 20.7103,
      "eval_samples_per_second": 159.389,
      "eval_steps_per_second": 2.028,
      "step": 5616
    },
    {
      "epoch": 25.0,
      "eval_loss": 0.35619282722473145,
      "eval_runtime": 20.7173,
      "eval_samples_per_second": 159.336,
      "eval_steps_per_second": 2.027,
      "step": 5850
    },
    {
      "epoch": 25.641025641025642,
      "grad_norm": 0.09610779583454132,
      "learning_rate": 0.0009743589743589744,
      "loss": 0.0208,
      "step": 6000
    },
    {
      "epoch": 26.0,
      "eval_loss": 0.3632526993751526,
      "eval_runtime": 20.6691,
      "eval_samples_per_second": 159.707,
      "eval_steps_per_second": 2.032,
      "step": 6084
    },
    {
      "epoch": 27.0,
      "eval_loss": 0.3678593337535858,
      "eval_runtime": 20.7048,
      "eval_samples_per_second": 159.431,
      "eval_steps_per_second": 2.029,
      "step": 6318
    },
    {
      "epoch": 27.77777777777778,
      "grad_norm": 0.0948660671710968,
      "learning_rate": 0.0008888888888888888,
      "loss": 0.018,
      "step": 6500
    },
    {
      "epoch": 28.0,
      "eval_loss": 0.37015798687934875,
      "eval_runtime": 20.6758,
      "eval_samples_per_second": 159.655,
      "eval_steps_per_second": 2.031,
      "step": 6552
    },
    {
      "epoch": 29.0,
      "eval_loss": 0.3752131164073944,
      "eval_runtime": 20.6693,
      "eval_samples_per_second": 159.705,
      "eval_steps_per_second": 2.032,
      "step": 6786
    },
    {
      "epoch": 29.914529914529915,
      "grad_norm": 0.10043003410100937,
      "learning_rate": 0.0008034188034188035,
      "loss": 0.0156,
      "step": 7000
    },
    {
      "epoch": 30.0,
      "eval_loss": 0.3819948136806488,
      "eval_runtime": 20.9533,
      "eval_samples_per_second": 157.541,
      "eval_steps_per_second": 2.004,
      "step": 7020
    },
    {
      "epoch": 31.0,
      "eval_loss": 0.38426485657691956,
      "eval_runtime": 20.6265,
      "eval_samples_per_second": 160.037,
      "eval_steps_per_second": 2.036,
      "step": 7254
    },
    {
      "epoch": 32.0,
      "eval_loss": 0.38617241382598877,
      "eval_runtime": 20.6361,
      "eval_samples_per_second": 159.962,
      "eval_steps_per_second": 2.035,
      "step": 7488
    },
    {
      "epoch": 32.05128205128205,
      "grad_norm": 0.0730343759059906,
      "learning_rate": 0.000717948717948718,
      "loss": 0.0136,
      "step": 7500
    },
    {
      "epoch": 33.0,
      "eval_loss": 0.3947625756263733,
      "eval_runtime": 20.8904,
      "eval_samples_per_second": 158.015,
      "eval_steps_per_second": 2.01,
      "step": 7722
    },
    {
      "epoch": 34.0,
      "eval_loss": 0.40124306082725525,
      "eval_runtime": 20.7007,
      "eval_samples_per_second": 159.463,
      "eval_steps_per_second": 2.029,
      "step": 7956
    },
    {
      "epoch": 34.18803418803419,
      "grad_norm": 0.075799860060215,
      "learning_rate": 0.0006324786324786324,
      "loss": 0.0117,
      "step": 8000
    },
    {
      "epoch": 35.0,
      "eval_loss": 0.405514657497406,
      "eval_runtime": 20.5467,
      "eval_samples_per_second": 160.659,
      "eval_steps_per_second": 2.044,
      "step": 8190
    },
    {
      "epoch": 36.0,
      "eval_loss": 0.4083278179168701,
      "eval_runtime": 20.5842,
      "eval_samples_per_second": 160.365,
      "eval_steps_per_second": 2.04,
      "step": 8424
    },
    {
      "epoch": 36.324786324786324,
      "grad_norm": 0.0853864997625351,
      "learning_rate": 0.000547008547008547,
      "loss": 0.0103,
      "step": 8500
    },
    {
      "epoch": 37.0,
      "eval_loss": 0.41271889209747314,
      "eval_runtime": 20.5448,
      "eval_samples_per_second": 160.673,
      "eval_steps_per_second": 2.044,
      "step": 8658
    },
    {
      "epoch": 38.0,
      "eval_loss": 0.41810309886932373,
      "eval_runtime": 20.479,
      "eval_samples_per_second": 161.19,
      "eval_steps_per_second": 2.051,
      "step": 8892
    },
    {
      "epoch": 38.46153846153846,
      "grad_norm": 0.10202949494123459,
      "learning_rate": 0.0004615384615384616,
      "loss": 0.0089,
      "step": 9000
    },
    {
      "epoch": 39.0,
      "eval_loss": 0.4218575954437256,
      "eval_runtime": 20.5194,
      "eval_samples_per_second": 160.872,
      "eval_steps_per_second": 2.047,
      "step": 9126
    },
    {
      "epoch": 40.0,
      "eval_loss": 0.4203444719314575,
      "eval_runtime": 20.4962,
      "eval_samples_per_second": 161.054,
      "eval_steps_per_second": 2.049,
      "step": 9360
    },
    {
      "epoch": 40.598290598290596,
      "grad_norm": 0.07695678621530533,
      "learning_rate": 0.00037606837606837606,
      "loss": 0.008,
      "step": 9500
    },
    {
      "epoch": 41.0,
      "eval_loss": 0.4281730353832245,
      "eval_runtime": 20.4864,
      "eval_samples_per_second": 161.131,
      "eval_steps_per_second": 2.05,
      "step": 9594
    },
    {
      "epoch": 42.0,
      "eval_loss": 0.4289074242115021,
      "eval_runtime": 20.5054,
      "eval_samples_per_second": 160.982,
      "eval_steps_per_second": 2.048,
      "step": 9828
    },
    {
      "epoch": 42.73504273504273,
      "grad_norm": 0.07477525621652603,
      "learning_rate": 0.00029059829059829064,
      "loss": 0.0071,
      "step": 10000
    },
    {
      "epoch": 43.0,
      "eval_loss": 0.43153491616249084,
      "eval_runtime": 20.4996,
      "eval_samples_per_second": 161.027,
      "eval_steps_per_second": 2.049,
      "step": 10062
    },
    {
      "epoch": 44.0,
      "eval_loss": 0.43742790818214417,
      "eval_runtime": 20.5391,
      "eval_samples_per_second": 160.718,
      "eval_steps_per_second": 2.045,
      "step": 10296
    },
    {
      "epoch": 44.87179487179487,
      "grad_norm": 0.07834827154874802,
      "learning_rate": 0.00020512820512820512,
      "loss": 0.0061,
      "step": 10500
    },
    {
      "epoch": 45.0,
      "eval_loss": 0.4394199550151825,
      "eval_runtime": 20.8641,
      "eval_samples_per_second": 158.214,
      "eval_steps_per_second": 2.013,
      "step": 10530
    },
    {
      "epoch": 46.0,
      "eval_loss": 0.4417046308517456,
      "eval_runtime": 20.4699,
      "eval_samples_per_second": 161.261,
      "eval_steps_per_second": 2.052,
      "step": 10764
    },
    {
      "epoch": 47.0,
      "eval_loss": 0.44045427441596985,
      "eval_runtime": 20.5918,
      "eval_samples_per_second": 160.306,
      "eval_steps_per_second": 2.04,
      "step": 10998
    },
    {
      "epoch": 47.00854700854701,
      "grad_norm": 0.05676428973674774,
      "learning_rate": 0.00011965811965811966,
      "loss": 0.0056,
      "step": 11000
    },
    {
      "epoch": 48.0,
      "eval_loss": 0.44194263219833374,
      "eval_runtime": 20.541,
      "eval_samples_per_second": 160.703,
      "eval_steps_per_second": 2.045,
      "step": 11232
    },
    {
      "epoch": 49.0,
      "eval_loss": 0.44418105483055115,
      "eval_runtime": 20.5518,
      "eval_samples_per_second": 160.619,
      "eval_steps_per_second": 2.044,
      "step": 11466
    },
    {
      "epoch": 49.14529914529915,
      "grad_norm": 0.06708718836307526,
      "learning_rate": 3.418803418803419e-05,
      "loss": 0.005,
      "step": 11500
    }
  ],
  "logging_steps": 500,
  "max_steps": 11700,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 50,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.32824428232704e+16,
  "train_batch_size": 80,
  "trial_name": null,
  "trial_params": null
}