{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999050933248972, "eval_steps": 1000, "global_step": 4740, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06327111673521038, "grad_norm": 4.5707688331604, "learning_rate": 1.0548523206751056e-05, "loss": 6.0536, "step": 100 }, { "epoch": 0.12654223347042076, "grad_norm": 7.803924083709717, "learning_rate": 2.1097046413502112e-05, "loss": 5.8177, "step": 200 }, { "epoch": 0.18981335020563114, "grad_norm": 8.050126075744629, "learning_rate": 3.1645569620253167e-05, "loss": 5.0311, "step": 300 }, { "epoch": 0.2530844669408415, "grad_norm": 8.803975105285645, "learning_rate": 4.2194092827004224e-05, "loss": 4.7351, "step": 400 }, { "epoch": 0.3163555836760519, "grad_norm": 12.846040725708008, "learning_rate": 4.96952648851383e-05, "loss": 4.5453, "step": 500 }, { "epoch": 0.3796267004112623, "grad_norm": 15.128095626831055, "learning_rate": 4.852320675105486e-05, "loss": 4.3979, "step": 600 }, { "epoch": 0.44289781714647264, "grad_norm": 10.33956527709961, "learning_rate": 4.7351148616971405e-05, "loss": 4.4056, "step": 700 }, { "epoch": 0.506168933881683, "grad_norm": 23.287179946899414, "learning_rate": 4.617909048288795e-05, "loss": 4.2787, "step": 800 }, { "epoch": 0.5694400506168934, "grad_norm": 12.955565452575684, "learning_rate": 4.50070323488045e-05, "loss": 4.3504, "step": 900 }, { "epoch": 0.6327111673521038, "grad_norm": 10.07479476928711, "learning_rate": 4.3834974214721055e-05, "loss": 4.2521, "step": 1000 }, { "epoch": 0.6327111673521038, "eval_runtime": 33.0567, "eval_samples_per_second": 95.593, "eval_steps_per_second": 11.949, "step": 1000 }, { "epoch": 0.6959822840873141, "grad_norm": 12.32780647277832, "learning_rate": 4.26629160806376e-05, "loss": 4.2314, "step": 1100 }, { "epoch": 0.7592534008225246, "grad_norm": 15.515801429748535, "learning_rate": 4.149085794655415e-05, "loss": 4.3058, "step": 1200 }, { "epoch": 0.8225245175577349, "grad_norm": 12.472834587097168, "learning_rate": 4.03187998124707e-05, "loss": 4.2158, "step": 1300 }, { "epoch": 0.8857956342929453, "grad_norm": 16.903112411499023, "learning_rate": 3.914674167838725e-05, "loss": 4.142, "step": 1400 }, { "epoch": 0.9490667510281556, "grad_norm": 13.487791061401367, "learning_rate": 3.79746835443038e-05, "loss": 4.0534, "step": 1500 }, { "epoch": 1.012337867763366, "grad_norm": 14.721494674682617, "learning_rate": 3.680262541022035e-05, "loss": 4.1405, "step": 1600 }, { "epoch": 1.0756089844985763, "grad_norm": 16.011690139770508, "learning_rate": 3.56305672761369e-05, "loss": 4.1192, "step": 1700 }, { "epoch": 1.1388801012337868, "grad_norm": 15.692846298217773, "learning_rate": 3.445850914205345e-05, "loss": 4.1407, "step": 1800 }, { "epoch": 1.2021512179689973, "grad_norm": 13.71811294555664, "learning_rate": 3.328645100797e-05, "loss": 4.1839, "step": 1900 }, { "epoch": 1.2654223347042075, "grad_norm": 13.474448204040527, "learning_rate": 3.2114392873886545e-05, "loss": 4.0813, "step": 2000 }, { "epoch": 1.2654223347042075, "eval_runtime": 33.0889, "eval_samples_per_second": 95.5, "eval_steps_per_second": 11.938, "step": 2000 }, { "epoch": 1.328693451439418, "grad_norm": 15.276654243469238, "learning_rate": 3.09423347398031e-05, "loss": 3.9624, "step": 2100 }, { "epoch": 1.3919645681746282, "grad_norm": 13.796238899230957, "learning_rate": 2.9770276605719643e-05, "loss": 4.0653, "step": 2200 }, { "epoch": 1.4552356849098387, "grad_norm": 16.452594757080078, "learning_rate": 2.8598218471636194e-05, "loss": 4.0338, "step": 2300 }, { "epoch": 1.518506801645049, "grad_norm": 20.27753257751465, "learning_rate": 2.7426160337552742e-05, "loss": 4.0962, "step": 2400 }, { "epoch": 1.5817779183802594, "grad_norm": 15.492554664611816, "learning_rate": 2.6254102203469293e-05, "loss": 4.057, "step": 2500 }, { "epoch": 1.6450490351154698, "grad_norm": 15.298516273498535, "learning_rate": 2.508204406938584e-05, "loss": 4.0301, "step": 2600 }, { "epoch": 1.70832015185068, "grad_norm": 15.670785903930664, "learning_rate": 2.3909985935302392e-05, "loss": 4.0129, "step": 2700 }, { "epoch": 1.7715912685858906, "grad_norm": 18.541555404663086, "learning_rate": 2.2737927801218943e-05, "loss": 3.9724, "step": 2800 }, { "epoch": 1.834862385321101, "grad_norm": 19.13411521911621, "learning_rate": 2.156586966713549e-05, "loss": 4.0044, "step": 2900 }, { "epoch": 1.8981335020563113, "grad_norm": 14.532624244689941, "learning_rate": 2.039381153305204e-05, "loss": 3.9882, "step": 3000 }, { "epoch": 1.8981335020563113, "eval_runtime": 33.1181, "eval_samples_per_second": 95.416, "eval_steps_per_second": 11.927, "step": 3000 }, { "epoch": 1.9614046187915217, "grad_norm": 15.767202377319336, "learning_rate": 1.922175339896859e-05, "loss": 3.9372, "step": 3100 }, { "epoch": 2.024675735526732, "grad_norm": 17.210546493530273, "learning_rate": 1.804969526488514e-05, "loss": 3.9757, "step": 3200 }, { "epoch": 2.0879468522619424, "grad_norm": 15.209254264831543, "learning_rate": 1.6877637130801688e-05, "loss": 3.9668, "step": 3300 }, { "epoch": 2.1512179689971527, "grad_norm": 16.821176528930664, "learning_rate": 1.570557899671824e-05, "loss": 3.9732, "step": 3400 }, { "epoch": 2.2144890857323634, "grad_norm": 15.914960861206055, "learning_rate": 1.4533520862634786e-05, "loss": 3.9375, "step": 3500 }, { "epoch": 2.2777602024675736, "grad_norm": 16.489627838134766, "learning_rate": 1.3361462728551336e-05, "loss": 3.9993, "step": 3600 }, { "epoch": 2.341031319202784, "grad_norm": 17.943021774291992, "learning_rate": 1.2189404594467887e-05, "loss": 3.9889, "step": 3700 }, { "epoch": 2.4043024359379945, "grad_norm": 14.150239944458008, "learning_rate": 1.1017346460384436e-05, "loss": 4.0, "step": 3800 }, { "epoch": 2.4675735526732048, "grad_norm": 15.843707084655762, "learning_rate": 9.845288326300985e-06, "loss": 3.9527, "step": 3900 }, { "epoch": 2.530844669408415, "grad_norm": 16.922142028808594, "learning_rate": 8.673230192217533e-06, "loss": 3.8801, "step": 4000 }, { "epoch": 2.530844669408415, "eval_runtime": 33.0326, "eval_samples_per_second": 95.663, "eval_steps_per_second": 11.958, "step": 4000 }, { "epoch": 2.5941157861436253, "grad_norm": 19.046831130981445, "learning_rate": 7.501172058134085e-06, "loss": 3.9705, "step": 4100 }, { "epoch": 2.657386902878836, "grad_norm": 15.516547203063965, "learning_rate": 6.329113924050633e-06, "loss": 3.9296, "step": 4200 }, { "epoch": 2.720658019614046, "grad_norm": 16.0513916015625, "learning_rate": 5.157055789967183e-06, "loss": 4.0594, "step": 4300 }, { "epoch": 2.7839291363492564, "grad_norm": 13.820748329162598, "learning_rate": 3.984997655883732e-06, "loss": 3.9353, "step": 4400 }, { "epoch": 2.847200253084467, "grad_norm": 14.607758522033691, "learning_rate": 2.8129395218002813e-06, "loss": 4.0605, "step": 4500 }, { "epoch": 2.9104713698196774, "grad_norm": 18.928266525268555, "learning_rate": 1.6408813877168308e-06, "loss": 3.9663, "step": 4600 }, { "epoch": 2.9737424865548876, "grad_norm": 17.247835159301758, "learning_rate": 4.688232536333803e-07, "loss": 3.9057, "step": 4700 } ], "logging_steps": 100, "max_steps": 4740, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2016724755342822.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }