{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9899159663865547, "eval_steps": 500, "global_step": 222, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06722689075630252, "grad_norm": 0.38315412402153015, "learning_rate": 4.99374449148625e-05, "loss": 1.369, "num_input_tokens_seen": 299568, "step": 5 }, { "epoch": 0.13445378151260504, "grad_norm": 0.22612476348876953, "learning_rate": 4.975009271054409e-05, "loss": 1.3378, "num_input_tokens_seen": 599264, "step": 10 }, { "epoch": 0.20168067226890757, "grad_norm": 0.2075585424900055, "learning_rate": 4.943888097369216e-05, "loss": 1.3046, "num_input_tokens_seen": 898016, "step": 15 }, { "epoch": 0.2689075630252101, "grad_norm": 0.1732514202594757, "learning_rate": 4.9005367134442235e-05, "loss": 1.2645, "num_input_tokens_seen": 1194592, "step": 20 }, { "epoch": 0.33613445378151263, "grad_norm": 0.1338847130537033, "learning_rate": 4.845172067240415e-05, "loss": 1.2228, "num_input_tokens_seen": 1497584, "step": 25 }, { "epoch": 0.40336134453781514, "grad_norm": 0.13163860142230988, "learning_rate": 4.77807122597034e-05, "loss": 1.1848, "num_input_tokens_seen": 1798048, "step": 30 }, { "epoch": 0.47058823529411764, "grad_norm": 0.1262914389371872, "learning_rate": 4.699569989541074e-05, "loss": 1.2055, "num_input_tokens_seen": 2096432, "step": 35 }, { "epoch": 0.5378151260504201, "grad_norm": 0.12664474546909332, "learning_rate": 4.6100612100748765e-05, "loss": 1.1743, "num_input_tokens_seen": 2397040, "step": 40 }, { "epoch": 0.6050420168067226, "grad_norm": 0.11807628720998764, "learning_rate": 4.5099928259173516e-05, "loss": 1.1572, "num_input_tokens_seen": 2698704, "step": 45 }, { "epoch": 0.6722689075630253, "grad_norm": 0.11550261825323105, "learning_rate": 4.3998656199717435e-05, "loss": 1.1726, "num_input_tokens_seen": 2995808, "step": 50 }, { "epoch": 0.7394957983193278, "grad_norm": 0.11989202350378036, "learning_rate": 4.280230713577564e-05, "loss": 1.1402, "num_input_tokens_seen": 3292272, "step": 55 }, { "epoch": 0.8067226890756303, "grad_norm": 0.12094570696353912, "learning_rate": 4.151686808475204e-05, "loss": 1.1252, "num_input_tokens_seen": 3591104, "step": 60 }, { "epoch": 0.8739495798319328, "grad_norm": 0.12436824291944504, "learning_rate": 4.0148771906588706e-05, "loss": 1.1096, "num_input_tokens_seen": 3886624, "step": 65 }, { "epoch": 0.9411764705882353, "grad_norm": 0.12026601284742355, "learning_rate": 3.8704865111117746e-05, "loss": 1.1523, "num_input_tokens_seen": 4181408, "step": 70 }, { "epoch": 1.0084033613445378, "grad_norm": 0.127424955368042, "learning_rate": 3.719237359534087e-05, "loss": 1.1077, "num_input_tokens_seen": 4480512, "step": 75 }, { "epoch": 1.0756302521008403, "grad_norm": 0.12994937598705292, "learning_rate": 3.56188664821012e-05, "loss": 1.0889, "num_input_tokens_seen": 4779072, "step": 80 }, { "epoch": 1.1428571428571428, "grad_norm": 0.13282892107963562, "learning_rate": 3.39922182411134e-05, "loss": 1.0952, "num_input_tokens_seen": 5078288, "step": 85 }, { "epoch": 1.2100840336134453, "grad_norm": 0.12781143188476562, "learning_rate": 3.232056928191376e-05, "loss": 1.0983, "num_input_tokens_seen": 5377136, "step": 90 }, { "epoch": 1.2773109243697478, "grad_norm": 0.149738147854805, "learning_rate": 3.061228521593931e-05, "loss": 1.0956, "num_input_tokens_seen": 5672784, "step": 95 }, { "epoch": 1.3445378151260505, "grad_norm": 0.14856387674808502, "learning_rate": 2.8875914991604948e-05, "loss": 1.08, "num_input_tokens_seen": 5974880, "step": 100 }, { "epoch": 1.4168067226890757, "grad_norm": 0.14656391739845276, "learning_rate": 2.7120148111887732e-05, "loss": 1.0489, "num_input_tokens_seen": 6273616, "step": 105 }, { "epoch": 1.4840336134453782, "grad_norm": 0.1688816398382187, "learning_rate": 2.5353771148519057e-05, "loss": 1.0607, "num_input_tokens_seen": 6574224, "step": 110 }, { "epoch": 1.5512605042016807, "grad_norm": 0.1456299126148224, "learning_rate": 2.358562377040519e-05, "loss": 1.0722, "num_input_tokens_seen": 6869008, "step": 115 }, { "epoch": 1.6184873949579832, "grad_norm": 0.16043561697006226, "learning_rate": 2.182455450632803e-05, "loss": 1.0662, "num_input_tokens_seen": 7167120, "step": 120 }, { "epoch": 1.6857142857142857, "grad_norm": 0.1590069830417633, "learning_rate": 2.0079376463307368e-05, "loss": 1.0548, "num_input_tokens_seen": 7465408, "step": 125 }, { "epoch": 1.7529411764705882, "grad_norm": 0.15698856115341187, "learning_rate": 1.8358823222228097e-05, "loss": 1.0645, "num_input_tokens_seen": 7764608, "step": 130 }, { "epoch": 1.8201680672268907, "grad_norm": 0.1606791466474533, "learning_rate": 1.667150513144856e-05, "loss": 1.0671, "num_input_tokens_seen": 8062192, "step": 135 }, { "epoch": 1.8873949579831932, "grad_norm": 0.16018208861351013, "learning_rate": 1.5025866217114592e-05, "loss": 1.0775, "num_input_tokens_seen": 8359072, "step": 140 }, { "epoch": 1.954621848739496, "grad_norm": 0.16284871101379395, "learning_rate": 1.3430141925817532e-05, "loss": 1.0781, "num_input_tokens_seen": 8655680, "step": 145 }, { "epoch": 2.0218487394957982, "grad_norm": 0.15517888963222504, "learning_rate": 1.1892317911069212e-05, "loss": 1.0598, "num_input_tokens_seen": 8952864, "step": 150 }, { "epoch": 2.089075630252101, "grad_norm": 0.18014486134052277, "learning_rate": 1.0420090069843167e-05, "loss": 1.0507, "num_input_tokens_seen": 9250368, "step": 155 }, { "epoch": 2.1563025210084033, "grad_norm": 0.1607990264892578, "learning_rate": 9.020826029175384e-06, "loss": 1.0532, "num_input_tokens_seen": 9552944, "step": 160 }, { "epoch": 2.223529411764706, "grad_norm": 0.16465440392494202, "learning_rate": 7.701528275561348e-06, "loss": 1.0589, "num_input_tokens_seen": 9849888, "step": 165 }, { "epoch": 2.2907563025210083, "grad_norm": 0.17045491933822632, "learning_rate": 6.468799111665003e-06, "loss": 1.034, "num_input_tokens_seen": 10149488, "step": 170 }, { "epoch": 2.357983193277311, "grad_norm": 0.16000358760356903, "learning_rate": 5.328807615710246e-06, "loss": 1.0442, "num_input_tokens_seen": 10448208, "step": 175 }, { "epoch": 2.4252100840336133, "grad_norm": 0.16633416712284088, "learning_rate": 4.2872587689039484e-06, "loss": 1.0579, "num_input_tokens_seen": 10744608, "step": 180 }, { "epoch": 2.492436974789916, "grad_norm": 0.15942485630512238, "learning_rate": 3.3493649053890326e-06, "loss": 1.0397, "num_input_tokens_seen": 11044688, "step": 185 }, { "epoch": 2.5596638655462183, "grad_norm": 0.1655404269695282, "learning_rate": 2.5198196276040782e-06, "loss": 1.0249, "num_input_tokens_seen": 11343152, "step": 190 }, { "epoch": 2.626890756302521, "grad_norm": 0.16440080106258392, "learning_rate": 1.8027743175872664e-06, "loss": 1.0343, "num_input_tokens_seen": 11645792, "step": 195 }, { "epoch": 2.6941176470588237, "grad_norm": 0.17174942791461945, "learning_rate": 1.201817361771837e-06, "loss": 1.045, "num_input_tokens_seen": 11942320, "step": 200 }, { "epoch": 2.761344537815126, "grad_norm": 0.16318759322166443, "learning_rate": 7.199561932405952e-07, "loss": 1.0425, "num_input_tokens_seen": 12239184, "step": 205 }, { "epoch": 2.8285714285714287, "grad_norm": 0.17799663543701172, "learning_rate": 3.5960224130728857e-07, "loss": 1.0649, "num_input_tokens_seen": 12534704, "step": 210 }, { "epoch": 2.895798319327731, "grad_norm": 0.1608886420726776, "learning_rate": 1.2255886374334946e-07, "loss": 1.0439, "num_input_tokens_seen": 12838560, "step": 215 }, { "epoch": 2.9630252100840337, "grad_norm": 0.16979870200157166, "learning_rate": 1.0012322041960676e-08, "loss": 1.0382, "num_input_tokens_seen": 13133488, "step": 220 }, { "epoch": 2.9899159663865547, "num_input_tokens_seen": 13254560, "step": 222, "total_flos": 2.251343464014807e+17, "train_loss": 0.5790731208818453, "train_runtime": 983.1045, "train_samples_per_second": 29.011, "train_steps_per_second": 0.226 } ], "logging_steps": 5, "max_steps": 222, "num_input_tokens_seen": 13254560, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.251343464014807e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }