{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0144927536231885, "eval_steps": 1, "global_step": 35, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028985507246376812, "eval_loss": 3.890916585922241, "eval_runtime": 2.366, "eval_samples_per_second": 253.597, "eval_steps_per_second": 31.7, "step": 1 }, { "epoch": 0.057971014492753624, "grad_norm": 251.42497029296223, "learning_rate": 6.666666666666667e-06, "loss": 3.8462, "step": 2 }, { "epoch": 0.057971014492753624, "eval_loss": 3.1606125831604004, "eval_runtime": 2.3735, "eval_samples_per_second": 252.789, "eval_steps_per_second": 31.599, "step": 2 }, { "epoch": 0.08695652173913043, "eval_loss": 1.4003069400787354, "eval_runtime": 2.3791, "eval_samples_per_second": 252.195, "eval_steps_per_second": 31.524, "step": 3 }, { "epoch": 0.11594202898550725, "grad_norm": 174.88885660985272, "learning_rate": 9.994161134161635e-06, "loss": 2.3026, "step": 4 }, { "epoch": 0.11594202898550725, "eval_loss": 0.5247076749801636, "eval_runtime": 2.3571, "eval_samples_per_second": 254.551, "eval_steps_per_second": 31.819, "step": 4 }, { "epoch": 0.14492753623188406, "eval_loss": 0.25349560379981995, "eval_runtime": 2.3667, "eval_samples_per_second": 253.52, "eval_steps_per_second": 31.69, "step": 5 }, { "epoch": 0.17391304347826086, "grad_norm": 9.541839408808285, "learning_rate": 9.947531997255256e-06, "loss": 0.3725, "step": 6 }, { "epoch": 0.17391304347826086, "eval_loss": 0.12238868325948715, "eval_runtime": 2.3709, "eval_samples_per_second": 253.068, "eval_steps_per_second": 31.633, "step": 6 }, { "epoch": 0.2028985507246377, "eval_loss": 0.07106433808803558, "eval_runtime": 2.3595, "eval_samples_per_second": 254.287, "eval_steps_per_second": 31.786, "step": 7 }, { "epoch": 0.2318840579710145, "grad_norm": 8.373395170519098, "learning_rate": 9.854709087130261e-06, "loss": 0.1704, "step": 8 }, { "epoch": 0.2318840579710145, "eval_loss": 0.07050631195306778, "eval_runtime": 2.374, "eval_samples_per_second": 252.739, "eval_steps_per_second": 31.592, "step": 8 }, { "epoch": 0.2608695652173913, "eval_loss": 0.0841919556260109, "eval_runtime": 2.3733, "eval_samples_per_second": 252.81, "eval_steps_per_second": 31.601, "step": 9 }, { "epoch": 0.2898550724637681, "grad_norm": 9.328057178580242, "learning_rate": 9.716559066288716e-06, "loss": 0.0719, "step": 10 }, { "epoch": 0.2898550724637681, "eval_loss": 0.06837386637926102, "eval_runtime": 2.402, "eval_samples_per_second": 249.788, "eval_steps_per_second": 31.223, "step": 10 }, { "epoch": 0.3188405797101449, "eval_loss": 0.08372741937637329, "eval_runtime": 2.3771, "eval_samples_per_second": 252.413, "eval_steps_per_second": 31.552, "step": 11 }, { "epoch": 0.34782608695652173, "grad_norm": 8.195685627940097, "learning_rate": 9.534371804252727e-06, "loss": 0.0719, "step": 12 }, { "epoch": 0.34782608695652173, "eval_loss": 0.07937659323215485, "eval_runtime": 2.3703, "eval_samples_per_second": 253.131, "eval_steps_per_second": 31.641, "step": 12 }, { "epoch": 0.37681159420289856, "eval_loss": 0.06787987053394318, "eval_runtime": 2.3654, "eval_samples_per_second": 253.659, "eval_steps_per_second": 31.707, "step": 13 }, { "epoch": 0.4057971014492754, "grad_norm": 3.0846120042199954, "learning_rate": 9.309848334400247e-06, "loss": 0.0729, "step": 14 }, { "epoch": 0.4057971014492754, "eval_loss": 0.060705069452524185, "eval_runtime": 2.3698, "eval_samples_per_second": 253.186, "eval_steps_per_second": 31.648, "step": 14 }, { "epoch": 0.43478260869565216, "eval_loss": 0.06819155067205429, "eval_runtime": 2.3712, "eval_samples_per_second": 253.037, "eval_steps_per_second": 31.63, "step": 15 }, { "epoch": 0.463768115942029, "grad_norm": 3.7022895578403414, "learning_rate": 9.045084971874738e-06, "loss": 0.0639, "step": 16 }, { "epoch": 0.463768115942029, "eval_loss": 0.06595086306333542, "eval_runtime": 2.3702, "eval_samples_per_second": 253.148, "eval_steps_per_second": 31.643, "step": 16 }, { "epoch": 0.4927536231884058, "eval_loss": 0.06074570491909981, "eval_runtime": 2.3929, "eval_samples_per_second": 250.74, "eval_steps_per_second": 31.342, "step": 17 }, { "epoch": 0.5217391304347826, "grad_norm": 2.6201997383559235, "learning_rate": 8.742553740855507e-06, "loss": 0.0659, "step": 18 }, { "epoch": 0.5217391304347826, "eval_loss": 0.060938794165849686, "eval_runtime": 2.3734, "eval_samples_per_second": 252.797, "eval_steps_per_second": 31.6, "step": 18 }, { "epoch": 0.5507246376811594, "eval_loss": 0.05989724025130272, "eval_runtime": 2.386, "eval_samples_per_second": 251.47, "eval_steps_per_second": 31.434, "step": 19 }, { "epoch": 0.5797101449275363, "grad_norm": 1.5759739495214995, "learning_rate": 8.405079293933986e-06, "loss": 0.0584, "step": 20 }, { "epoch": 0.5797101449275363, "eval_loss": 0.05950001999735832, "eval_runtime": 2.3751, "eval_samples_per_second": 252.625, "eval_steps_per_second": 31.578, "step": 20 }, { "epoch": 0.6086956521739131, "eval_loss": 0.057929884642362595, "eval_runtime": 2.3951, "eval_samples_per_second": 250.515, "eval_steps_per_second": 31.314, "step": 21 }, { "epoch": 0.6376811594202898, "grad_norm": 0.9083257875769617, "learning_rate": 8.035812539093557e-06, "loss": 0.059, "step": 22 }, { "epoch": 0.6376811594202898, "eval_loss": 0.05716191604733467, "eval_runtime": 2.3793, "eval_samples_per_second": 252.176, "eval_steps_per_second": 31.522, "step": 22 }, { "epoch": 0.6666666666666666, "eval_loss": 0.05785393714904785, "eval_runtime": 2.3743, "eval_samples_per_second": 252.704, "eval_steps_per_second": 31.588, "step": 23 }, { "epoch": 0.6956521739130435, "grad_norm": 9.258583060973042, "learning_rate": 7.638201220530664e-06, "loss": 0.1069, "step": 24 }, { "epoch": 0.6956521739130435, "eval_loss": 0.06170507147908211, "eval_runtime": 2.3968, "eval_samples_per_second": 250.337, "eval_steps_per_second": 31.292, "step": 24 }, { "epoch": 0.7246376811594203, "eval_loss": 0.06007671728730202, "eval_runtime": 2.375, "eval_samples_per_second": 252.631, "eval_steps_per_second": 31.579, "step": 25 }, { "epoch": 0.7536231884057971, "grad_norm": 2.788879674143748, "learning_rate": 7.215957727996208e-06, "loss": 0.0585, "step": 26 }, { "epoch": 0.7536231884057971, "eval_loss": 0.05631522089242935, "eval_runtime": 2.4038, "eval_samples_per_second": 249.609, "eval_steps_per_second": 31.201, "step": 26 }, { "epoch": 0.782608695652174, "eval_loss": 0.05981193110346794, "eval_runtime": 2.3841, "eval_samples_per_second": 251.665, "eval_steps_per_second": 31.458, "step": 27 }, { "epoch": 0.8115942028985508, "grad_norm": 3.982184927790719, "learning_rate": 6.773024435212678e-06, "loss": 0.097, "step": 28 }, { "epoch": 0.8115942028985508, "eval_loss": 0.05898861214518547, "eval_runtime": 2.3921, "eval_samples_per_second": 250.824, "eval_steps_per_second": 31.353, "step": 28 }, { "epoch": 0.8405797101449275, "eval_loss": 0.05481765791773796, "eval_runtime": 2.3767, "eval_samples_per_second": 252.451, "eval_steps_per_second": 31.556, "step": 29 }, { "epoch": 0.8695652173913043, "grad_norm": 0.18833058180333875, "learning_rate": 6.313536890992935e-06, "loss": 0.059, "step": 30 }, { "epoch": 0.8695652173913043, "eval_loss": 0.05593809857964516, "eval_runtime": 2.3764, "eval_samples_per_second": 252.478, "eval_steps_per_second": 31.56, "step": 30 }, { "epoch": 0.8985507246376812, "eval_loss": 0.05695917829871178, "eval_runtime": 2.39, "eval_samples_per_second": 251.049, "eval_steps_per_second": 31.381, "step": 31 }, { "epoch": 0.927536231884058, "grad_norm": 3.4944330077548207, "learning_rate": 5.841785206735192e-06, "loss": 0.0695, "step": 32 }, { "epoch": 0.927536231884058, "eval_loss": 0.05482754111289978, "eval_runtime": 2.3734, "eval_samples_per_second": 252.799, "eval_steps_per_second": 31.6, "step": 32 }, { "epoch": 0.9565217391304348, "eval_loss": 0.055433232337236404, "eval_runtime": 2.3729, "eval_samples_per_second": 252.86, "eval_steps_per_second": 31.607, "step": 33 }, { "epoch": 0.9855072463768116, "grad_norm": 2.742927364863374, "learning_rate": 5.362174000808813e-06, "loss": 0.0533, "step": 34 }, { "epoch": 0.9855072463768116, "eval_loss": 0.05639192834496498, "eval_runtime": 2.3727, "eval_samples_per_second": 252.873, "eval_steps_per_second": 31.609, "step": 34 }, { "epoch": 1.0144927536231885, "eval_loss": 0.054112281650304794, "eval_runtime": 2.37, "eval_samples_per_second": 253.168, "eval_steps_per_second": 31.646, "step": 35 } ], "logging_steps": 2, "max_steps": 68, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 18479969206272.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }