{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9420289855072463, "eval_steps": 1, "global_step": 68, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028985507246376812, "eval_loss": 3.890916585922241, "eval_runtime": 2.366, "eval_samples_per_second": 253.597, "eval_steps_per_second": 31.7, "step": 1 }, { "epoch": 0.057971014492753624, "grad_norm": 251.42497029296223, "learning_rate": 6.666666666666667e-06, "loss": 3.8462, "step": 2 }, { "epoch": 0.057971014492753624, "eval_loss": 3.1606125831604004, "eval_runtime": 2.3735, "eval_samples_per_second": 252.789, "eval_steps_per_second": 31.599, "step": 2 }, { "epoch": 0.08695652173913043, "eval_loss": 1.4003069400787354, "eval_runtime": 2.3791, "eval_samples_per_second": 252.195, "eval_steps_per_second": 31.524, "step": 3 }, { "epoch": 0.11594202898550725, "grad_norm": 174.88885660985272, "learning_rate": 9.994161134161635e-06, "loss": 2.3026, "step": 4 }, { "epoch": 0.11594202898550725, "eval_loss": 0.5247076749801636, "eval_runtime": 2.3571, "eval_samples_per_second": 254.551, "eval_steps_per_second": 31.819, "step": 4 }, { "epoch": 0.14492753623188406, "eval_loss": 0.25349560379981995, "eval_runtime": 2.3667, "eval_samples_per_second": 253.52, "eval_steps_per_second": 31.69, "step": 5 }, { "epoch": 0.17391304347826086, "grad_norm": 9.541839408808285, "learning_rate": 9.947531997255256e-06, "loss": 0.3725, "step": 6 }, { "epoch": 0.17391304347826086, "eval_loss": 0.12238868325948715, "eval_runtime": 2.3709, "eval_samples_per_second": 253.068, "eval_steps_per_second": 31.633, "step": 6 }, { "epoch": 0.2028985507246377, "eval_loss": 0.07106433808803558, "eval_runtime": 2.3595, "eval_samples_per_second": 254.287, "eval_steps_per_second": 31.786, "step": 7 }, { "epoch": 0.2318840579710145, "grad_norm": 8.373395170519098, "learning_rate": 9.854709087130261e-06, "loss": 0.1704, "step": 8 }, { "epoch": 0.2318840579710145, "eval_loss": 0.07050631195306778, "eval_runtime": 2.374, "eval_samples_per_second": 252.739, "eval_steps_per_second": 31.592, "step": 8 }, { "epoch": 0.2608695652173913, "eval_loss": 0.0841919556260109, "eval_runtime": 2.3733, "eval_samples_per_second": 252.81, "eval_steps_per_second": 31.601, "step": 9 }, { "epoch": 0.2898550724637681, "grad_norm": 9.328057178580242, "learning_rate": 9.716559066288716e-06, "loss": 0.0719, "step": 10 }, { "epoch": 0.2898550724637681, "eval_loss": 0.06837386637926102, "eval_runtime": 2.402, "eval_samples_per_second": 249.788, "eval_steps_per_second": 31.223, "step": 10 }, { "epoch": 0.3188405797101449, "eval_loss": 0.08372741937637329, "eval_runtime": 2.3771, "eval_samples_per_second": 252.413, "eval_steps_per_second": 31.552, "step": 11 }, { "epoch": 0.34782608695652173, "grad_norm": 8.195685627940097, "learning_rate": 9.534371804252727e-06, "loss": 0.0719, "step": 12 }, { "epoch": 0.34782608695652173, "eval_loss": 0.07937659323215485, "eval_runtime": 2.3703, "eval_samples_per_second": 253.131, "eval_steps_per_second": 31.641, "step": 12 }, { "epoch": 0.37681159420289856, "eval_loss": 0.06787987053394318, "eval_runtime": 2.3654, "eval_samples_per_second": 253.659, "eval_steps_per_second": 31.707, "step": 13 }, { "epoch": 0.4057971014492754, "grad_norm": 3.0846120042199954, "learning_rate": 9.309848334400247e-06, "loss": 0.0729, "step": 14 }, { "epoch": 0.4057971014492754, "eval_loss": 0.060705069452524185, "eval_runtime": 2.3698, "eval_samples_per_second": 253.186, "eval_steps_per_second": 31.648, "step": 14 }, { "epoch": 0.43478260869565216, "eval_loss": 0.06819155067205429, "eval_runtime": 2.3712, "eval_samples_per_second": 253.037, "eval_steps_per_second": 31.63, "step": 15 }, { "epoch": 0.463768115942029, "grad_norm": 3.7022895578403414, "learning_rate": 9.045084971874738e-06, "loss": 0.0639, "step": 16 }, { "epoch": 0.463768115942029, "eval_loss": 0.06595086306333542, "eval_runtime": 2.3702, "eval_samples_per_second": 253.148, "eval_steps_per_second": 31.643, "step": 16 }, { "epoch": 0.4927536231884058, "eval_loss": 0.06074570491909981, "eval_runtime": 2.3929, "eval_samples_per_second": 250.74, "eval_steps_per_second": 31.342, "step": 17 }, { "epoch": 0.5217391304347826, "grad_norm": 2.6201997383559235, "learning_rate": 8.742553740855507e-06, "loss": 0.0659, "step": 18 }, { "epoch": 0.5217391304347826, "eval_loss": 0.060938794165849686, "eval_runtime": 2.3734, "eval_samples_per_second": 252.797, "eval_steps_per_second": 31.6, "step": 18 }, { "epoch": 0.5507246376811594, "eval_loss": 0.05989724025130272, "eval_runtime": 2.386, "eval_samples_per_second": 251.47, "eval_steps_per_second": 31.434, "step": 19 }, { "epoch": 0.5797101449275363, "grad_norm": 1.5759739495214995, "learning_rate": 8.405079293933986e-06, "loss": 0.0584, "step": 20 }, { "epoch": 0.5797101449275363, "eval_loss": 0.05950001999735832, "eval_runtime": 2.3751, "eval_samples_per_second": 252.625, "eval_steps_per_second": 31.578, "step": 20 }, { "epoch": 0.6086956521739131, "eval_loss": 0.057929884642362595, "eval_runtime": 2.3951, "eval_samples_per_second": 250.515, "eval_steps_per_second": 31.314, "step": 21 }, { "epoch": 0.6376811594202898, "grad_norm": 0.9083257875769617, "learning_rate": 8.035812539093557e-06, "loss": 0.059, "step": 22 }, { "epoch": 0.6376811594202898, "eval_loss": 0.05716191604733467, "eval_runtime": 2.3793, "eval_samples_per_second": 252.176, "eval_steps_per_second": 31.522, "step": 22 }, { "epoch": 0.6666666666666666, "eval_loss": 0.05785393714904785, "eval_runtime": 2.3743, "eval_samples_per_second": 252.704, "eval_steps_per_second": 31.588, "step": 23 }, { "epoch": 0.6956521739130435, "grad_norm": 9.258583060973042, "learning_rate": 7.638201220530664e-06, "loss": 0.1069, "step": 24 }, { "epoch": 0.6956521739130435, "eval_loss": 0.06170507147908211, "eval_runtime": 2.3968, "eval_samples_per_second": 250.337, "eval_steps_per_second": 31.292, "step": 24 }, { "epoch": 0.7246376811594203, "eval_loss": 0.06007671728730202, "eval_runtime": 2.375, "eval_samples_per_second": 252.631, "eval_steps_per_second": 31.579, "step": 25 }, { "epoch": 0.7536231884057971, "grad_norm": 2.788879674143748, "learning_rate": 7.215957727996208e-06, "loss": 0.0585, "step": 26 }, { "epoch": 0.7536231884057971, "eval_loss": 0.05631522089242935, "eval_runtime": 2.4038, "eval_samples_per_second": 249.609, "eval_steps_per_second": 31.201, "step": 26 }, { "epoch": 0.782608695652174, "eval_loss": 0.05981193110346794, "eval_runtime": 2.3841, "eval_samples_per_second": 251.665, "eval_steps_per_second": 31.458, "step": 27 }, { "epoch": 0.8115942028985508, "grad_norm": 3.982184927790719, "learning_rate": 6.773024435212678e-06, "loss": 0.097, "step": 28 }, { "epoch": 0.8115942028985508, "eval_loss": 0.05898861214518547, "eval_runtime": 2.3921, "eval_samples_per_second": 250.824, "eval_steps_per_second": 31.353, "step": 28 }, { "epoch": 0.8405797101449275, "eval_loss": 0.05481765791773796, "eval_runtime": 2.3767, "eval_samples_per_second": 252.451, "eval_steps_per_second": 31.556, "step": 29 }, { "epoch": 0.8695652173913043, "grad_norm": 0.18833058180333875, "learning_rate": 6.313536890992935e-06, "loss": 0.059, "step": 30 }, { "epoch": 0.8695652173913043, "eval_loss": 0.05593809857964516, "eval_runtime": 2.3764, "eval_samples_per_second": 252.478, "eval_steps_per_second": 31.56, "step": 30 }, { "epoch": 0.8985507246376812, "eval_loss": 0.05695917829871178, "eval_runtime": 2.39, "eval_samples_per_second": 251.049, "eval_steps_per_second": 31.381, "step": 31 }, { "epoch": 0.927536231884058, "grad_norm": 3.4944330077548207, "learning_rate": 5.841785206735192e-06, "loss": 0.0695, "step": 32 }, { "epoch": 0.927536231884058, "eval_loss": 0.05482754111289978, "eval_runtime": 2.3734, "eval_samples_per_second": 252.799, "eval_steps_per_second": 31.6, "step": 32 }, { "epoch": 0.9565217391304348, "eval_loss": 0.055433232337236404, "eval_runtime": 2.3729, "eval_samples_per_second": 252.86, "eval_steps_per_second": 31.607, "step": 33 }, { "epoch": 0.9855072463768116, "grad_norm": 2.742927364863374, "learning_rate": 5.362174000808813e-06, "loss": 0.0533, "step": 34 }, { "epoch": 0.9855072463768116, "eval_loss": 0.05639192834496498, "eval_runtime": 2.3727, "eval_samples_per_second": 252.873, "eval_steps_per_second": 31.609, "step": 34 }, { "epoch": 1.0144927536231885, "eval_loss": 0.054112281650304794, "eval_runtime": 2.37, "eval_samples_per_second": 253.168, "eval_steps_per_second": 31.646, "step": 35 }, { "epoch": 1.0144927536231885, "grad_norm": 1.5488276127209792, "learning_rate": 4.87918127381934e-06, "loss": 0.0544, "step": 36 }, { "epoch": 1.0144927536231885, "eval_loss": 0.0547836609184742, "eval_runtime": 2.372, "eval_samples_per_second": 252.956, "eval_steps_per_second": 31.619, "step": 36 }, { "epoch": 1.0434782608695652, "eval_loss": 0.05551725998520851, "eval_runtime": 2.376, "eval_samples_per_second": 252.528, "eval_steps_per_second": 31.566, "step": 37 }, { "epoch": 1.0724637681159421, "grad_norm": 2.1899578742270838, "learning_rate": 4.397316598723385e-06, "loss": 0.0555, "step": 38 }, { "epoch": 1.0724637681159421, "eval_loss": 0.05312129110097885, "eval_runtime": 2.3838, "eval_samples_per_second": 251.695, "eval_steps_per_second": 31.462, "step": 38 }, { "epoch": 1.1014492753623188, "eval_loss": 0.053158555179834366, "eval_runtime": 2.3919, "eval_samples_per_second": 250.844, "eval_steps_per_second": 31.355, "step": 39 }, { "epoch": 1.1304347826086956, "grad_norm": 2.1733146824122724, "learning_rate": 3.92107901616097e-06, "loss": 0.0524, "step": 40 }, { "epoch": 1.1304347826086956, "eval_loss": 0.05355316773056984, "eval_runtime": 2.3826, "eval_samples_per_second": 251.826, "eval_steps_per_second": 31.478, "step": 40 }, { "epoch": 1.1594202898550725, "eval_loss": 0.05187664180994034, "eval_runtime": 2.3848, "eval_samples_per_second": 251.594, "eval_steps_per_second": 31.449, "step": 41 }, { "epoch": 1.1884057971014492, "grad_norm": 2.315639907284586, "learning_rate": 3.4549150281252635e-06, "loss": 0.0641, "step": 42 }, { "epoch": 1.1884057971014492, "eval_loss": 0.05204891413450241, "eval_runtime": 2.3997, "eval_samples_per_second": 250.028, "eval_steps_per_second": 31.254, "step": 42 }, { "epoch": 1.2173913043478262, "eval_loss": 0.052227165549993515, "eval_runtime": 2.3864, "eval_samples_per_second": 251.429, "eval_steps_per_second": 31.429, "step": 43 }, { "epoch": 1.2463768115942029, "grad_norm": 1.5853052624042796, "learning_rate": 3.0031770821715233e-06, "loss": 0.0494, "step": 44 }, { "epoch": 1.2463768115942029, "eval_loss": 0.05136393383145332, "eval_runtime": 2.3856, "eval_samples_per_second": 251.513, "eval_steps_per_second": 31.439, "step": 44 }, { "epoch": 1.2753623188405796, "eval_loss": 0.051076941192150116, "eval_runtime": 2.3923, "eval_samples_per_second": 250.8, "eval_steps_per_second": 31.35, "step": 45 }, { "epoch": 1.3043478260869565, "grad_norm": 0.2342975899295018, "learning_rate": 2.57008293378697e-06, "loss": 0.0502, "step": 46 }, { "epoch": 1.3043478260869565, "eval_loss": 0.05139908567070961, "eval_runtime": 2.3812, "eval_samples_per_second": 251.974, "eval_steps_per_second": 31.497, "step": 46 }, { "epoch": 1.3333333333333333, "eval_loss": 0.05105065554380417, "eval_runtime": 2.4018, "eval_samples_per_second": 249.811, "eval_steps_per_second": 31.226, "step": 47 }, { "epoch": 1.3623188405797102, "grad_norm": 1.150174671283694, "learning_rate": 2.159676266344222e-06, "loss": 0.0482, "step": 48 }, { "epoch": 1.3623188405797102, "eval_loss": 0.050515007227659225, "eval_runtime": 2.3849, "eval_samples_per_second": 251.588, "eval_steps_per_second": 31.449, "step": 48 }, { "epoch": 1.391304347826087, "eval_loss": 0.05112989619374275, "eval_runtime": 2.3861, "eval_samples_per_second": 251.452, "eval_steps_per_second": 31.432, "step": 49 }, { "epoch": 1.4202898550724639, "grad_norm": 0.9809479332869758, "learning_rate": 1.7757889363191484e-06, "loss": 0.0472, "step": 50 }, { "epoch": 1.4202898550724639, "eval_loss": 0.050852496176958084, "eval_runtime": 2.3888, "eval_samples_per_second": 251.167, "eval_steps_per_second": 31.396, "step": 50 }, { "epoch": 1.4492753623188406, "eval_loss": 0.04979565367102623, "eval_runtime": 2.3955, "eval_samples_per_second": 250.469, "eval_steps_per_second": 31.309, "step": 51 }, { "epoch": 1.4782608695652173, "grad_norm": 0.7181974427552101, "learning_rate": 1.4220051962793952e-06, "loss": 0.0478, "step": 52 }, { "epoch": 1.4782608695652173, "eval_loss": 0.04979529604315758, "eval_runtime": 2.3784, "eval_samples_per_second": 252.271, "eval_steps_per_second": 31.534, "step": 52 }, { "epoch": 1.5072463768115942, "eval_loss": 0.050219178199768066, "eval_runtime": 2.3756, "eval_samples_per_second": 252.57, "eval_steps_per_second": 31.571, "step": 53 }, { "epoch": 1.5362318840579712, "grad_norm": 1.7774492629444423, "learning_rate": 1.1016282296838887e-06, "loss": 0.055, "step": 54 }, { "epoch": 1.5362318840579712, "eval_loss": 0.04986535757780075, "eval_runtime": 2.3832, "eval_samples_per_second": 251.765, "eval_steps_per_second": 31.471, "step": 54 }, { "epoch": 1.5652173913043477, "eval_loss": 0.04925783351063728, "eval_runtime": 2.3863, "eval_samples_per_second": 251.431, "eval_steps_per_second": 31.429, "step": 55 }, { "epoch": 1.5942028985507246, "grad_norm": 0.7119057580240911, "learning_rate": 8.176493099488664e-07, "loss": 0.0459, "step": 56 }, { "epoch": 1.5942028985507246, "eval_loss": 0.049321793019771576, "eval_runtime": 2.3816, "eval_samples_per_second": 251.931, "eval_steps_per_second": 31.491, "step": 56 }, { "epoch": 1.6231884057971016, "eval_loss": 0.049656953662633896, "eval_runtime": 2.3857, "eval_samples_per_second": 251.499, "eval_steps_per_second": 31.437, "step": 57 }, { "epoch": 1.6521739130434783, "grad_norm": 1.6066866518550498, "learning_rate": 5.727198717339511e-07, "loss": 0.0492, "step": 58 }, { "epoch": 1.6521739130434783, "eval_loss": 0.04972606897354126, "eval_runtime": 2.3792, "eval_samples_per_second": 252.181, "eval_steps_per_second": 31.523, "step": 58 }, { "epoch": 1.681159420289855, "eval_loss": 0.04940681904554367, "eval_runtime": 2.3921, "eval_samples_per_second": 250.828, "eval_steps_per_second": 31.354, "step": 59 }, { "epoch": 1.710144927536232, "grad_norm": 1.5684134897039972, "learning_rate": 3.691267552111183e-07, "loss": 0.0504, "step": 60 }, { "epoch": 1.710144927536232, "eval_loss": 0.04902585968375206, "eval_runtime": 2.3816, "eval_samples_per_second": 251.932, "eval_steps_per_second": 31.491, "step": 60 }, { "epoch": 1.7391304347826086, "eval_loss": 0.04878399893641472, "eval_runtime": 2.3923, "eval_samples_per_second": 250.809, "eval_steps_per_second": 31.351, "step": 61 }, { "epoch": 1.7681159420289854, "grad_norm": 3.0252941822207946, "learning_rate": 2.0877085445416889e-07, "loss": 0.0564, "step": 62 }, { "epoch": 1.7681159420289854, "eval_loss": 0.04881977662444115, "eval_runtime": 2.3726, "eval_samples_per_second": 252.892, "eval_steps_per_second": 31.611, "step": 62 }, { "epoch": 1.7971014492753623, "eval_loss": 0.04876958206295967, "eval_runtime": 2.383, "eval_samples_per_second": 251.783, "eval_steps_per_second": 31.473, "step": 63 }, { "epoch": 1.8260869565217392, "grad_norm": 0.5275348821410851, "learning_rate": 9.314936930293283e-08, "loss": 0.0503, "step": 64 }, { "epoch": 1.8260869565217392, "eval_loss": 0.048753608018159866, "eval_runtime": 2.3935, "eval_samples_per_second": 250.679, "eval_steps_per_second": 31.335, "step": 64 }, { "epoch": 1.855072463768116, "eval_loss": 0.04874425381422043, "eval_runtime": 2.3788, "eval_samples_per_second": 252.224, "eval_steps_per_second": 31.528, "step": 65 }, { "epoch": 1.8840579710144927, "grad_norm": 0.7635061728851181, "learning_rate": 2.3341826411756863e-08, "loss": 0.0495, "step": 66 }, { "epoch": 1.8840579710144927, "eval_loss": 0.048726994544267654, "eval_runtime": 2.3833, "eval_samples_per_second": 251.751, "eval_steps_per_second": 31.469, "step": 66 }, { "epoch": 1.9130434782608696, "eval_loss": 0.048674505203962326, "eval_runtime": 2.3749, "eval_samples_per_second": 252.645, "eval_steps_per_second": 31.581, "step": 67 }, { "epoch": 1.9420289855072463, "grad_norm": 0.4446688344675618, "learning_rate": 0.0, "loss": 0.0446, "step": 68 }, { "epoch": 1.9420289855072463, "eval_loss": 0.048731766641139984, "eval_runtime": 2.3934, "eval_samples_per_second": 250.692, "eval_steps_per_second": 31.336, "step": 68 } ], "logging_steps": 2, "max_steps": 68, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 35518238687232.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }