{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9574468085106385, "eval_steps": 500, "global_step": 372, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05319148936170213, "grad_norm": 2.453125, "learning_rate": 7.935238627346029e-06, "loss": 1.0178, "step": 5 }, { "epoch": 0.10638297872340426, "grad_norm": 1.90625, "learning_rate": 1.7854286911528565e-05, "loss": 0.9868, "step": 10 }, { "epoch": 0.1595744680851064, "grad_norm": 1.71875, "learning_rate": 2.7773335195711106e-05, "loss": 0.9207, "step": 15 }, { "epoch": 0.2127659574468085, "grad_norm": 1.90625, "learning_rate": 3.769238347989364e-05, "loss": 0.9234, "step": 20 }, { "epoch": 0.26595744680851063, "grad_norm": 1.7109375, "learning_rate": 4.7611431764076175e-05, "loss": 0.8535, "step": 25 }, { "epoch": 0.3191489361702128, "grad_norm": 1.7890625, "learning_rate": 5.753048004825872e-05, "loss": 0.8737, "step": 30 }, { "epoch": 0.3723404255319149, "grad_norm": 1.765625, "learning_rate": 6.744952833244124e-05, "loss": 0.8314, "step": 35 }, { "epoch": 0.425531914893617, "grad_norm": 1.75, "learning_rate": 6.941566007038542e-05, "loss": 0.8324, "step": 40 }, { "epoch": 0.4787234042553192, "grad_norm": 1.5625, "learning_rate": 6.934388466345933e-05, "loss": 0.8325, "step": 45 }, { "epoch": 0.5319148936170213, "grad_norm": 1.484375, "learning_rate": 6.921705905677817e-05, "loss": 0.7441, "step": 50 }, { "epoch": 0.5851063829787234, "grad_norm": 1.4140625, "learning_rate": 6.903545231778275e-05, "loss": 0.8058, "step": 55 }, { "epoch": 0.6382978723404256, "grad_norm": 1.515625, "learning_rate": 6.879944973507346e-05, "loss": 0.8016, "step": 60 }, { "epoch": 0.6914893617021277, "grad_norm": 1.390625, "learning_rate": 6.850955200099949e-05, "loss": 0.7364, "step": 65 }, { "epoch": 0.7446808510638298, "grad_norm": 1.4921875, "learning_rate": 6.816637414941277e-05, "loss": 0.7176, "step": 70 }, { "epoch": 0.7978723404255319, "grad_norm": 1.28125, "learning_rate": 6.777064425084002e-05, "loss": 0.6915, "step": 75 }, { "epoch": 0.851063829787234, "grad_norm": 1.21875, "learning_rate": 6.732320186784154e-05, "loss": 0.6654, "step": 80 }, { "epoch": 0.9042553191489362, "grad_norm": 1.265625, "learning_rate": 6.682499627383332e-05, "loss": 0.6093, "step": 85 }, { "epoch": 0.9574468085106383, "grad_norm": 1.4921875, "learning_rate": 6.627708443915185e-05, "loss": 0.597, "step": 90 }, { "epoch": 0.9893617021276596, "eval_loss": 0.6273989677429199, "eval_runtime": 18.5736, "eval_samples_per_second": 15.721, "eval_steps_per_second": 7.861, "step": 93 }, { "epoch": 1.0106382978723405, "grad_norm": 1.046875, "learning_rate": 6.56806287886338e-05, "loss": 0.5524, "step": 95 }, { "epoch": 1.0638297872340425, "grad_norm": 1.5859375, "learning_rate": 6.503689473546842e-05, "loss": 0.368, "step": 100 }, { "epoch": 1.1170212765957448, "grad_norm": 1.0859375, "learning_rate": 6.434724799655444e-05, "loss": 0.4125, "step": 105 }, { "epoch": 1.1702127659574468, "grad_norm": 2.484375, "learning_rate": 6.361315169505714e-05, "loss": 0.4224, "step": 110 }, { "epoch": 1.2234042553191489, "grad_norm": 1.265625, "learning_rate": 6.283616325631279e-05, "loss": 0.389, "step": 115 }, { "epoch": 1.2765957446808511, "grad_norm": 1.59375, "learning_rate": 6.201793110366593e-05, "loss": 0.4041, "step": 120 }, { "epoch": 1.3297872340425532, "grad_norm": 1.2109375, "learning_rate": 6.116019116124924e-05, "loss": 0.3353, "step": 125 }, { "epoch": 1.3829787234042552, "grad_norm": 1.0390625, "learning_rate": 6.026476317112594e-05, "loss": 0.3782, "step": 130 }, { "epoch": 1.4361702127659575, "grad_norm": 1.0546875, "learning_rate": 5.933354683260767e-05, "loss": 0.3213, "step": 135 }, { "epoch": 1.4893617021276595, "grad_norm": 1.203125, "learning_rate": 5.836851777193884e-05, "loss": 0.3487, "step": 140 }, { "epoch": 1.5425531914893615, "grad_norm": 1.171875, "learning_rate": 5.7371723350897874e-05, "loss": 0.3598, "step": 145 }, { "epoch": 1.5957446808510638, "grad_norm": 1.0546875, "learning_rate": 5.634527832320745e-05, "loss": 0.3477, "step": 150 }, { "epoch": 1.648936170212766, "grad_norm": 1.1640625, "learning_rate": 5.529136034796919e-05, "loss": 0.3577, "step": 155 }, { "epoch": 1.702127659574468, "grad_norm": 1.2421875, "learning_rate": 5.421220536964117e-05, "loss": 0.3336, "step": 160 }, { "epoch": 1.7553191489361701, "grad_norm": 0.97265625, "learning_rate": 5.311010287435973e-05, "loss": 0.3394, "step": 165 }, { "epoch": 1.8085106382978724, "grad_norm": 1.0234375, "learning_rate": 5.1987391032669876e-05, "loss": 0.3255, "step": 170 }, { "epoch": 1.8617021276595744, "grad_norm": 1.0078125, "learning_rate": 5.084645173896887e-05, "loss": 0.3493, "step": 175 }, { "epoch": 1.9148936170212765, "grad_norm": 1.0625, "learning_rate": 4.968970555818745e-05, "loss": 0.3236, "step": 180 }, { "epoch": 1.9680851063829787, "grad_norm": 0.92578125, "learning_rate": 4.8519606590429275e-05, "loss": 0.3471, "step": 185 }, { "epoch": 1.978723404255319, "eval_loss": 0.4493549168109894, "eval_runtime": 18.4912, "eval_samples_per_second": 15.791, "eval_steps_per_second": 7.896, "step": 186 }, { "epoch": 2.021276595744681, "grad_norm": 0.8125, "learning_rate": 4.733863726446376e-05, "loss": 0.275, "step": 190 }, { "epoch": 2.074468085106383, "grad_norm": 0.6875, "learning_rate": 4.614930307111805e-05, "loss": 0.1776, "step": 195 }, { "epoch": 2.127659574468085, "grad_norm": 0.75390625, "learning_rate": 4.49541272477416e-05, "loss": 0.1978, "step": 200 }, { "epoch": 2.1808510638297873, "grad_norm": 0.8828125, "learning_rate": 4.375564542502052e-05, "loss": 0.1901, "step": 205 }, { "epoch": 2.2340425531914896, "grad_norm": 1.3203125, "learning_rate": 4.2556400247498865e-05, "loss": 0.1859, "step": 210 }, { "epoch": 2.2872340425531914, "grad_norm": 0.73046875, "learning_rate": 4.1358935979219476e-05, "loss": 0.1814, "step": 215 }, { "epoch": 2.3404255319148937, "grad_norm": 0.7109375, "learning_rate": 4.016579310592914e-05, "loss": 0.1697, "step": 220 }, { "epoch": 2.393617021276596, "grad_norm": 0.89453125, "learning_rate": 3.8979502945299524e-05, "loss": 0.1779, "step": 225 }, { "epoch": 2.4468085106382977, "grad_norm": 0.65625, "learning_rate": 3.7802582276598663e-05, "loss": 0.1608, "step": 230 }, { "epoch": 2.5, "grad_norm": 0.55078125, "learning_rate": 3.663752800120652e-05, "loss": 0.1612, "step": 235 }, { "epoch": 2.5531914893617023, "grad_norm": 0.953125, "learning_rate": 3.548681184530257e-05, "loss": 0.1611, "step": 240 }, { "epoch": 2.6063829787234045, "grad_norm": 0.71875, "learning_rate": 3.435287511596386e-05, "loss": 0.1778, "step": 245 }, { "epoch": 2.6595744680851063, "grad_norm": 0.5859375, "learning_rate": 3.323812352179894e-05, "loss": 0.1653, "step": 250 }, { "epoch": 2.7127659574468086, "grad_norm": 0.76953125, "learning_rate": 3.214492206910587e-05, "loss": 0.1565, "step": 255 }, { "epoch": 2.7659574468085104, "grad_norm": 0.6015625, "learning_rate": 3.107559004438238e-05, "loss": 0.1452, "step": 260 }, { "epoch": 2.8191489361702127, "grad_norm": 0.81640625, "learning_rate": 3.0032396093833086e-05, "loss": 0.1574, "step": 265 }, { "epoch": 2.872340425531915, "grad_norm": 0.71484375, "learning_rate": 2.9017553410312856e-05, "loss": 0.1733, "step": 270 }, { "epoch": 2.925531914893617, "grad_norm": 0.6640625, "learning_rate": 2.8033215037917425e-05, "loss": 0.1406, "step": 275 }, { "epoch": 2.9680851063829787, "eval_loss": 0.3713552951812744, "eval_runtime": 18.5312, "eval_samples_per_second": 15.757, "eval_steps_per_second": 7.879, "step": 279 }, { "epoch": 2.978723404255319, "grad_norm": 0.75390625, "learning_rate": 2.7081469304183043e-05, "loss": 0.1598, "step": 280 }, { "epoch": 3.0319148936170213, "grad_norm": 0.73046875, "learning_rate": 2.616433538958562e-05, "loss": 0.1067, "step": 285 }, { "epoch": 3.0851063829787235, "grad_norm": 0.5546875, "learning_rate": 2.5283759043739265e-05, "loss": 0.0771, "step": 290 }, { "epoch": 3.1382978723404253, "grad_norm": 0.53515625, "learning_rate": 2.4441608457382394e-05, "loss": 0.0756, "step": 295 }, { "epoch": 3.1914893617021276, "grad_norm": 0.7734375, "learning_rate": 2.3639670298909187e-05, "loss": 0.0723, "step": 300 }, { "epoch": 3.24468085106383, "grad_norm": 0.6171875, "learning_rate": 2.287964592385515e-05, "loss": 0.076, "step": 305 }, { "epoch": 3.297872340425532, "grad_norm": 0.76171875, "learning_rate": 2.216314776537855e-05, "loss": 0.0883, "step": 310 }, { "epoch": 3.351063829787234, "grad_norm": 0.75, "learning_rate": 2.149169591339558e-05, "loss": 0.0718, "step": 315 }, { "epoch": 3.404255319148936, "grad_norm": 0.734375, "learning_rate": 2.08667148896267e-05, "loss": 0.0757, "step": 320 }, { "epoch": 3.4574468085106385, "grad_norm": 0.74609375, "learning_rate": 2.028953062539607e-05, "loss": 0.0693, "step": 325 }, { "epoch": 3.5106382978723403, "grad_norm": 0.6953125, "learning_rate": 1.9761367648596e-05, "loss": 0.0742, "step": 330 }, { "epoch": 3.5638297872340425, "grad_norm": 0.78125, "learning_rate": 1.9283346485784222e-05, "loss": 0.0692, "step": 335 }, { "epoch": 3.617021276595745, "grad_norm": 0.86328125, "learning_rate": 1.8856481284925694e-05, "loss": 0.0769, "step": 340 }, { "epoch": 3.670212765957447, "grad_norm": 0.51171875, "learning_rate": 1.8481677663822384e-05, "loss": 0.0736, "step": 345 }, { "epoch": 3.723404255319149, "grad_norm": 0.57421875, "learning_rate": 1.815973078879573e-05, "loss": 0.0665, "step": 350 }, { "epoch": 3.776595744680851, "grad_norm": 0.7890625, "learning_rate": 1.789132368769791e-05, "loss": 0.0664, "step": 355 }, { "epoch": 3.829787234042553, "grad_norm": 0.56640625, "learning_rate": 1.7677025800831022e-05, "loss": 0.0634, "step": 360 }, { "epoch": 3.882978723404255, "grad_norm": 0.412109375, "learning_rate": 1.7517291772848414e-05, "loss": 0.0553, "step": 365 }, { "epoch": 3.9361702127659575, "grad_norm": 0.498046875, "learning_rate": 1.7412460488201273e-05, "loss": 0.068, "step": 370 }, { "epoch": 3.9574468085106385, "eval_loss": 0.33195412158966064, "eval_runtime": 18.482, "eval_samples_per_second": 15.799, "eval_steps_per_second": 7.9, "step": 372 } ], "logging_steps": 5, "max_steps": 376, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.809962648758911e+18, "train_batch_size": 14, "trial_name": null, "trial_params": null }