{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.9574468085106385,
  "eval_steps": 500,
  "global_step": 372,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.05319148936170213,
      "grad_norm": 2.453125,
      "learning_rate": 7.935238627346029e-06,
      "loss": 1.0178,
      "step": 5
    },
    {
      "epoch": 0.10638297872340426,
      "grad_norm": 1.90625,
      "learning_rate": 1.7854286911528565e-05,
      "loss": 0.9868,
      "step": 10
    },
    {
      "epoch": 0.1595744680851064,
      "grad_norm": 1.71875,
      "learning_rate": 2.7773335195711106e-05,
      "loss": 0.9207,
      "step": 15
    },
    {
      "epoch": 0.2127659574468085,
      "grad_norm": 1.90625,
      "learning_rate": 3.769238347989364e-05,
      "loss": 0.9234,
      "step": 20
    },
    {
      "epoch": 0.26595744680851063,
      "grad_norm": 1.7109375,
      "learning_rate": 4.7611431764076175e-05,
      "loss": 0.8535,
      "step": 25
    },
    {
      "epoch": 0.3191489361702128,
      "grad_norm": 1.7890625,
      "learning_rate": 5.753048004825872e-05,
      "loss": 0.8737,
      "step": 30
    },
    {
      "epoch": 0.3723404255319149,
      "grad_norm": 1.765625,
      "learning_rate": 6.744952833244124e-05,
      "loss": 0.8314,
      "step": 35
    },
    {
      "epoch": 0.425531914893617,
      "grad_norm": 1.75,
      "learning_rate": 6.941566007038542e-05,
      "loss": 0.8324,
      "step": 40
    },
    {
      "epoch": 0.4787234042553192,
      "grad_norm": 1.5625,
      "learning_rate": 6.934388466345933e-05,
      "loss": 0.8325,
      "step": 45
    },
    {
      "epoch": 0.5319148936170213,
      "grad_norm": 1.484375,
      "learning_rate": 6.921705905677817e-05,
      "loss": 0.7441,
      "step": 50
    },
    {
      "epoch": 0.5851063829787234,
      "grad_norm": 1.4140625,
      "learning_rate": 6.903545231778275e-05,
      "loss": 0.8058,
      "step": 55
    },
    {
      "epoch": 0.6382978723404256,
      "grad_norm": 1.515625,
      "learning_rate": 6.879944973507346e-05,
      "loss": 0.8016,
      "step": 60
    },
    {
      "epoch": 0.6914893617021277,
      "grad_norm": 1.390625,
      "learning_rate": 6.850955200099949e-05,
      "loss": 0.7364,
      "step": 65
    },
    {
      "epoch": 0.7446808510638298,
      "grad_norm": 1.4921875,
      "learning_rate": 6.816637414941277e-05,
      "loss": 0.7176,
      "step": 70
    },
    {
      "epoch": 0.7978723404255319,
      "grad_norm": 1.28125,
      "learning_rate": 6.777064425084002e-05,
      "loss": 0.6915,
      "step": 75
    },
    {
      "epoch": 0.851063829787234,
      "grad_norm": 1.21875,
      "learning_rate": 6.732320186784154e-05,
      "loss": 0.6654,
      "step": 80
    },
    {
      "epoch": 0.9042553191489362,
      "grad_norm": 1.265625,
      "learning_rate": 6.682499627383332e-05,
      "loss": 0.6093,
      "step": 85
    },
    {
      "epoch": 0.9574468085106383,
      "grad_norm": 1.4921875,
      "learning_rate": 6.627708443915185e-05,
      "loss": 0.597,
      "step": 90
    },
    {
      "epoch": 0.9893617021276596,
      "eval_loss": 0.6273989677429199,
      "eval_runtime": 18.5736,
      "eval_samples_per_second": 15.721,
      "eval_steps_per_second": 7.861,
      "step": 93
    },
    {
      "epoch": 1.0106382978723405,
      "grad_norm": 1.046875,
      "learning_rate": 6.56806287886338e-05,
      "loss": 0.5524,
      "step": 95
    },
    {
      "epoch": 1.0638297872340425,
      "grad_norm": 1.5859375,
      "learning_rate": 6.503689473546842e-05,
      "loss": 0.368,
      "step": 100
    },
    {
      "epoch": 1.1170212765957448,
      "grad_norm": 1.0859375,
      "learning_rate": 6.434724799655444e-05,
      "loss": 0.4125,
      "step": 105
    },
    {
      "epoch": 1.1702127659574468,
      "grad_norm": 2.484375,
      "learning_rate": 6.361315169505714e-05,
      "loss": 0.4224,
      "step": 110
    },
    {
      "epoch": 1.2234042553191489,
      "grad_norm": 1.265625,
      "learning_rate": 6.283616325631279e-05,
      "loss": 0.389,
      "step": 115
    },
    {
      "epoch": 1.2765957446808511,
      "grad_norm": 1.59375,
      "learning_rate": 6.201793110366593e-05,
      "loss": 0.4041,
      "step": 120
    },
    {
      "epoch": 1.3297872340425532,
      "grad_norm": 1.2109375,
      "learning_rate": 6.116019116124924e-05,
      "loss": 0.3353,
      "step": 125
    },
    {
      "epoch": 1.3829787234042552,
      "grad_norm": 1.0390625,
      "learning_rate": 6.026476317112594e-05,
      "loss": 0.3782,
      "step": 130
    },
    {
      "epoch": 1.4361702127659575,
      "grad_norm": 1.0546875,
      "learning_rate": 5.933354683260767e-05,
      "loss": 0.3213,
      "step": 135
    },
    {
      "epoch": 1.4893617021276595,
      "grad_norm": 1.203125,
      "learning_rate": 5.836851777193884e-05,
      "loss": 0.3487,
      "step": 140
    },
    {
      "epoch": 1.5425531914893615,
      "grad_norm": 1.171875,
      "learning_rate": 5.7371723350897874e-05,
      "loss": 0.3598,
      "step": 145
    },
    {
      "epoch": 1.5957446808510638,
      "grad_norm": 1.0546875,
      "learning_rate": 5.634527832320745e-05,
      "loss": 0.3477,
      "step": 150
    },
    {
      "epoch": 1.648936170212766,
      "grad_norm": 1.1640625,
      "learning_rate": 5.529136034796919e-05,
      "loss": 0.3577,
      "step": 155
    },
    {
      "epoch": 1.702127659574468,
      "grad_norm": 1.2421875,
      "learning_rate": 5.421220536964117e-05,
      "loss": 0.3336,
      "step": 160
    },
    {
      "epoch": 1.7553191489361701,
      "grad_norm": 0.97265625,
      "learning_rate": 5.311010287435973e-05,
      "loss": 0.3394,
      "step": 165
    },
    {
      "epoch": 1.8085106382978724,
      "grad_norm": 1.0234375,
      "learning_rate": 5.1987391032669876e-05,
      "loss": 0.3255,
      "step": 170
    },
    {
      "epoch": 1.8617021276595744,
      "grad_norm": 1.0078125,
      "learning_rate": 5.084645173896887e-05,
      "loss": 0.3493,
      "step": 175
    },
    {
      "epoch": 1.9148936170212765,
      "grad_norm": 1.0625,
      "learning_rate": 4.968970555818745e-05,
      "loss": 0.3236,
      "step": 180
    },
    {
      "epoch": 1.9680851063829787,
      "grad_norm": 0.92578125,
      "learning_rate": 4.8519606590429275e-05,
      "loss": 0.3471,
      "step": 185
    },
    {
      "epoch": 1.978723404255319,
      "eval_loss": 0.4493549168109894,
      "eval_runtime": 18.4912,
      "eval_samples_per_second": 15.791,
      "eval_steps_per_second": 7.896,
      "step": 186
    },
    {
      "epoch": 2.021276595744681,
      "grad_norm": 0.8125,
      "learning_rate": 4.733863726446376e-05,
      "loss": 0.275,
      "step": 190
    },
    {
      "epoch": 2.074468085106383,
      "grad_norm": 0.6875,
      "learning_rate": 4.614930307111805e-05,
      "loss": 0.1776,
      "step": 195
    },
    {
      "epoch": 2.127659574468085,
      "grad_norm": 0.75390625,
      "learning_rate": 4.49541272477416e-05,
      "loss": 0.1978,
      "step": 200
    },
    {
      "epoch": 2.1808510638297873,
      "grad_norm": 0.8828125,
      "learning_rate": 4.375564542502052e-05,
      "loss": 0.1901,
      "step": 205
    },
    {
      "epoch": 2.2340425531914896,
      "grad_norm": 1.3203125,
      "learning_rate": 4.2556400247498865e-05,
      "loss": 0.1859,
      "step": 210
    },
    {
      "epoch": 2.2872340425531914,
      "grad_norm": 0.73046875,
      "learning_rate": 4.1358935979219476e-05,
      "loss": 0.1814,
      "step": 215
    },
    {
      "epoch": 2.3404255319148937,
      "grad_norm": 0.7109375,
      "learning_rate": 4.016579310592914e-05,
      "loss": 0.1697,
      "step": 220
    },
    {
      "epoch": 2.393617021276596,
      "grad_norm": 0.89453125,
      "learning_rate": 3.8979502945299524e-05,
      "loss": 0.1779,
      "step": 225
    },
    {
      "epoch": 2.4468085106382977,
      "grad_norm": 0.65625,
      "learning_rate": 3.7802582276598663e-05,
      "loss": 0.1608,
      "step": 230
    },
    {
      "epoch": 2.5,
      "grad_norm": 0.55078125,
      "learning_rate": 3.663752800120652e-05,
      "loss": 0.1612,
      "step": 235
    },
    {
      "epoch": 2.5531914893617023,
      "grad_norm": 0.953125,
      "learning_rate": 3.548681184530257e-05,
      "loss": 0.1611,
      "step": 240
    },
    {
      "epoch": 2.6063829787234045,
      "grad_norm": 0.71875,
      "learning_rate": 3.435287511596386e-05,
      "loss": 0.1778,
      "step": 245
    },
    {
      "epoch": 2.6595744680851063,
      "grad_norm": 0.5859375,
      "learning_rate": 3.323812352179894e-05,
      "loss": 0.1653,
      "step": 250
    },
    {
      "epoch": 2.7127659574468086,
      "grad_norm": 0.76953125,
      "learning_rate": 3.214492206910587e-05,
      "loss": 0.1565,
      "step": 255
    },
    {
      "epoch": 2.7659574468085104,
      "grad_norm": 0.6015625,
      "learning_rate": 3.107559004438238e-05,
      "loss": 0.1452,
      "step": 260
    },
    {
      "epoch": 2.8191489361702127,
      "grad_norm": 0.81640625,
      "learning_rate": 3.0032396093833086e-05,
      "loss": 0.1574,
      "step": 265
    },
    {
      "epoch": 2.872340425531915,
      "grad_norm": 0.71484375,
      "learning_rate": 2.9017553410312856e-05,
      "loss": 0.1733,
      "step": 270
    },
    {
      "epoch": 2.925531914893617,
      "grad_norm": 0.6640625,
      "learning_rate": 2.8033215037917425e-05,
      "loss": 0.1406,
      "step": 275
    },
    {
      "epoch": 2.9680851063829787,
      "eval_loss": 0.3713552951812744,
      "eval_runtime": 18.5312,
      "eval_samples_per_second": 15.757,
      "eval_steps_per_second": 7.879,
      "step": 279
    },
    {
      "epoch": 2.978723404255319,
      "grad_norm": 0.75390625,
      "learning_rate": 2.7081469304183043e-05,
      "loss": 0.1598,
      "step": 280
    },
    {
      "epoch": 3.0319148936170213,
      "grad_norm": 0.73046875,
      "learning_rate": 2.616433538958562e-05,
      "loss": 0.1067,
      "step": 285
    },
    {
      "epoch": 3.0851063829787235,
      "grad_norm": 0.5546875,
      "learning_rate": 2.5283759043739265e-05,
      "loss": 0.0771,
      "step": 290
    },
    {
      "epoch": 3.1382978723404253,
      "grad_norm": 0.53515625,
      "learning_rate": 2.4441608457382394e-05,
      "loss": 0.0756,
      "step": 295
    },
    {
      "epoch": 3.1914893617021276,
      "grad_norm": 0.7734375,
      "learning_rate": 2.3639670298909187e-05,
      "loss": 0.0723,
      "step": 300
    },
    {
      "epoch": 3.24468085106383,
      "grad_norm": 0.6171875,
      "learning_rate": 2.287964592385515e-05,
      "loss": 0.076,
      "step": 305
    },
    {
      "epoch": 3.297872340425532,
      "grad_norm": 0.76171875,
      "learning_rate": 2.216314776537855e-05,
      "loss": 0.0883,
      "step": 310
    },
    {
      "epoch": 3.351063829787234,
      "grad_norm": 0.75,
      "learning_rate": 2.149169591339558e-05,
      "loss": 0.0718,
      "step": 315
    },
    {
      "epoch": 3.404255319148936,
      "grad_norm": 0.734375,
      "learning_rate": 2.08667148896267e-05,
      "loss": 0.0757,
      "step": 320
    },
    {
      "epoch": 3.4574468085106385,
      "grad_norm": 0.74609375,
      "learning_rate": 2.028953062539607e-05,
      "loss": 0.0693,
      "step": 325
    },
    {
      "epoch": 3.5106382978723403,
      "grad_norm": 0.6953125,
      "learning_rate": 1.9761367648596e-05,
      "loss": 0.0742,
      "step": 330
    },
    {
      "epoch": 3.5638297872340425,
      "grad_norm": 0.78125,
      "learning_rate": 1.9283346485784222e-05,
      "loss": 0.0692,
      "step": 335
    },
    {
      "epoch": 3.617021276595745,
      "grad_norm": 0.86328125,
      "learning_rate": 1.8856481284925694e-05,
      "loss": 0.0769,
      "step": 340
    },
    {
      "epoch": 3.670212765957447,
      "grad_norm": 0.51171875,
      "learning_rate": 1.8481677663822384e-05,
      "loss": 0.0736,
      "step": 345
    },
    {
      "epoch": 3.723404255319149,
      "grad_norm": 0.57421875,
      "learning_rate": 1.815973078879573e-05,
      "loss": 0.0665,
      "step": 350
    },
    {
      "epoch": 3.776595744680851,
      "grad_norm": 0.7890625,
      "learning_rate": 1.789132368769791e-05,
      "loss": 0.0664,
      "step": 355
    },
    {
      "epoch": 3.829787234042553,
      "grad_norm": 0.56640625,
      "learning_rate": 1.7677025800831022e-05,
      "loss": 0.0634,
      "step": 360
    },
    {
      "epoch": 3.882978723404255,
      "grad_norm": 0.412109375,
      "learning_rate": 1.7517291772848414e-05,
      "loss": 0.0553,
      "step": 365
    },
    {
      "epoch": 3.9361702127659575,
      "grad_norm": 0.498046875,
      "learning_rate": 1.7412460488201273e-05,
      "loss": 0.068,
      "step": 370
    },
    {
      "epoch": 3.9574468085106385,
      "eval_loss": 0.33195412158966064,
      "eval_runtime": 18.482,
      "eval_samples_per_second": 15.799,
      "eval_steps_per_second": 7.9,
      "step": 372
    }
  ],
  "logging_steps": 5,
  "max_steps": 376,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.809962648758911e+18,
  "train_batch_size": 14,
  "trial_name": null,
  "trial_params": null
}
|
|