{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997672540080229, "eval_steps": 1000, "global_step": 2282, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021905505127257294, "grad_norm": 13096.0, "learning_rate": 4.366812227074236e-07, "loss": 742.6802, "step": 5 }, { "epoch": 0.004381101025451459, "grad_norm": 90.625, "learning_rate": 8.733624454148472e-07, "loss": 111.4364, "step": 10 }, { "epoch": 0.006571651538177188, "grad_norm": 64.625, "learning_rate": 1.3100436681222709e-06, "loss": 44.6282, "step": 15 }, { "epoch": 0.008762202050902917, "grad_norm": 56.09375, "learning_rate": 1.7467248908296944e-06, "loss": 44.0656, "step": 20 }, { "epoch": 0.010952752563628647, "grad_norm": 40.4375, "learning_rate": 2.183406113537118e-06, "loss": 41.889, "step": 25 }, { "epoch": 0.013143303076354376, "grad_norm": 50.6875, "learning_rate": 2.6200873362445417e-06, "loss": 40.9611, "step": 30 }, { "epoch": 0.015333853589080106, "grad_norm": 34.125, "learning_rate": 3.0567685589519653e-06, "loss": 40.9319, "step": 35 }, { "epoch": 0.017524404101805835, "grad_norm": 37.34375, "learning_rate": 3.493449781659389e-06, "loss": 41.2922, "step": 40 }, { "epoch": 0.019714954614531564, "grad_norm": 30.140625, "learning_rate": 3.930131004366812e-06, "loss": 39.5204, "step": 45 }, { "epoch": 0.021905505127257294, "grad_norm": 30.953125, "learning_rate": 4.366812227074236e-06, "loss": 39.9045, "step": 50 }, { "epoch": 0.024096055639983023, "grad_norm": 30.453125, "learning_rate": 4.80349344978166e-06, "loss": 40.7265, "step": 55 }, { "epoch": 0.026286606152708752, "grad_norm": 32.65625, "learning_rate": 5.2401746724890834e-06, "loss": 39.5216, "step": 60 }, { "epoch": 0.028477156665434482, "grad_norm": 4284.0, "learning_rate": 5.676855895196507e-06, "loss": 50.5946, "step": 65 }, { "epoch": 0.03066770717816021, "grad_norm": 1012.0, "learning_rate": 6.1135371179039305e-06, "loss": 55.345, 
"step": 70 }, { "epoch": 0.03285825769088594, "grad_norm": 662.5, "learning_rate": 6.550218340611354e-06, "loss": 72.4566, "step": 75 }, { "epoch": 0.03504880820361167, "grad_norm": 92.875, "learning_rate": 6.986899563318778e-06, "loss": 50.0984, "step": 80 }, { "epoch": 0.0372393587163374, "grad_norm": 33.5, "learning_rate": 7.423580786026201e-06, "loss": 42.5405, "step": 85 }, { "epoch": 0.03942990922906313, "grad_norm": 32.3125, "learning_rate": 7.860262008733624e-06, "loss": 40.9513, "step": 90 }, { "epoch": 0.04162045974178886, "grad_norm": 73.125, "learning_rate": 8.296943231441049e-06, "loss": 39.5486, "step": 95 }, { "epoch": 0.04381101025451459, "grad_norm": 36.5625, "learning_rate": 8.733624454148473e-06, "loss": 38.6845, "step": 100 }, { "epoch": 0.04600156076724032, "grad_norm": 28.875, "learning_rate": 9.170305676855896e-06, "loss": 40.9694, "step": 105 }, { "epoch": 0.048192111279966046, "grad_norm": 30.09375, "learning_rate": 9.60698689956332e-06, "loss": 40.6168, "step": 110 }, { "epoch": 0.050382661792691776, "grad_norm": 28.40625, "learning_rate": 1.0043668122270742e-05, "loss": 39.6493, "step": 115 }, { "epoch": 0.052573212305417505, "grad_norm": 33.90625, "learning_rate": 1.0480349344978167e-05, "loss": 38.692, "step": 120 }, { "epoch": 0.054763762818143234, "grad_norm": 24.046875, "learning_rate": 1.0917030567685592e-05, "loss": 39.2953, "step": 125 }, { "epoch": 0.056954313330868964, "grad_norm": 50.1875, "learning_rate": 1.1353711790393014e-05, "loss": 39.2942, "step": 130 }, { "epoch": 0.05914486384359469, "grad_norm": 25.578125, "learning_rate": 1.179039301310044e-05, "loss": 40.2007, "step": 135 }, { "epoch": 0.06133541435632042, "grad_norm": 24.453125, "learning_rate": 1.2227074235807861e-05, "loss": 40.4033, "step": 140 }, { "epoch": 0.06352596486904616, "grad_norm": 25.109375, "learning_rate": 1.2663755458515286e-05, "loss": 40.2883, "step": 145 }, { "epoch": 0.06571651538177188, "grad_norm": 24.3125, "learning_rate": 
1.3100436681222708e-05, "loss": 38.8529, "step": 150 }, { "epoch": 0.06790706589449762, "grad_norm": 33.03125, "learning_rate": 1.3537117903930132e-05, "loss": 40.0824, "step": 155 }, { "epoch": 0.07009761640722334, "grad_norm": 36.375, "learning_rate": 1.3973799126637555e-05, "loss": 40.8283, "step": 160 }, { "epoch": 0.07228816691994908, "grad_norm": 32.375, "learning_rate": 1.4410480349344979e-05, "loss": 38.3279, "step": 165 }, { "epoch": 0.0744787174326748, "grad_norm": 23.328125, "learning_rate": 1.4847161572052402e-05, "loss": 39.6786, "step": 170 }, { "epoch": 0.07666926794540053, "grad_norm": 33.65625, "learning_rate": 1.5283842794759826e-05, "loss": 39.6196, "step": 175 }, { "epoch": 0.07885981845812626, "grad_norm": 34.46875, "learning_rate": 1.5720524017467248e-05, "loss": 39.4438, "step": 180 }, { "epoch": 0.081050368970852, "grad_norm": 38.03125, "learning_rate": 1.6157205240174673e-05, "loss": 39.3557, "step": 185 }, { "epoch": 0.08324091948357772, "grad_norm": 45.59375, "learning_rate": 1.6593886462882098e-05, "loss": 39.7753, "step": 190 }, { "epoch": 0.08543146999630345, "grad_norm": 42.0625, "learning_rate": 1.703056768558952e-05, "loss": 38.6387, "step": 195 }, { "epoch": 0.08762202050902917, "grad_norm": 31.296875, "learning_rate": 1.7467248908296945e-05, "loss": 38.944, "step": 200 }, { "epoch": 0.08981257102175491, "grad_norm": 23.0, "learning_rate": 1.7903930131004367e-05, "loss": 39.5787, "step": 205 }, { "epoch": 0.09200312153448063, "grad_norm": 31.796875, "learning_rate": 1.8340611353711792e-05, "loss": 40.2236, "step": 210 }, { "epoch": 0.09419367204720637, "grad_norm": 31.796875, "learning_rate": 1.8777292576419214e-05, "loss": 39.7339, "step": 215 }, { "epoch": 0.09638422255993209, "grad_norm": 48.78125, "learning_rate": 1.921397379912664e-05, "loss": 41.8436, "step": 220 }, { "epoch": 0.09857477307265783, "grad_norm": 385.25, "learning_rate": 1.965065502183406e-05, "loss": 44.5202, "step": 225 }, { "epoch": 0.10076532358538355, 
"grad_norm": 39.03125, "learning_rate": 1.9990258158792012e-05, "loss": 45.2576, "step": 230 }, { "epoch": 0.10295587409810929, "grad_norm": 29.96875, "learning_rate": 1.994154895275207e-05, "loss": 42.9545, "step": 235 }, { "epoch": 0.10514642461083501, "grad_norm": 26.09375, "learning_rate": 1.989283974671213e-05, "loss": 41.42, "step": 240 }, { "epoch": 0.10733697512356075, "grad_norm": 28.609375, "learning_rate": 1.984413054067219e-05, "loss": 40.9569, "step": 245 }, { "epoch": 0.10952752563628647, "grad_norm": 45.03125, "learning_rate": 1.979542133463225e-05, "loss": 43.257, "step": 250 }, { "epoch": 0.1117180761490122, "grad_norm": 31.65625, "learning_rate": 1.9746712128592305e-05, "loss": 43.1437, "step": 255 }, { "epoch": 0.11390862666173793, "grad_norm": 24.5, "learning_rate": 1.9698002922552364e-05, "loss": 40.8937, "step": 260 }, { "epoch": 0.11609917717446366, "grad_norm": 37.34375, "learning_rate": 1.9649293716512423e-05, "loss": 41.702, "step": 265 }, { "epoch": 0.11828972768718939, "grad_norm": 28.078125, "learning_rate": 1.960058451047248e-05, "loss": 40.8823, "step": 270 }, { "epoch": 0.12048027819991512, "grad_norm": 30.703125, "learning_rate": 1.955187530443254e-05, "loss": 41.0227, "step": 275 }, { "epoch": 0.12267082871264084, "grad_norm": 28.703125, "learning_rate": 1.9503166098392598e-05, "loss": 41.8211, "step": 280 }, { "epoch": 0.12486137922536658, "grad_norm": 40.5625, "learning_rate": 1.9454456892352657e-05, "loss": 40.5589, "step": 285 }, { "epoch": 0.12705192973809232, "grad_norm": 32.625, "learning_rate": 1.9405747686312716e-05, "loss": 40.9865, "step": 290 }, { "epoch": 0.12924248025081803, "grad_norm": 67.3125, "learning_rate": 1.9357038480272775e-05, "loss": 40.3525, "step": 295 }, { "epoch": 0.13143303076354376, "grad_norm": 32.3125, "learning_rate": 1.930832927423283e-05, "loss": 40.1777, "step": 300 }, { "epoch": 0.1336235812762695, "grad_norm": 24.765625, "learning_rate": 1.925962006819289e-05, "loss": 39.8659, "step": 305 }, { 
"epoch": 0.13581413178899523, "grad_norm": 26.921875, "learning_rate": 1.921091086215295e-05, "loss": 41.8758, "step": 310 }, { "epoch": 0.13800468230172094, "grad_norm": 30.34375, "learning_rate": 1.9162201656113005e-05, "loss": 40.2781, "step": 315 }, { "epoch": 0.14019523281444668, "grad_norm": 100.1875, "learning_rate": 1.9113492450073065e-05, "loss": 38.3503, "step": 320 }, { "epoch": 0.14238578332717242, "grad_norm": 30.6875, "learning_rate": 1.9064783244033124e-05, "loss": 41.6195, "step": 325 }, { "epoch": 0.14457633383989815, "grad_norm": 31.53125, "learning_rate": 1.9016074037993183e-05, "loss": 40.62, "step": 330 }, { "epoch": 0.14676688435262386, "grad_norm": 32.75, "learning_rate": 1.8967364831953242e-05, "loss": 48.5258, "step": 335 }, { "epoch": 0.1489574348653496, "grad_norm": 21.390625, "learning_rate": 1.8918655625913298e-05, "loss": 41.4756, "step": 340 }, { "epoch": 0.15114798537807533, "grad_norm": 31.734375, "learning_rate": 1.8869946419873357e-05, "loss": 40.7427, "step": 345 }, { "epoch": 0.15333853589080107, "grad_norm": 47.53125, "learning_rate": 1.8821237213833417e-05, "loss": 40.1635, "step": 350 }, { "epoch": 0.15552908640352678, "grad_norm": 25.46875, "learning_rate": 1.8772528007793472e-05, "loss": 40.0878, "step": 355 }, { "epoch": 0.15771963691625251, "grad_norm": 29.828125, "learning_rate": 1.872381880175353e-05, "loss": 41.3363, "step": 360 }, { "epoch": 0.15991018742897825, "grad_norm": 32.65625, "learning_rate": 1.867510959571359e-05, "loss": 40.1477, "step": 365 }, { "epoch": 0.162100737941704, "grad_norm": 27.765625, "learning_rate": 1.862640038967365e-05, "loss": 39.9975, "step": 370 }, { "epoch": 0.1642912884544297, "grad_norm": 21.671875, "learning_rate": 1.857769118363371e-05, "loss": 40.2893, "step": 375 }, { "epoch": 0.16648183896715543, "grad_norm": 19.515625, "learning_rate": 1.852898197759377e-05, "loss": 38.7055, "step": 380 }, { "epoch": 0.16867238947988117, "grad_norm": 162.625, "learning_rate": 
1.8480272771553824e-05, "loss": 48.099, "step": 385 }, { "epoch": 0.1708629399926069, "grad_norm": 29.8125, "learning_rate": 1.8431563565513884e-05, "loss": 48.3286, "step": 390 }, { "epoch": 0.1730534905053326, "grad_norm": 34.5625, "learning_rate": 1.8382854359473943e-05, "loss": 40.6334, "step": 395 }, { "epoch": 0.17524404101805835, "grad_norm": 20.9375, "learning_rate": 1.8334145153434e-05, "loss": 39.5327, "step": 400 }, { "epoch": 0.17743459153078409, "grad_norm": 24.359375, "learning_rate": 1.8285435947394058e-05, "loss": 38.2548, "step": 405 }, { "epoch": 0.17962514204350982, "grad_norm": 26.109375, "learning_rate": 1.8236726741354117e-05, "loss": 39.5699, "step": 410 }, { "epoch": 0.18181569255623553, "grad_norm": 74.4375, "learning_rate": 1.8188017535314176e-05, "loss": 40.1855, "step": 415 }, { "epoch": 0.18400624306896127, "grad_norm": 38.1875, "learning_rate": 1.8139308329274236e-05, "loss": 39.2502, "step": 420 }, { "epoch": 0.186196793581687, "grad_norm": 19.75, "learning_rate": 1.8090599123234295e-05, "loss": 38.5868, "step": 425 }, { "epoch": 0.18838734409441274, "grad_norm": 22.546875, "learning_rate": 1.804188991719435e-05, "loss": 38.2078, "step": 430 }, { "epoch": 0.19057789460713845, "grad_norm": 32.96875, "learning_rate": 1.799318071115441e-05, "loss": 38.6803, "step": 435 }, { "epoch": 0.19276844511986418, "grad_norm": 36.75, "learning_rate": 1.794447150511447e-05, "loss": 39.3906, "step": 440 }, { "epoch": 0.19495899563258992, "grad_norm": 23.71875, "learning_rate": 1.7895762299074525e-05, "loss": 38.3763, "step": 445 }, { "epoch": 0.19714954614531566, "grad_norm": 26.328125, "learning_rate": 1.7847053093034584e-05, "loss": 37.9731, "step": 450 }, { "epoch": 0.19934009665804137, "grad_norm": 27.03125, "learning_rate": 1.7798343886994643e-05, "loss": 39.5706, "step": 455 }, { "epoch": 0.2015306471707671, "grad_norm": 17.046875, "learning_rate": 1.7749634680954703e-05, "loss": 38.7336, "step": 460 }, { "epoch": 0.20372119768349284, 
"grad_norm": 24.59375, "learning_rate": 1.7700925474914762e-05, "loss": 38.5834, "step": 465 }, { "epoch": 0.20591174819621857, "grad_norm": 33.5625, "learning_rate": 1.7652216268874818e-05, "loss": 39.1163, "step": 470 }, { "epoch": 0.20810229870894428, "grad_norm": 28.875, "learning_rate": 1.7603507062834877e-05, "loss": 39.7169, "step": 475 }, { "epoch": 0.21029284922167002, "grad_norm": 26.0625, "learning_rate": 1.7554797856794936e-05, "loss": 38.2182, "step": 480 }, { "epoch": 0.21248339973439576, "grad_norm": 25.265625, "learning_rate": 1.7506088650754992e-05, "loss": 38.4219, "step": 485 }, { "epoch": 0.2146739502471215, "grad_norm": 40.53125, "learning_rate": 1.745737944471505e-05, "loss": 39.3143, "step": 490 }, { "epoch": 0.2168645007598472, "grad_norm": 28.828125, "learning_rate": 1.740867023867511e-05, "loss": 38.7423, "step": 495 }, { "epoch": 0.21905505127257294, "grad_norm": 19.84375, "learning_rate": 1.735996103263517e-05, "loss": 37.1676, "step": 500 }, { "epoch": 0.22124560178529867, "grad_norm": 22.0625, "learning_rate": 1.731125182659523e-05, "loss": 38.0275, "step": 505 }, { "epoch": 0.2234361522980244, "grad_norm": 45.5, "learning_rate": 1.7262542620555288e-05, "loss": 37.0928, "step": 510 }, { "epoch": 0.22562670281075012, "grad_norm": 40.5625, "learning_rate": 1.7213833414515344e-05, "loss": 37.4132, "step": 515 }, { "epoch": 0.22781725332347585, "grad_norm": 26.9375, "learning_rate": 1.7165124208475403e-05, "loss": 38.0051, "step": 520 }, { "epoch": 0.2300078038362016, "grad_norm": 31.796875, "learning_rate": 1.7116415002435462e-05, "loss": 36.731, "step": 525 }, { "epoch": 0.23219835434892733, "grad_norm": 24.328125, "learning_rate": 1.7067705796395518e-05, "loss": 36.7867, "step": 530 }, { "epoch": 0.23438890486165304, "grad_norm": 28.390625, "learning_rate": 1.7018996590355577e-05, "loss": 38.4257, "step": 535 }, { "epoch": 0.23657945537437877, "grad_norm": 33.5625, "learning_rate": 1.6970287384315637e-05, "loss": 38.0036, "step": 540 }, 
{ "epoch": 0.2387700058871045, "grad_norm": 36.125, "learning_rate": 1.6921578178275696e-05, "loss": 37.2956, "step": 545 }, { "epoch": 0.24096055639983024, "grad_norm": 59.34375, "learning_rate": 1.6872868972235755e-05, "loss": 36.8695, "step": 550 }, { "epoch": 0.24315110691255595, "grad_norm": 17.25, "learning_rate": 1.6824159766195814e-05, "loss": 36.5629, "step": 555 }, { "epoch": 0.2453416574252817, "grad_norm": 26.875, "learning_rate": 1.677545056015587e-05, "loss": 36.3663, "step": 560 }, { "epoch": 0.24753220793800743, "grad_norm": 29.59375, "learning_rate": 1.672674135411593e-05, "loss": 36.5514, "step": 565 }, { "epoch": 0.24972275845073316, "grad_norm": 94.8125, "learning_rate": 1.667803214807599e-05, "loss": 37.2822, "step": 570 }, { "epoch": 0.25191330896345887, "grad_norm": 25.921875, "learning_rate": 1.6629322942036044e-05, "loss": 37.0216, "step": 575 }, { "epoch": 0.25410385947618463, "grad_norm": 29.46875, "learning_rate": 1.6580613735996104e-05, "loss": 36.9, "step": 580 }, { "epoch": 0.25629440998891034, "grad_norm": 25.40625, "learning_rate": 1.6531904529956163e-05, "loss": 36.3711, "step": 585 }, { "epoch": 0.25848496050163605, "grad_norm": 23.453125, "learning_rate": 1.6483195323916222e-05, "loss": 36.7133, "step": 590 }, { "epoch": 0.2606755110143618, "grad_norm": 28.328125, "learning_rate": 1.643448611787628e-05, "loss": 36.4338, "step": 595 }, { "epoch": 0.2628660615270875, "grad_norm": 31.875, "learning_rate": 1.6385776911836337e-05, "loss": 36.4187, "step": 600 }, { "epoch": 0.26505661203981323, "grad_norm": 26.375, "learning_rate": 1.6337067705796396e-05, "loss": 37.0881, "step": 605 }, { "epoch": 0.267247162552539, "grad_norm": 47.84375, "learning_rate": 1.6288358499756456e-05, "loss": 36.7284, "step": 610 }, { "epoch": 0.2694377130652647, "grad_norm": 36.9375, "learning_rate": 1.623964929371651e-05, "loss": 35.6421, "step": 615 }, { "epoch": 0.27162826357799047, "grad_norm": 44.9375, "learning_rate": 1.619094008767657e-05, "loss": 
35.6973, "step": 620 }, { "epoch": 0.2738188140907162, "grad_norm": 17.4375, "learning_rate": 1.614223088163663e-05, "loss": 35.1017, "step": 625 }, { "epoch": 0.2760093646034419, "grad_norm": 30.0625, "learning_rate": 1.609352167559669e-05, "loss": 36.4951, "step": 630 }, { "epoch": 0.27819991511616765, "grad_norm": 35.84375, "learning_rate": 1.604481246955675e-05, "loss": 35.1649, "step": 635 }, { "epoch": 0.28039046562889336, "grad_norm": 25.703125, "learning_rate": 1.5996103263516808e-05, "loss": 34.9533, "step": 640 }, { "epoch": 0.28258101614161907, "grad_norm": 41.28125, "learning_rate": 1.5947394057476863e-05, "loss": 34.6937, "step": 645 }, { "epoch": 0.28477156665434483, "grad_norm": 27.390625, "learning_rate": 1.5898684851436923e-05, "loss": 36.1568, "step": 650 }, { "epoch": 0.28696211716707054, "grad_norm": 24.828125, "learning_rate": 1.5849975645396982e-05, "loss": 34.8646, "step": 655 }, { "epoch": 0.2891526676797963, "grad_norm": 29.734375, "learning_rate": 1.5801266439357038e-05, "loss": 35.4552, "step": 660 }, { "epoch": 0.291343218192522, "grad_norm": 21.140625, "learning_rate": 1.5752557233317097e-05, "loss": 35.4214, "step": 665 }, { "epoch": 0.2935337687052477, "grad_norm": 20.53125, "learning_rate": 1.5703848027277156e-05, "loss": 35.3401, "step": 670 }, { "epoch": 0.2957243192179735, "grad_norm": 57.25, "learning_rate": 1.5655138821237215e-05, "loss": 35.3161, "step": 675 }, { "epoch": 0.2979148697306992, "grad_norm": 21.375, "learning_rate": 1.5606429615197275e-05, "loss": 34.5353, "step": 680 }, { "epoch": 0.3001054202434249, "grad_norm": 19.28125, "learning_rate": 1.5557720409157334e-05, "loss": 34.1415, "step": 685 }, { "epoch": 0.30229597075615067, "grad_norm": 46.0625, "learning_rate": 1.550901120311739e-05, "loss": 34.5647, "step": 690 }, { "epoch": 0.3044865212688764, "grad_norm": 19.796875, "learning_rate": 1.546030199707745e-05, "loss": 35.0429, "step": 695 }, { "epoch": 0.30667707178160214, "grad_norm": 79.375, "learning_rate": 
1.5411592791037505e-05, "loss": 33.6274, "step": 700 }, { "epoch": 0.30886762229432785, "grad_norm": 51.71875, "learning_rate": 1.5362883584997564e-05, "loss": 35.4228, "step": 705 }, { "epoch": 0.31105817280705356, "grad_norm": 27.234375, "learning_rate": 1.5314174378957623e-05, "loss": 35.1287, "step": 710 }, { "epoch": 0.3132487233197793, "grad_norm": 30.390625, "learning_rate": 1.5265465172917682e-05, "loss": 34.1134, "step": 715 }, { "epoch": 0.31543927383250503, "grad_norm": 27.359375, "learning_rate": 1.5216755966877742e-05, "loss": 34.4372, "step": 720 }, { "epoch": 0.31762982434523074, "grad_norm": 25.90625, "learning_rate": 1.51680467608378e-05, "loss": 34.7943, "step": 725 }, { "epoch": 0.3198203748579565, "grad_norm": 31.765625, "learning_rate": 1.5119337554797857e-05, "loss": 34.0021, "step": 730 }, { "epoch": 0.3220109253706822, "grad_norm": 32.625, "learning_rate": 1.5070628348757916e-05, "loss": 34.3029, "step": 735 }, { "epoch": 0.324201475883408, "grad_norm": 26.109375, "learning_rate": 1.5021919142717975e-05, "loss": 34.2254, "step": 740 }, { "epoch": 0.3263920263961337, "grad_norm": 32.65625, "learning_rate": 1.4973209936678033e-05, "loss": 34.6909, "step": 745 }, { "epoch": 0.3285825769088594, "grad_norm": 26.890625, "learning_rate": 1.4924500730638092e-05, "loss": 34.4121, "step": 750 }, { "epoch": 0.33077312742158516, "grad_norm": 28.453125, "learning_rate": 1.4875791524598151e-05, "loss": 34.1999, "step": 755 }, { "epoch": 0.33296367793431086, "grad_norm": 24.25, "learning_rate": 1.4827082318558209e-05, "loss": 34.5926, "step": 760 }, { "epoch": 0.3351542284470366, "grad_norm": 29.4375, "learning_rate": 1.4778373112518268e-05, "loss": 34.2804, "step": 765 }, { "epoch": 0.33734477895976234, "grad_norm": 36.125, "learning_rate": 1.4729663906478327e-05, "loss": 33.5547, "step": 770 }, { "epoch": 0.33953532947248805, "grad_norm": 19.875, "learning_rate": 1.4680954700438383e-05, "loss": 33.3131, "step": 775 }, { "epoch": 0.3417258799852138, 
"grad_norm": 37.09375, "learning_rate": 1.4632245494398442e-05, "loss": 33.288, "step": 780 }, { "epoch": 0.3439164304979395, "grad_norm": 32.09375, "learning_rate": 1.4583536288358501e-05, "loss": 34.4507, "step": 785 }, { "epoch": 0.3461069810106652, "grad_norm": 27.0625, "learning_rate": 1.4534827082318559e-05, "loss": 34.0287, "step": 790 }, { "epoch": 0.348297531523391, "grad_norm": 34.59375, "learning_rate": 1.4486117876278618e-05, "loss": 33.4372, "step": 795 }, { "epoch": 0.3504880820361167, "grad_norm": 33.84375, "learning_rate": 1.4437408670238677e-05, "loss": 33.7221, "step": 800 }, { "epoch": 0.3526786325488424, "grad_norm": 30.1875, "learning_rate": 1.4388699464198735e-05, "loss": 33.3108, "step": 805 }, { "epoch": 0.35486918306156817, "grad_norm": 39.8125, "learning_rate": 1.4339990258158794e-05, "loss": 33.231, "step": 810 }, { "epoch": 0.3570597335742939, "grad_norm": 30.296875, "learning_rate": 1.429128105211885e-05, "loss": 33.9207, "step": 815 }, { "epoch": 0.35925028408701964, "grad_norm": 23.484375, "learning_rate": 1.4242571846078909e-05, "loss": 33.4625, "step": 820 }, { "epoch": 0.36144083459974535, "grad_norm": 32.15625, "learning_rate": 1.4193862640038968e-05, "loss": 33.2117, "step": 825 }, { "epoch": 0.36363138511247106, "grad_norm": 548.5, "learning_rate": 1.4145153433999026e-05, "loss": 33.2816, "step": 830 }, { "epoch": 0.3658219356251968, "grad_norm": 46.03125, "learning_rate": 1.4096444227959085e-05, "loss": 32.2993, "step": 835 }, { "epoch": 0.36801248613792253, "grad_norm": 25.96875, "learning_rate": 1.4047735021919144e-05, "loss": 33.0793, "step": 840 }, { "epoch": 0.37020303665064824, "grad_norm": 45.6875, "learning_rate": 1.3999025815879202e-05, "loss": 32.4659, "step": 845 }, { "epoch": 0.372393587163374, "grad_norm": 48.03125, "learning_rate": 1.3950316609839261e-05, "loss": 32.3893, "step": 850 }, { "epoch": 0.3745841376760997, "grad_norm": 28.9375, "learning_rate": 1.390160740379932e-05, "loss": 32.7858, "step": 855 }, { 
"epoch": 0.3767746881888255, "grad_norm": 39.90625, "learning_rate": 1.3852898197759376e-05, "loss": 32.1438, "step": 860 }, { "epoch": 0.3789652387015512, "grad_norm": 51.34375, "learning_rate": 1.3804188991719435e-05, "loss": 32.3068, "step": 865 }, { "epoch": 0.3811557892142769, "grad_norm": 25.125, "learning_rate": 1.3755479785679495e-05, "loss": 32.0212, "step": 870 }, { "epoch": 0.38334633972700266, "grad_norm": 41.96875, "learning_rate": 1.3706770579639552e-05, "loss": 32.0489, "step": 875 }, { "epoch": 0.38553689023972837, "grad_norm": 36.25, "learning_rate": 1.3658061373599611e-05, "loss": 31.558, "step": 880 }, { "epoch": 0.3877274407524541, "grad_norm": 39.5625, "learning_rate": 1.360935216755967e-05, "loss": 31.9841, "step": 885 }, { "epoch": 0.38991799126517984, "grad_norm": 26.921875, "learning_rate": 1.3560642961519728e-05, "loss": 33.2559, "step": 890 }, { "epoch": 0.39210854177790555, "grad_norm": 49.65625, "learning_rate": 1.3511933755479787e-05, "loss": 33.6144, "step": 895 }, { "epoch": 0.3942990922906313, "grad_norm": 51.8125, "learning_rate": 1.3463224549439847e-05, "loss": 33.1955, "step": 900 }, { "epoch": 0.396489642803357, "grad_norm": 46.28125, "learning_rate": 1.3414515343399902e-05, "loss": 32.873, "step": 905 }, { "epoch": 0.39868019331608273, "grad_norm": 26.859375, "learning_rate": 1.3365806137359962e-05, "loss": 31.7345, "step": 910 }, { "epoch": 0.4008707438288085, "grad_norm": 34.1875, "learning_rate": 1.331709693132002e-05, "loss": 31.5948, "step": 915 }, { "epoch": 0.4030612943415342, "grad_norm": 21.265625, "learning_rate": 1.3268387725280078e-05, "loss": 31.6987, "step": 920 }, { "epoch": 0.4052518448542599, "grad_norm": 28.09375, "learning_rate": 1.3219678519240138e-05, "loss": 32.0258, "step": 925 }, { "epoch": 0.4074423953669857, "grad_norm": 108.375, "learning_rate": 1.3170969313200197e-05, "loss": 32.0971, "step": 930 }, { "epoch": 0.4096329458797114, "grad_norm": 28.53125, "learning_rate": 1.3122260107160254e-05, "loss": 
32.1107, "step": 935 }, { "epoch": 0.41182349639243715, "grad_norm": 22.46875, "learning_rate": 1.3073550901120314e-05, "loss": 31.9156, "step": 940 }, { "epoch": 0.41401404690516286, "grad_norm": 17.734375, "learning_rate": 1.302484169508037e-05, "loss": 31.5545, "step": 945 }, { "epoch": 0.41620459741788857, "grad_norm": 23.25, "learning_rate": 1.2976132489040429e-05, "loss": 31.4469, "step": 950 }, { "epoch": 0.41839514793061433, "grad_norm": 34.84375, "learning_rate": 1.2927423283000488e-05, "loss": 31.7367, "step": 955 }, { "epoch": 0.42058569844334004, "grad_norm": 20.53125, "learning_rate": 1.2878714076960545e-05, "loss": 32.3651, "step": 960 }, { "epoch": 0.42277624895606575, "grad_norm": 21.546875, "learning_rate": 1.2830004870920605e-05, "loss": 32.3428, "step": 965 }, { "epoch": 0.4249667994687915, "grad_norm": 22.328125, "learning_rate": 1.2781295664880664e-05, "loss": 32.0113, "step": 970 }, { "epoch": 0.4271573499815172, "grad_norm": 26.34375, "learning_rate": 1.2732586458840721e-05, "loss": 31.9163, "step": 975 }, { "epoch": 0.429347900494243, "grad_norm": 22.171875, "learning_rate": 1.268387725280078e-05, "loss": 32.0345, "step": 980 }, { "epoch": 0.4315384510069687, "grad_norm": 106.3125, "learning_rate": 1.263516804676084e-05, "loss": 32.1245, "step": 985 }, { "epoch": 0.4337290015196944, "grad_norm": 21.1875, "learning_rate": 1.2586458840720897e-05, "loss": 32.4236, "step": 990 }, { "epoch": 0.43591955203242017, "grad_norm": 34.84375, "learning_rate": 1.2537749634680957e-05, "loss": 31.7463, "step": 995 }, { "epoch": 0.4381101025451459, "grad_norm": 34.875, "learning_rate": 1.2489040428641016e-05, "loss": 32.5371, "step": 1000 }, { "epoch": 0.4381101025451459, "eval_loss": null, "eval_runtime": 244.1573, "eval_samples_per_second": 1007.682, "eval_steps_per_second": 31.492, "step": 1000 }, { "epoch": 0.4403006530578716, "grad_norm": 46.15625, "learning_rate": 1.2440331222601072e-05, "loss": 32.3911, "step": 1005 }, { "epoch": 0.44249120357059735, 
"grad_norm": 35.28125, "learning_rate": 1.2391622016561131e-05, "loss": 31.6862, "step": 1010 }, { "epoch": 0.44468175408332306, "grad_norm": 34.71875, "learning_rate": 1.234291281052119e-05, "loss": 31.7585, "step": 1015 }, { "epoch": 0.4468723045960488, "grad_norm": 133.875, "learning_rate": 1.2294203604481248e-05, "loss": 29.9359, "step": 1020 }, { "epoch": 0.44906285510877453, "grad_norm": 46.96875, "learning_rate": 1.2245494398441307e-05, "loss": 31.4081, "step": 1025 }, { "epoch": 0.45125340562150024, "grad_norm": 38.0, "learning_rate": 1.2196785192401366e-05, "loss": 31.1994, "step": 1030 }, { "epoch": 0.453443956134226, "grad_norm": 24.3125, "learning_rate": 1.2148075986361424e-05, "loss": 31.7705, "step": 1035 }, { "epoch": 0.4556345066469517, "grad_norm": 71.625, "learning_rate": 1.2099366780321483e-05, "loss": 29.8065, "step": 1040 }, { "epoch": 0.4578250571596774, "grad_norm": 28.515625, "learning_rate": 1.2050657574281542e-05, "loss": 30.1924, "step": 1045 }, { "epoch": 0.4600156076724032, "grad_norm": 80.0625, "learning_rate": 1.2001948368241598e-05, "loss": 30.6224, "step": 1050 }, { "epoch": 0.4622061581851289, "grad_norm": 20.65625, "learning_rate": 1.1953239162201657e-05, "loss": 31.4722, "step": 1055 }, { "epoch": 0.46439670869785465, "grad_norm": 44.75, "learning_rate": 1.1904529956161715e-05, "loss": 31.1045, "step": 1060 }, { "epoch": 0.46658725921058036, "grad_norm": 23.40625, "learning_rate": 1.1855820750121774e-05, "loss": 31.4498, "step": 1065 }, { "epoch": 0.46877780972330607, "grad_norm": 21.90625, "learning_rate": 1.1807111544081833e-05, "loss": 31.3105, "step": 1070 }, { "epoch": 0.47096836023603184, "grad_norm": 34.6875, "learning_rate": 1.175840233804189e-05, "loss": 31.2846, "step": 1075 }, { "epoch": 0.47315891074875754, "grad_norm": 19.625, "learning_rate": 1.170969313200195e-05, "loss": 31.1811, "step": 1080 }, { "epoch": 0.47534946126148325, "grad_norm": 33.6875, "learning_rate": 1.1660983925962009e-05, "loss": 31.6749, "step": 
1085 }, { "epoch": 0.477540011774209, "grad_norm": 26.71875, "learning_rate": 1.1612274719922065e-05, "loss": 31.7564, "step": 1090 }, { "epoch": 0.4797305622869347, "grad_norm": 21.59375, "learning_rate": 1.1563565513882124e-05, "loss": 31.3069, "step": 1095 }, { "epoch": 0.4819211127996605, "grad_norm": 20.796875, "learning_rate": 1.1514856307842183e-05, "loss": 30.9148, "step": 1100 }, { "epoch": 0.4841116633123862, "grad_norm": 23.609375, "learning_rate": 1.1466147101802241e-05, "loss": 31.3271, "step": 1105 }, { "epoch": 0.4863022138251119, "grad_norm": 19.890625, "learning_rate": 1.14174378957623e-05, "loss": 30.8132, "step": 1110 }, { "epoch": 0.48849276433783767, "grad_norm": 27.265625, "learning_rate": 1.136872868972236e-05, "loss": 30.7294, "step": 1115 }, { "epoch": 0.4906833148505634, "grad_norm": 24.921875, "learning_rate": 1.1320019483682417e-05, "loss": 31.5751, "step": 1120 }, { "epoch": 0.4928738653632891, "grad_norm": 55.25, "learning_rate": 1.1271310277642476e-05, "loss": 31.0008, "step": 1125 }, { "epoch": 0.49506441587601485, "grad_norm": 31.453125, "learning_rate": 1.1222601071602535e-05, "loss": 30.4025, "step": 1130 }, { "epoch": 0.49725496638874056, "grad_norm": 31.625, "learning_rate": 1.1173891865562591e-05, "loss": 30.8471, "step": 1135 }, { "epoch": 0.4994455169014663, "grad_norm": 32.59375, "learning_rate": 1.112518265952265e-05, "loss": 30.6139, "step": 1140 }, { "epoch": 0.501636067414192, "grad_norm": 38.8125, "learning_rate": 1.107647345348271e-05, "loss": 30.6423, "step": 1145 }, { "epoch": 0.5038266179269177, "grad_norm": 18.953125, "learning_rate": 1.1027764247442767e-05, "loss": 30.5779, "step": 1150 }, { "epoch": 0.5060171684396435, "grad_norm": 28.640625, "learning_rate": 1.0979055041402826e-05, "loss": 30.5233, "step": 1155 }, { "epoch": 0.5082077189523693, "grad_norm": 37.21875, "learning_rate": 1.0930345835362886e-05, "loss": 30.4697, "step": 1160 }, { "epoch": 0.5103982694650949, "grad_norm": 21.59375, "learning_rate": 
1.0881636629322943e-05, "loss": 30.9171, "step": 1165 }, { "epoch": 0.5125888199778207, "grad_norm": 22.46875, "learning_rate": 1.0832927423283002e-05, "loss": 30.2928, "step": 1170 }, { "epoch": 0.5147793704905465, "grad_norm": 40.125, "learning_rate": 1.0784218217243058e-05, "loss": 30.513, "step": 1175 }, { "epoch": 0.5169699210032721, "grad_norm": 23.9375, "learning_rate": 1.0735509011203117e-05, "loss": 30.9489, "step": 1180 }, { "epoch": 0.5191604715159979, "grad_norm": 37.78125, "learning_rate": 1.0686799805163177e-05, "loss": 30.9469, "step": 1185 }, { "epoch": 0.5213510220287236, "grad_norm": 33.3125, "learning_rate": 1.0638090599123234e-05, "loss": 30.9858, "step": 1190 }, { "epoch": 0.5235415725414493, "grad_norm": 27.046875, "learning_rate": 1.0589381393083293e-05, "loss": 30.1304, "step": 1195 }, { "epoch": 0.525732123054175, "grad_norm": 32.75, "learning_rate": 1.0540672187043353e-05, "loss": 30.1871, "step": 1200 }, { "epoch": 0.5279226735669008, "grad_norm": 29.984375, "learning_rate": 1.049196298100341e-05, "loss": 30.462, "step": 1205 }, { "epoch": 0.5301132240796265, "grad_norm": 32.1875, "learning_rate": 1.044325377496347e-05, "loss": 29.9546, "step": 1210 }, { "epoch": 0.5323037745923522, "grad_norm": 30.5625, "learning_rate": 1.0394544568923529e-05, "loss": 29.1937, "step": 1215 }, { "epoch": 0.534494325105078, "grad_norm": 36.1875, "learning_rate": 1.0345835362883584e-05, "loss": 29.6412, "step": 1220 }, { "epoch": 0.5366848756178036, "grad_norm": 23.25, "learning_rate": 1.0297126156843644e-05, "loss": 29.5891, "step": 1225 }, { "epoch": 0.5388754261305294, "grad_norm": 21.703125, "learning_rate": 1.0248416950803703e-05, "loss": 29.6652, "step": 1230 }, { "epoch": 0.5410659766432552, "grad_norm": 34.4375, "learning_rate": 1.019970774476376e-05, "loss": 29.9832, "step": 1235 }, { "epoch": 0.5432565271559809, "grad_norm": 35.84375, "learning_rate": 1.015099853872382e-05, "loss": 29.51, "step": 1240 }, { "epoch": 0.5454470776687066, "grad_norm": 
63.8125, "learning_rate": 1.0102289332683879e-05, "loss": 29.3875, "step": 1245 }, { "epoch": 0.5476376281814324, "grad_norm": 22.28125, "learning_rate": 1.0053580126643936e-05, "loss": 29.7568, "step": 1250 }, { "epoch": 0.5498281786941581, "grad_norm": 27.265625, "learning_rate": 1.0004870920603996e-05, "loss": 29.1113, "step": 1255 }, { "epoch": 0.5520187292068838, "grad_norm": 19.625, "learning_rate": 9.956161714564053e-06, "loss": 29.71, "step": 1260 }, { "epoch": 0.5542092797196095, "grad_norm": 41.4375, "learning_rate": 9.90745250852411e-06, "loss": 30.138, "step": 1265 }, { "epoch": 0.5563998302323353, "grad_norm": 27.8125, "learning_rate": 9.85874330248417e-06, "loss": 30.3684, "step": 1270 }, { "epoch": 0.558590380745061, "grad_norm": 23.828125, "learning_rate": 9.810034096444229e-06, "loss": 29.5932, "step": 1275 }, { "epoch": 0.5607809312577867, "grad_norm": 26.140625, "learning_rate": 9.761324890404287e-06, "loss": 30.614, "step": 1280 }, { "epoch": 0.5629714817705125, "grad_norm": 39.875, "learning_rate": 9.712615684364346e-06, "loss": 30.409, "step": 1285 }, { "epoch": 0.5651620322832381, "grad_norm": 34.78125, "learning_rate": 9.663906478324403e-06, "loss": 29.0909, "step": 1290 }, { "epoch": 0.5673525827959639, "grad_norm": 23.21875, "learning_rate": 9.615197272284463e-06, "loss": 29.2596, "step": 1295 }, { "epoch": 0.5695431333086897, "grad_norm": 54.84375, "learning_rate": 9.566488066244522e-06, "loss": 28.9521, "step": 1300 }, { "epoch": 0.5717336838214153, "grad_norm": 18.9375, "learning_rate": 9.51777886020458e-06, "loss": 29.0617, "step": 1305 }, { "epoch": 0.5739242343341411, "grad_norm": 39.84375, "learning_rate": 9.469069654164637e-06, "loss": 29.763, "step": 1310 }, { "epoch": 0.5761147848468668, "grad_norm": 21.21875, "learning_rate": 9.420360448124696e-06, "loss": 29.089, "step": 1315 }, { "epoch": 0.5783053353595926, "grad_norm": 43.6875, "learning_rate": 9.371651242084755e-06, "loss": 28.8189, "step": 1320 }, { "epoch": 
0.5804958858723183, "grad_norm": 24.25, "learning_rate": 9.322942036044813e-06, "loss": 29.4879, "step": 1325 }, { "epoch": 0.582686436385044, "grad_norm": 44.21875, "learning_rate": 9.27423283000487e-06, "loss": 28.8948, "step": 1330 }, { "epoch": 0.5848769868977698, "grad_norm": 64.5625, "learning_rate": 9.22552362396493e-06, "loss": 30.1701, "step": 1335 }, { "epoch": 0.5870675374104954, "grad_norm": 34.375, "learning_rate": 9.176814417924989e-06, "loss": 30.0669, "step": 1340 }, { "epoch": 0.5892580879232212, "grad_norm": 22.421875, "learning_rate": 9.128105211885046e-06, "loss": 29.2547, "step": 1345 }, { "epoch": 0.591448638435947, "grad_norm": 20.875, "learning_rate": 9.079396005845106e-06, "loss": 29.5238, "step": 1350 }, { "epoch": 0.5936391889486726, "grad_norm": 25.234375, "learning_rate": 9.030686799805163e-06, "loss": 28.7198, "step": 1355 }, { "epoch": 0.5958297394613984, "grad_norm": 44.46875, "learning_rate": 8.981977593765222e-06, "loss": 28.738, "step": 1360 }, { "epoch": 0.5980202899741242, "grad_norm": 29.609375, "learning_rate": 8.933268387725282e-06, "loss": 28.7452, "step": 1365 }, { "epoch": 0.6002108404868498, "grad_norm": 59.25, "learning_rate": 8.884559181685339e-06, "loss": 28.8729, "step": 1370 }, { "epoch": 0.6024013909995756, "grad_norm": 21.59375, "learning_rate": 8.835849975645398e-06, "loss": 29.592, "step": 1375 }, { "epoch": 0.6045919415123013, "grad_norm": 24.625, "learning_rate": 8.787140769605456e-06, "loss": 28.8652, "step": 1380 }, { "epoch": 0.606782492025027, "grad_norm": 48.9375, "learning_rate": 8.738431563565515e-06, "loss": 28.7544, "step": 1385 }, { "epoch": 0.6089730425377528, "grad_norm": 24.203125, "learning_rate": 8.689722357525573e-06, "loss": 28.9991, "step": 1390 }, { "epoch": 0.6111635930504785, "grad_norm": 51.75, "learning_rate": 8.641013151485632e-06, "loss": 29.0338, "step": 1395 }, { "epoch": 0.6133541435632043, "grad_norm": 34.40625, "learning_rate": 8.592303945445691e-06, "loss": 28.9075, "step": 1400 
}, { "epoch": 0.6155446940759299, "grad_norm": 26.78125, "learning_rate": 8.543594739405749e-06, "loss": 29.1133, "step": 1405 }, { "epoch": 0.6177352445886557, "grad_norm": 19.25, "learning_rate": 8.494885533365806e-06, "loss": 29.0613, "step": 1410 }, { "epoch": 0.6199257951013815, "grad_norm": 26.421875, "learning_rate": 8.446176327325865e-06, "loss": 28.7795, "step": 1415 }, { "epoch": 0.6221163456141071, "grad_norm": 39.3125, "learning_rate": 8.397467121285925e-06, "loss": 28.8834, "step": 1420 }, { "epoch": 0.6243068961268329, "grad_norm": 23.703125, "learning_rate": 8.348757915245982e-06, "loss": 29.5495, "step": 1425 }, { "epoch": 0.6264974466395586, "grad_norm": 27.6875, "learning_rate": 8.300048709206041e-06, "loss": 29.0528, "step": 1430 }, { "epoch": 0.6286879971522843, "grad_norm": 33.65625, "learning_rate": 8.251339503166099e-06, "loss": 28.5143, "step": 1435 }, { "epoch": 0.6308785476650101, "grad_norm": 23.296875, "learning_rate": 8.202630297126158e-06, "loss": 27.779, "step": 1440 }, { "epoch": 0.6330690981777358, "grad_norm": 30.484375, "learning_rate": 8.153921091086216e-06, "loss": 28.6134, "step": 1445 }, { "epoch": 0.6352596486904615, "grad_norm": 23.546875, "learning_rate": 8.105211885046275e-06, "loss": 28.9237, "step": 1450 }, { "epoch": 0.6374501992031872, "grad_norm": 22.953125, "learning_rate": 8.056502679006332e-06, "loss": 28.2114, "step": 1455 }, { "epoch": 0.639640749715913, "grad_norm": 21.5, "learning_rate": 8.007793472966392e-06, "loss": 27.6828, "step": 1460 }, { "epoch": 0.6418313002286387, "grad_norm": 23.8125, "learning_rate": 7.95908426692645e-06, "loss": 28.4943, "step": 1465 }, { "epoch": 0.6440218507413644, "grad_norm": 26.578125, "learning_rate": 7.910375060886508e-06, "loss": 28.6824, "step": 1470 }, { "epoch": 0.6462124012540902, "grad_norm": 23.59375, "learning_rate": 7.861665854846566e-06, "loss": 28.4058, "step": 1475 }, { "epoch": 0.648402951766816, "grad_norm": 24.984375, "learning_rate": 7.812956648806625e-06, 
"loss": 28.8576, "step": 1480 }, { "epoch": 0.6505935022795416, "grad_norm": 21.21875, "learning_rate": 7.764247442766684e-06, "loss": 28.6734, "step": 1485 }, { "epoch": 0.6527840527922674, "grad_norm": 21.4375, "learning_rate": 7.715538236726742e-06, "loss": 27.8819, "step": 1490 }, { "epoch": 0.6549746033049931, "grad_norm": 20.3125, "learning_rate": 7.666829030686801e-06, "loss": 27.6257, "step": 1495 }, { "epoch": 0.6571651538177188, "grad_norm": 23.8125, "learning_rate": 7.6181198246468595e-06, "loss": 28.4266, "step": 1500 }, { "epoch": 0.6593557043304445, "grad_norm": 20.921875, "learning_rate": 7.569410618606917e-06, "loss": 27.6938, "step": 1505 }, { "epoch": 0.6615462548431703, "grad_norm": 20.96875, "learning_rate": 7.520701412566975e-06, "loss": 27.8491, "step": 1510 }, { "epoch": 0.663736805355896, "grad_norm": 38.53125, "learning_rate": 7.471992206527035e-06, "loss": 28.2246, "step": 1515 }, { "epoch": 0.6659273558686217, "grad_norm": 21.109375, "learning_rate": 7.423283000487093e-06, "loss": 27.786, "step": 1520 }, { "epoch": 0.6681179063813475, "grad_norm": 91.625, "learning_rate": 7.3745737944471505e-06, "loss": 28.1847, "step": 1525 }, { "epoch": 0.6703084568940731, "grad_norm": 20.703125, "learning_rate": 7.32586458840721e-06, "loss": 28.6043, "step": 1530 }, { "epoch": 0.6724990074067989, "grad_norm": 29.953125, "learning_rate": 7.277155382367268e-06, "loss": 28.3078, "step": 1535 }, { "epoch": 0.6746895579195247, "grad_norm": 21.296875, "learning_rate": 7.2284461763273265e-06, "loss": 28.0411, "step": 1540 }, { "epoch": 0.6768801084322503, "grad_norm": 82.5625, "learning_rate": 7.179736970287386e-06, "loss": 27.8437, "step": 1545 }, { "epoch": 0.6790706589449761, "grad_norm": 23.421875, "learning_rate": 7.131027764247443e-06, "loss": 28.013, "step": 1550 }, { "epoch": 0.6812612094577019, "grad_norm": 21.828125, "learning_rate": 7.082318558207502e-06, "loss": 28.3346, "step": 1555 }, { "epoch": 0.6834517599704276, "grad_norm": 27.734375, 
"learning_rate": 7.03360935216756e-06, "loss": 28.0322, "step": 1560 }, { "epoch": 0.6856423104831533, "grad_norm": 22.5625, "learning_rate": 6.984900146127619e-06, "loss": 27.7644, "step": 1565 }, { "epoch": 0.687832860995879, "grad_norm": 21.765625, "learning_rate": 6.936190940087677e-06, "loss": 28.3414, "step": 1570 }, { "epoch": 0.6900234115086048, "grad_norm": 26.09375, "learning_rate": 6.887481734047735e-06, "loss": 28.3187, "step": 1575 }, { "epoch": 0.6922139620213305, "grad_norm": 17.6875, "learning_rate": 6.838772528007794e-06, "loss": 27.6377, "step": 1580 }, { "epoch": 0.6944045125340562, "grad_norm": 19.546875, "learning_rate": 6.790063321967853e-06, "loss": 28.1024, "step": 1585 }, { "epoch": 0.696595063046782, "grad_norm": 24.625, "learning_rate": 6.74135411592791e-06, "loss": 27.7855, "step": 1590 }, { "epoch": 0.6987856135595076, "grad_norm": 26.78125, "learning_rate": 6.6926449098879695e-06, "loss": 26.9999, "step": 1595 }, { "epoch": 0.7009761640722334, "grad_norm": 26.96875, "learning_rate": 6.643935703848028e-06, "loss": 28.0347, "step": 1600 }, { "epoch": 0.7031667145849592, "grad_norm": 17.875, "learning_rate": 6.595226497808086e-06, "loss": 27.8989, "step": 1605 }, { "epoch": 0.7053572650976848, "grad_norm": 23.9375, "learning_rate": 6.5465172917681454e-06, "loss": 27.7298, "step": 1610 }, { "epoch": 0.7075478156104106, "grad_norm": 30.0625, "learning_rate": 6.497808085728203e-06, "loss": 27.9132, "step": 1615 }, { "epoch": 0.7097383661231363, "grad_norm": 15.4375, "learning_rate": 6.449098879688261e-06, "loss": 27.7105, "step": 1620 }, { "epoch": 0.711928916635862, "grad_norm": 20.140625, "learning_rate": 6.40038967364832e-06, "loss": 27.5961, "step": 1625 }, { "epoch": 0.7141194671485878, "grad_norm": 20.328125, "learning_rate": 6.351680467608379e-06, "loss": 28.1599, "step": 1630 }, { "epoch": 0.7163100176613135, "grad_norm": 34.8125, "learning_rate": 6.3029712615684365e-06, "loss": 27.6126, "step": 1635 }, { "epoch": 0.7185005681740393, 
"grad_norm": 26.0, "learning_rate": 6.254262055528495e-06, "loss": 28.024, "step": 1640 }, { "epoch": 0.7206911186867649, "grad_norm": 21.0625, "learning_rate": 6.205552849488554e-06, "loss": 27.0935, "step": 1645 }, { "epoch": 0.7228816691994907, "grad_norm": 28.6875, "learning_rate": 6.1568436434486125e-06, "loss": 27.4261, "step": 1650 }, { "epoch": 0.7250722197122165, "grad_norm": 19.25, "learning_rate": 6.10813443740867e-06, "loss": 27.7292, "step": 1655 }, { "epoch": 0.7272627702249421, "grad_norm": 16.140625, "learning_rate": 6.059425231368729e-06, "loss": 27.3485, "step": 1660 }, { "epoch": 0.7294533207376679, "grad_norm": 18.484375, "learning_rate": 6.010716025328788e-06, "loss": 27.549, "step": 1665 }, { "epoch": 0.7316438712503937, "grad_norm": 22.984375, "learning_rate": 5.962006819288846e-06, "loss": 27.3251, "step": 1670 }, { "epoch": 0.7338344217631193, "grad_norm": 18.96875, "learning_rate": 5.913297613248905e-06, "loss": 26.5725, "step": 1675 }, { "epoch": 0.7360249722758451, "grad_norm": 21.59375, "learning_rate": 5.8645884072089636e-06, "loss": 26.5293, "step": 1680 }, { "epoch": 0.7382155227885708, "grad_norm": 31.984375, "learning_rate": 5.815879201169021e-06, "loss": 27.2344, "step": 1685 }, { "epoch": 0.7404060733012965, "grad_norm": 27.75, "learning_rate": 5.7671699951290795e-06, "loss": 27.0698, "step": 1690 }, { "epoch": 0.7425966238140222, "grad_norm": 104.0, "learning_rate": 5.718460789089139e-06, "loss": 26.5105, "step": 1695 }, { "epoch": 0.744787174326748, "grad_norm": 21.734375, "learning_rate": 5.669751583049197e-06, "loss": 27.2865, "step": 1700 }, { "epoch": 0.7469777248394737, "grad_norm": 17.609375, "learning_rate": 5.621042377009255e-06, "loss": 26.9174, "step": 1705 }, { "epoch": 0.7491682753521994, "grad_norm": 35.0, "learning_rate": 5.572333170969314e-06, "loss": 27.4106, "step": 1710 }, { "epoch": 0.7513588258649252, "grad_norm": 26.484375, "learning_rate": 5.523623964929372e-06, "loss": 27.5595, "step": 1715 }, { "epoch": 
0.753549376377651, "grad_norm": 23.15625, "learning_rate": 5.474914758889431e-06, "loss": 26.9554, "step": 1720 }, { "epoch": 0.7557399268903766, "grad_norm": 16.375, "learning_rate": 5.42620555284949e-06, "loss": 27.3082, "step": 1725 }, { "epoch": 0.7579304774031024, "grad_norm": 13.671875, "learning_rate": 5.377496346809547e-06, "loss": 26.6982, "step": 1730 }, { "epoch": 0.7601210279158281, "grad_norm": 18.109375, "learning_rate": 5.328787140769606e-06, "loss": 26.8219, "step": 1735 }, { "epoch": 0.7623115784285538, "grad_norm": 18.59375, "learning_rate": 5.280077934729664e-06, "loss": 27.0788, "step": 1740 }, { "epoch": 0.7645021289412796, "grad_norm": 15.15625, "learning_rate": 5.231368728689723e-06, "loss": 27.4361, "step": 1745 }, { "epoch": 0.7666926794540053, "grad_norm": 16.984375, "learning_rate": 5.182659522649781e-06, "loss": 27.1089, "step": 1750 }, { "epoch": 0.768883229966731, "grad_norm": 29.421875, "learning_rate": 5.133950316609839e-06, "loss": 26.8349, "step": 1755 }, { "epoch": 0.7710737804794567, "grad_norm": 15.9140625, "learning_rate": 5.0852411105698985e-06, "loss": 27.002, "step": 1760 }, { "epoch": 0.7732643309921825, "grad_norm": 25.296875, "learning_rate": 5.036531904529957e-06, "loss": 26.6214, "step": 1765 }, { "epoch": 0.7754548815049082, "grad_norm": 20.59375, "learning_rate": 4.987822698490015e-06, "loss": 27.3062, "step": 1770 }, { "epoch": 0.7776454320176339, "grad_norm": 14.0390625, "learning_rate": 4.939113492450074e-06, "loss": 26.6314, "step": 1775 }, { "epoch": 0.7798359825303597, "grad_norm": 18.53125, "learning_rate": 4.890404286410132e-06, "loss": 26.8917, "step": 1780 }, { "epoch": 0.7820265330430853, "grad_norm": 18.6875, "learning_rate": 4.84169508037019e-06, "loss": 26.6654, "step": 1785 }, { "epoch": 0.7842170835558111, "grad_norm": 19.46875, "learning_rate": 4.792985874330249e-06, "loss": 26.5455, "step": 1790 }, { "epoch": 0.7864076340685369, "grad_norm": 16.453125, "learning_rate": 4.744276668290307e-06, "loss": 
26.6663, "step": 1795 }, { "epoch": 0.7885981845812626, "grad_norm": 22.703125, "learning_rate": 4.6955674622503655e-06, "loss": 25.947, "step": 1800 }, { "epoch": 0.7907887350939883, "grad_norm": 22.109375, "learning_rate": 4.646858256210424e-06, "loss": 26.1454, "step": 1805 }, { "epoch": 0.792979285606714, "grad_norm": 18.53125, "learning_rate": 4.598149050170483e-06, "loss": 27.0058, "step": 1810 }, { "epoch": 0.7951698361194398, "grad_norm": 21.203125, "learning_rate": 4.549439844130541e-06, "loss": 26.3893, "step": 1815 }, { "epoch": 0.7973603866321655, "grad_norm": 14.453125, "learning_rate": 4.5007306380906e-06, "loss": 26.0527, "step": 1820 }, { "epoch": 0.7995509371448912, "grad_norm": 33.625, "learning_rate": 4.452021432050657e-06, "loss": 26.2192, "step": 1825 }, { "epoch": 0.801741487657617, "grad_norm": 15.1328125, "learning_rate": 4.403312226010717e-06, "loss": 26.1952, "step": 1830 }, { "epoch": 0.8039320381703426, "grad_norm": 15.03125, "learning_rate": 4.354603019970775e-06, "loss": 25.9795, "step": 1835 }, { "epoch": 0.8061225886830684, "grad_norm": 14.671875, "learning_rate": 4.305893813930833e-06, "loss": 26.3182, "step": 1840 }, { "epoch": 0.8083131391957942, "grad_norm": 17.84375, "learning_rate": 4.257184607890892e-06, "loss": 25.6861, "step": 1845 }, { "epoch": 0.8105036897085198, "grad_norm": 16.875, "learning_rate": 4.20847540185095e-06, "loss": 26.4041, "step": 1850 }, { "epoch": 0.8126942402212456, "grad_norm": 55.34375, "learning_rate": 4.1597661958110085e-06, "loss": 26.3603, "step": 1855 }, { "epoch": 0.8148847907339714, "grad_norm": 29.40625, "learning_rate": 4.111056989771067e-06, "loss": 26.6706, "step": 1860 }, { "epoch": 0.817075341246697, "grad_norm": 35.03125, "learning_rate": 4.062347783731125e-06, "loss": 26.5124, "step": 1865 }, { "epoch": 0.8192658917594228, "grad_norm": 19.234375, "learning_rate": 4.0136385776911845e-06, "loss": 26.0298, "step": 1870 }, { "epoch": 0.8214564422721485, "grad_norm": 41.5, "learning_rate": 
3.964929371651242e-06, "loss": 25.2979, "step": 1875 }, { "epoch": 0.8236469927848743, "grad_norm": 16.015625, "learning_rate": 3.916220165611301e-06, "loss": 26.1311, "step": 1880 }, { "epoch": 0.8258375432976, "grad_norm": 16.96875, "learning_rate": 3.86751095957136e-06, "loss": 26.0964, "step": 1885 }, { "epoch": 0.8280280938103257, "grad_norm": 25.9375, "learning_rate": 3.818801753531418e-06, "loss": 25.4731, "step": 1890 }, { "epoch": 0.8302186443230515, "grad_norm": 20.765625, "learning_rate": 3.7700925474914763e-06, "loss": 25.6541, "step": 1895 }, { "epoch": 0.8324091948357771, "grad_norm": 19.0, "learning_rate": 3.7213833414515347e-06, "loss": 25.302, "step": 1900 }, { "epoch": 0.8345997453485029, "grad_norm": 15.953125, "learning_rate": 3.672674135411593e-06, "loss": 25.319, "step": 1905 }, { "epoch": 0.8367902958612287, "grad_norm": 11.4140625, "learning_rate": 3.6239649293716515e-06, "loss": 25.2675, "step": 1910 }, { "epoch": 0.8389808463739543, "grad_norm": 46.0625, "learning_rate": 3.57525572333171e-06, "loss": 25.9301, "step": 1915 }, { "epoch": 0.8411713968866801, "grad_norm": 14.3828125, "learning_rate": 3.5265465172917687e-06, "loss": 26.4109, "step": 1920 }, { "epoch": 0.8433619473994058, "grad_norm": 21.921875, "learning_rate": 3.477837311251827e-06, "loss": 25.9918, "step": 1925 }, { "epoch": 0.8455524979121315, "grad_norm": 15.4453125, "learning_rate": 3.4291281052118854e-06, "loss": 25.5853, "step": 1930 }, { "epoch": 0.8477430484248573, "grad_norm": 12.71875, "learning_rate": 3.3804188991719438e-06, "loss": 25.7727, "step": 1935 }, { "epoch": 0.849933598937583, "grad_norm": 19.78125, "learning_rate": 3.331709693132002e-06, "loss": 25.9744, "step": 1940 }, { "epoch": 0.8521241494503087, "grad_norm": 14.4609375, "learning_rate": 3.2830004870920605e-06, "loss": 25.48, "step": 1945 }, { "epoch": 0.8543146999630344, "grad_norm": 14.015625, "learning_rate": 3.2342912810521193e-06, "loss": 25.2711, "step": 1950 }, { "epoch": 0.8565052504757602, 
"grad_norm": 16.9375, "learning_rate": 3.1855820750121773e-06, "loss": 26.5614, "step": 1955 }, { "epoch": 0.858695800988486, "grad_norm": 16.546875, "learning_rate": 3.136872868972236e-06, "loss": 25.3045, "step": 1960 }, { "epoch": 0.8608863515012116, "grad_norm": 14.890625, "learning_rate": 3.088163662932294e-06, "loss": 26.0494, "step": 1965 }, { "epoch": 0.8630769020139374, "grad_norm": 14.421875, "learning_rate": 3.039454456892353e-06, "loss": 25.8255, "step": 1970 }, { "epoch": 0.8652674525266632, "grad_norm": 13.3984375, "learning_rate": 2.9907452508524117e-06, "loss": 25.0552, "step": 1975 }, { "epoch": 0.8674580030393888, "grad_norm": 15.1953125, "learning_rate": 2.9420360448124696e-06, "loss": 25.1982, "step": 1980 }, { "epoch": 0.8696485535521146, "grad_norm": 24.96875, "learning_rate": 2.8933268387725284e-06, "loss": 25.4527, "step": 1985 }, { "epoch": 0.8718391040648403, "grad_norm": 61.21875, "learning_rate": 2.8446176327325868e-06, "loss": 24.8295, "step": 1990 }, { "epoch": 0.874029654577566, "grad_norm": 15.8125, "learning_rate": 2.795908426692645e-06, "loss": 24.9091, "step": 1995 }, { "epoch": 0.8762202050902917, "grad_norm": 13.40625, "learning_rate": 2.7471992206527035e-06, "loss": 25.4313, "step": 2000 }, { "epoch": 0.8762202050902917, "eval_loss": null, "eval_runtime": 242.6903, "eval_samples_per_second": 1013.774, "eval_steps_per_second": 31.682, "step": 2000 }, { "epoch": 0.8784107556030175, "grad_norm": 20.109375, "learning_rate": 2.698490014612762e-06, "loss": 25.8034, "step": 2005 }, { "epoch": 0.8806013061157432, "grad_norm": 10.953125, "learning_rate": 2.6497808085728203e-06, "loss": 25.0685, "step": 2010 }, { "epoch": 0.8827918566284689, "grad_norm": 31.0625, "learning_rate": 2.601071602532879e-06, "loss": 25.7096, "step": 2015 }, { "epoch": 0.8849824071411947, "grad_norm": 24.046875, "learning_rate": 2.5523623964929375e-06, "loss": 25.3376, "step": 2020 }, { "epoch": 0.8871729576539203, "grad_norm": 27.65625, "learning_rate": 
2.503653190452996e-06, "loss": 24.6723, "step": 2025 }, { "epoch": 0.8893635081666461, "grad_norm": 12.5078125, "learning_rate": 2.4549439844130542e-06, "loss": 25.1757, "step": 2030 }, { "epoch": 0.8915540586793719, "grad_norm": 15.1015625, "learning_rate": 2.4062347783731126e-06, "loss": 25.6934, "step": 2035 }, { "epoch": 0.8937446091920976, "grad_norm": 28.75, "learning_rate": 2.357525572333171e-06, "loss": 25.4336, "step": 2040 }, { "epoch": 0.8959351597048233, "grad_norm": 18.53125, "learning_rate": 2.3088163662932294e-06, "loss": 24.4317, "step": 2045 }, { "epoch": 0.8981257102175491, "grad_norm": 15.265625, "learning_rate": 2.260107160253288e-06, "loss": 24.6006, "step": 2050 }, { "epoch": 0.9003162607302748, "grad_norm": 12.1796875, "learning_rate": 2.2113979542133465e-06, "loss": 24.7936, "step": 2055 }, { "epoch": 0.9025068112430005, "grad_norm": 12.21875, "learning_rate": 2.162688748173405e-06, "loss": 26.129, "step": 2060 }, { "epoch": 0.9046973617557262, "grad_norm": 12.984375, "learning_rate": 2.1139795421334633e-06, "loss": 24.6225, "step": 2065 }, { "epoch": 0.906887912268452, "grad_norm": 15.65625, "learning_rate": 2.065270336093522e-06, "loss": 24.7391, "step": 2070 }, { "epoch": 0.9090784627811777, "grad_norm": 13.3046875, "learning_rate": 2.0165611300535805e-06, "loss": 24.7855, "step": 2075 }, { "epoch": 0.9112690132939034, "grad_norm": 15.015625, "learning_rate": 1.967851924013639e-06, "loss": 24.7362, "step": 2080 }, { "epoch": 0.9134595638066292, "grad_norm": 18.53125, "learning_rate": 1.9191427179736972e-06, "loss": 24.8633, "step": 2085 }, { "epoch": 0.9156501143193548, "grad_norm": 19.0625, "learning_rate": 1.8704335119337556e-06, "loss": 24.7622, "step": 2090 }, { "epoch": 0.9178406648320806, "grad_norm": 22.96875, "learning_rate": 1.8217243058938142e-06, "loss": 24.8469, "step": 2095 }, { "epoch": 0.9200312153448064, "grad_norm": 16.546875, "learning_rate": 1.7730150998538726e-06, "loss": 24.4148, "step": 2100 }, { "epoch": 
0.922221765857532, "grad_norm": 17.15625, "learning_rate": 1.724305893813931e-06, "loss": 24.5911, "step": 2105 }, { "epoch": 0.9244123163702578, "grad_norm": 32.375, "learning_rate": 1.6755966877739893e-06, "loss": 24.3472, "step": 2110 }, { "epoch": 0.9266028668829835, "grad_norm": 11.1796875, "learning_rate": 1.626887481734048e-06, "loss": 24.272, "step": 2115 }, { "epoch": 0.9287934173957093, "grad_norm": 15.828125, "learning_rate": 1.5781782756941063e-06, "loss": 24.1789, "step": 2120 }, { "epoch": 0.930983967908435, "grad_norm": 13.609375, "learning_rate": 1.5294690696541647e-06, "loss": 24.5268, "step": 2125 }, { "epoch": 0.9331745184211607, "grad_norm": 11.7890625, "learning_rate": 1.480759863614223e-06, "loss": 24.4547, "step": 2130 }, { "epoch": 0.9353650689338865, "grad_norm": 14.46875, "learning_rate": 1.4320506575742814e-06, "loss": 24.5342, "step": 2135 }, { "epoch": 0.9375556194466121, "grad_norm": 12.9765625, "learning_rate": 1.3833414515343402e-06, "loss": 24.1581, "step": 2140 }, { "epoch": 0.9397461699593379, "grad_norm": 13.4140625, "learning_rate": 1.3346322454943986e-06, "loss": 24.0952, "step": 2145 }, { "epoch": 0.9419367204720637, "grad_norm": 11.921875, "learning_rate": 1.285923039454457e-06, "loss": 24.6826, "step": 2150 }, { "epoch": 0.9441272709847893, "grad_norm": 13.7734375, "learning_rate": 1.2372138334145156e-06, "loss": 23.6338, "step": 2155 }, { "epoch": 0.9463178214975151, "grad_norm": 12.1015625, "learning_rate": 1.188504627374574e-06, "loss": 24.3637, "step": 2160 }, { "epoch": 0.9485083720102409, "grad_norm": 14.578125, "learning_rate": 1.1397954213346323e-06, "loss": 24.6624, "step": 2165 }, { "epoch": 0.9506989225229665, "grad_norm": 12.7578125, "learning_rate": 1.0910862152946907e-06, "loss": 23.9978, "step": 2170 }, { "epoch": 0.9528894730356923, "grad_norm": 13.2265625, "learning_rate": 1.0423770092547493e-06, "loss": 24.4468, "step": 2175 }, { "epoch": 0.955080023548418, "grad_norm": 11.7890625, "learning_rate": 
9.936678032148077e-07, "loss": 24.303, "step": 2180 }, { "epoch": 0.9572705740611437, "grad_norm": 16.25, "learning_rate": 9.44958597174866e-07, "loss": 24.5248, "step": 2185 }, { "epoch": 0.9594611245738695, "grad_norm": 18.859375, "learning_rate": 8.962493911349246e-07, "loss": 24.4316, "step": 2190 }, { "epoch": 0.9616516750865952, "grad_norm": 10.6796875, "learning_rate": 8.47540185094983e-07, "loss": 23.8851, "step": 2195 }, { "epoch": 0.963842225599321, "grad_norm": 12.140625, "learning_rate": 7.988309790550415e-07, "loss": 24.2927, "step": 2200 }, { "epoch": 0.9660327761120466, "grad_norm": 18.078125, "learning_rate": 7.501217730150999e-07, "loss": 24.1125, "step": 2205 }, { "epoch": 0.9682233266247724, "grad_norm": 9.7109375, "learning_rate": 7.014125669751585e-07, "loss": 24.1763, "step": 2210 }, { "epoch": 0.9704138771374982, "grad_norm": 12.6953125, "learning_rate": 6.527033609352168e-07, "loss": 23.1948, "step": 2215 }, { "epoch": 0.9726044276502238, "grad_norm": 18.671875, "learning_rate": 6.039941548952752e-07, "loss": 24.2813, "step": 2220 }, { "epoch": 0.9747949781629496, "grad_norm": 9.96875, "learning_rate": 5.552849488553337e-07, "loss": 23.7533, "step": 2225 }, { "epoch": 0.9769855286756753, "grad_norm": 13.71875, "learning_rate": 5.065757428153922e-07, "loss": 24.5382, "step": 2230 }, { "epoch": 0.979176079188401, "grad_norm": 13.4296875, "learning_rate": 4.578665367754506e-07, "loss": 23.7636, "step": 2235 }, { "epoch": 0.9813666297011268, "grad_norm": 13.640625, "learning_rate": 4.091573307355091e-07, "loss": 23.785, "step": 2240 }, { "epoch": 0.9835571802138525, "grad_norm": 27.078125, "learning_rate": 3.6044812469556747e-07, "loss": 23.8269, "step": 2245 }, { "epoch": 0.9857477307265782, "grad_norm": 13.4609375, "learning_rate": 3.1173891865562595e-07, "loss": 24.0428, "step": 2250 }, { "epoch": 0.9879382812393039, "grad_norm": 12.109375, "learning_rate": 2.630297126156844e-07, "loss": 24.3293, "step": 2255 }, { "epoch": 0.9901288317520297, 
"grad_norm": 20.84375, "learning_rate": 2.1432050657574284e-07, "loss": 24.0019, "step": 2260 }, { "epoch": 0.9923193822647554, "grad_norm": 16.265625, "learning_rate": 1.6561130053580127e-07, "loss": 24.3604, "step": 2265 }, { "epoch": 0.9945099327774811, "grad_norm": 19.96875, "learning_rate": 1.1690209449585972e-07, "loss": 24.1925, "step": 2270 }, { "epoch": 0.9967004832902069, "grad_norm": 16.953125, "learning_rate": 6.819288845591817e-08, "loss": 23.6996, "step": 2275 }, { "epoch": 0.9988910338029326, "grad_norm": 19.6875, "learning_rate": 1.9483682415976622e-08, "loss": 23.9507, "step": 2280 } ], "logging_steps": 5, "max_steps": 2282, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9772879918220706e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }