diff --git "a/resnet50/checkpoint-148500/trainer_state.json" "b/resnet50/checkpoint-148500/trainer_state.json" new file mode 100644--- /dev/null +++ "b/resnet50/checkpoint-148500/trainer_state.json" @@ -0,0 +1,106657 @@ +{ + "best_global_step": 143500, + "best_metric": 0.9901443377630826, + "best_model_checkpoint": "/workspace/output/resnet50/checkpoint-143500", + "epoch": 21.07877927608233, + "eval_steps": 500, + "global_step": 148500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0014194464158978, + "grad_norm": 3.2342276573181152, + "learning_rate": 9.999872249822569e-05, + "loss": 5.98863525390625, + "step": 10 + }, + { + "epoch": 0.0028388928317956, + "grad_norm": 3.3994972705841064, + "learning_rate": 9.99973030518098e-05, + "loss": 5.97633056640625, + "step": 20 + }, + { + "epoch": 0.0042583392476933995, + "grad_norm": 3.3180341720581055, + "learning_rate": 9.99958836053939e-05, + "loss": 5.97711181640625, + "step": 30 + }, + { + "epoch": 0.0056777856635912, + "grad_norm": 2.9379143714904785, + "learning_rate": 9.999446415897801e-05, + "loss": 5.9991455078125, + "step": 40 + }, + { + "epoch": 0.007097232079488999, + "grad_norm": 2.2698018550872803, + "learning_rate": 9.99930447125621e-05, + "loss": 5.96363525390625, + "step": 50 + }, + { + "epoch": 0.008516678495386799, + "grad_norm": 2.0626659393310547, + "learning_rate": 9.99916252661462e-05, + "loss": 5.96995849609375, + "step": 60 + }, + { + "epoch": 0.0099361249112846, + "grad_norm": 2.814460277557373, + "learning_rate": 9.999020581973031e-05, + "loss": 5.9493408203125, + "step": 70 + }, + { + "epoch": 0.0113555713271824, + "grad_norm": 2.871051788330078, + "learning_rate": 9.998878637331441e-05, + "loss": 5.9510498046875, + "step": 80 + }, + { + "epoch": 0.0127750177430802, + "grad_norm": 2.3897151947021484, + "learning_rate": 9.998736692689852e-05, + "loss": 5.94254150390625, + "step": 90 + }, + { + "epoch": 
0.014194464158977998, + "grad_norm": 2.9910531044006348, + "learning_rate": 9.99859474804826e-05, + "loss": 5.9062255859375, + "step": 100 + }, + { + "epoch": 0.015613910574875798, + "grad_norm": 3.137518882751465, + "learning_rate": 9.998452803406672e-05, + "loss": 5.9070068359375, + "step": 110 + }, + { + "epoch": 0.017033356990773598, + "grad_norm": 3.021024703979492, + "learning_rate": 9.998310858765082e-05, + "loss": 5.87197265625, + "step": 120 + }, + { + "epoch": 0.018452803406671398, + "grad_norm": 3.499450445175171, + "learning_rate": 9.998168914123493e-05, + "loss": 5.8237548828125, + "step": 130 + }, + { + "epoch": 0.0198722498225692, + "grad_norm": 3.87576961517334, + "learning_rate": 9.998026969481902e-05, + "loss": 5.754150390625, + "step": 140 + }, + { + "epoch": 0.021291696238467, + "grad_norm": 3.9846458435058594, + "learning_rate": 9.997885024840313e-05, + "loss": 5.697198486328125, + "step": 150 + }, + { + "epoch": 0.0227111426543648, + "grad_norm": 4.339130878448486, + "learning_rate": 9.997743080198723e-05, + "loss": 5.63760986328125, + "step": 160 + }, + { + "epoch": 0.0241305890702626, + "grad_norm": 4.891483783721924, + "learning_rate": 9.997601135557133e-05, + "loss": 5.5271728515625, + "step": 170 + }, + { + "epoch": 0.0255500354861604, + "grad_norm": 5.147222995758057, + "learning_rate": 9.997459190915544e-05, + "loss": 5.45938720703125, + "step": 180 + }, + { + "epoch": 0.0269694819020582, + "grad_norm": 5.365755558013916, + "learning_rate": 9.997317246273954e-05, + "loss": 5.355255126953125, + "step": 190 + }, + { + "epoch": 0.028388928317955996, + "grad_norm": 5.888001918792725, + "learning_rate": 9.997175301632365e-05, + "loss": 5.1554931640625, + "step": 200 + }, + { + "epoch": 0.029808374733853796, + "grad_norm": 6.100172996520996, + "learning_rate": 9.997033356990773e-05, + "loss": 5.035284423828125, + "step": 210 + }, + { + "epoch": 0.031227821149751596, + "grad_norm": 6.491486549377441, + "learning_rate": 9.996891412349184e-05, + 
"loss": 4.899530029296875, + "step": 220 + }, + { + "epoch": 0.032647267565649396, + "grad_norm": 6.916806697845459, + "learning_rate": 9.996749467707594e-05, + "loss": 4.851350402832031, + "step": 230 + }, + { + "epoch": 0.034066713981547196, + "grad_norm": 6.837950706481934, + "learning_rate": 9.996607523066005e-05, + "loss": 4.726431274414063, + "step": 240 + }, + { + "epoch": 0.035486160397444996, + "grad_norm": 7.554074287414551, + "learning_rate": 9.996465578424415e-05, + "loss": 4.4839630126953125, + "step": 250 + }, + { + "epoch": 0.036905606813342796, + "grad_norm": 7.574995994567871, + "learning_rate": 9.996323633782825e-05, + "loss": 4.506732177734375, + "step": 260 + }, + { + "epoch": 0.0383250532292406, + "grad_norm": 7.498238563537598, + "learning_rate": 9.996181689141236e-05, + "loss": 4.319998168945313, + "step": 270 + }, + { + "epoch": 0.0397444996451384, + "grad_norm": 7.978142261505127, + "learning_rate": 9.996039744499645e-05, + "loss": 4.214613342285157, + "step": 280 + }, + { + "epoch": 0.0411639460610362, + "grad_norm": 8.194511413574219, + "learning_rate": 9.995897799858057e-05, + "loss": 4.212762451171875, + "step": 290 + }, + { + "epoch": 0.042583392476934, + "grad_norm": 8.136639595031738, + "learning_rate": 9.995755855216466e-05, + "loss": 4.009028625488281, + "step": 300 + }, + { + "epoch": 0.0440028388928318, + "grad_norm": 8.684012413024902, + "learning_rate": 9.995613910574876e-05, + "loss": 3.9817459106445314, + "step": 310 + }, + { + "epoch": 0.0454222853087296, + "grad_norm": 8.888952255249023, + "learning_rate": 9.995471965933286e-05, + "loss": 3.94019775390625, + "step": 320 + }, + { + "epoch": 0.0468417317246274, + "grad_norm": 8.79919719696045, + "learning_rate": 9.995330021291697e-05, + "loss": 3.9265777587890627, + "step": 330 + }, + { + "epoch": 0.0482611781405252, + "grad_norm": 8.571785926818848, + "learning_rate": 9.995188076650107e-05, + "loss": 3.7262115478515625, + "step": 340 + }, + { + "epoch": 0.049680624556423, + 
"grad_norm": 8.640142440795898, + "learning_rate": 9.995046132008518e-05, + "loss": 3.644915771484375, + "step": 350 + }, + { + "epoch": 0.0511000709723208, + "grad_norm": 9.322779655456543, + "learning_rate": 9.994904187366927e-05, + "loss": 3.644049072265625, + "step": 360 + }, + { + "epoch": 0.0525195173882186, + "grad_norm": 8.790424346923828, + "learning_rate": 9.994762242725337e-05, + "loss": 3.4869285583496095, + "step": 370 + }, + { + "epoch": 0.0539389638041164, + "grad_norm": 9.344154357910156, + "learning_rate": 9.994620298083748e-05, + "loss": 3.55142822265625, + "step": 380 + }, + { + "epoch": 0.05535841022001419, + "grad_norm": 8.807840347290039, + "learning_rate": 9.994478353442158e-05, + "loss": 3.4293190002441407, + "step": 390 + }, + { + "epoch": 0.05677785663591199, + "grad_norm": 9.36971378326416, + "learning_rate": 9.994336408800569e-05, + "loss": 3.429082489013672, + "step": 400 + }, + { + "epoch": 0.05819730305180979, + "grad_norm": 9.73521900177002, + "learning_rate": 9.994194464158977e-05, + "loss": 3.408639907836914, + "step": 410 + }, + { + "epoch": 0.05961674946770759, + "grad_norm": 9.646844863891602, + "learning_rate": 9.994052519517389e-05, + "loss": 3.1950119018554686, + "step": 420 + }, + { + "epoch": 0.06103619588360539, + "grad_norm": 9.722207069396973, + "learning_rate": 9.993910574875798e-05, + "loss": 3.4140243530273438, + "step": 430 + }, + { + "epoch": 0.06245564229950319, + "grad_norm": 10.609601020812988, + "learning_rate": 9.99376863023421e-05, + "loss": 3.320109558105469, + "step": 440 + }, + { + "epoch": 0.063875088715401, + "grad_norm": 10.271575927734375, + "learning_rate": 9.993626685592619e-05, + "loss": 3.232251739501953, + "step": 450 + }, + { + "epoch": 0.06529453513129879, + "grad_norm": 9.766585350036621, + "learning_rate": 9.993484740951029e-05, + "loss": 3.149517059326172, + "step": 460 + }, + { + "epoch": 0.0667139815471966, + "grad_norm": 10.358244895935059, + "learning_rate": 9.99334279630944e-05, + "loss": 
3.1863967895507814, + "step": 470 + }, + { + "epoch": 0.06813342796309439, + "grad_norm": 10.473136901855469, + "learning_rate": 9.99320085166785e-05, + "loss": 3.222390365600586, + "step": 480 + }, + { + "epoch": 0.0695528743789922, + "grad_norm": 9.905110359191895, + "learning_rate": 9.993058907026261e-05, + "loss": 3.1823768615722656, + "step": 490 + }, + { + "epoch": 0.07097232079488999, + "grad_norm": 9.858973503112793, + "learning_rate": 9.99291696238467e-05, + "loss": 2.9202560424804687, + "step": 500 + }, + { + "epoch": 0.07097232079488999, + "eval_accuracy": 0.1867489031601704, + "eval_loss": 3.0744524002075195, + "eval_runtime": 31.2289, + "eval_samples_per_second": 503.605, + "eval_steps_per_second": 15.755, + "step": 500 + }, + { + "epoch": 0.0723917672107878, + "grad_norm": 10.224215507507324, + "learning_rate": 9.992775017743082e-05, + "loss": 3.0410499572753906, + "step": 510 + }, + { + "epoch": 0.07381121362668559, + "grad_norm": 9.867650032043457, + "learning_rate": 9.99263307310149e-05, + "loss": 3.116912078857422, + "step": 520 + }, + { + "epoch": 0.07523066004258339, + "grad_norm": 10.343064308166504, + "learning_rate": 9.992491128459901e-05, + "loss": 3.06390266418457, + "step": 530 + }, + { + "epoch": 0.0766501064584812, + "grad_norm": 10.38116455078125, + "learning_rate": 9.992349183818311e-05, + "loss": 2.973680114746094, + "step": 540 + }, + { + "epoch": 0.07806955287437899, + "grad_norm": 10.979643821716309, + "learning_rate": 9.992207239176722e-05, + "loss": 3.0906436920166014, + "step": 550 + }, + { + "epoch": 0.0794889992902768, + "grad_norm": 10.06657886505127, + "learning_rate": 9.992065294535132e-05, + "loss": 3.0091484069824217, + "step": 560 + }, + { + "epoch": 0.08090844570617459, + "grad_norm": 10.663322448730469, + "learning_rate": 9.991923349893541e-05, + "loss": 2.862255859375, + "step": 570 + }, + { + "epoch": 0.0823278921220724, + "grad_norm": 9.277785301208496, + "learning_rate": 9.991781405251952e-05, + "loss": 
2.8638259887695314, + "step": 580 + }, + { + "epoch": 0.08374733853797019, + "grad_norm": 10.807332038879395, + "learning_rate": 9.991639460610362e-05, + "loss": 2.732352066040039, + "step": 590 + }, + { + "epoch": 0.085166784953868, + "grad_norm": 9.970373153686523, + "learning_rate": 9.991497515968773e-05, + "loss": 2.736968231201172, + "step": 600 + }, + { + "epoch": 0.08658623136976579, + "grad_norm": 11.008269309997559, + "learning_rate": 9.991355571327183e-05, + "loss": 2.7735246658325194, + "step": 610 + }, + { + "epoch": 0.0880056777856636, + "grad_norm": 8.758193969726562, + "learning_rate": 9.991213626685593e-05, + "loss": 2.5436214447021483, + "step": 620 + }, + { + "epoch": 0.08942512420156139, + "grad_norm": 11.253259658813477, + "learning_rate": 9.991071682044003e-05, + "loss": 2.748835563659668, + "step": 630 + }, + { + "epoch": 0.0908445706174592, + "grad_norm": 10.979547500610352, + "learning_rate": 9.990929737402414e-05, + "loss": 2.7314834594726562, + "step": 640 + }, + { + "epoch": 0.09226401703335699, + "grad_norm": 11.182887077331543, + "learning_rate": 9.990787792760823e-05, + "loss": 2.645678901672363, + "step": 650 + }, + { + "epoch": 0.0936834634492548, + "grad_norm": 10.636208534240723, + "learning_rate": 9.990645848119234e-05, + "loss": 2.5704013824462892, + "step": 660 + }, + { + "epoch": 0.09510290986515259, + "grad_norm": 10.351170539855957, + "learning_rate": 9.990503903477644e-05, + "loss": 2.5628406524658205, + "step": 670 + }, + { + "epoch": 0.0965223562810504, + "grad_norm": 9.914809226989746, + "learning_rate": 9.990361958836054e-05, + "loss": 2.5872230529785156, + "step": 680 + }, + { + "epoch": 0.09794180269694819, + "grad_norm": 10.839837074279785, + "learning_rate": 9.990220014194465e-05, + "loss": 2.490940475463867, + "step": 690 + }, + { + "epoch": 0.099361249112846, + "grad_norm": 11.259613990783691, + "learning_rate": 9.990078069552875e-05, + "loss": 2.64483585357666, + "step": 700 + }, + { + "epoch": 
0.10078069552874379, + "grad_norm": 11.213078498840332, + "learning_rate": 9.989936124911286e-05, + "loss": 2.5397150039672853, + "step": 710 + }, + { + "epoch": 0.1022001419446416, + "grad_norm": 10.366206169128418, + "learning_rate": 9.989794180269694e-05, + "loss": 2.457781219482422, + "step": 720 + }, + { + "epoch": 0.10361958836053939, + "grad_norm": 11.44458293914795, + "learning_rate": 9.989652235628105e-05, + "loss": 2.5090484619140625, + "step": 730 + }, + { + "epoch": 0.1050390347764372, + "grad_norm": 11.689805030822754, + "learning_rate": 9.989510290986515e-05, + "loss": 2.409171485900879, + "step": 740 + }, + { + "epoch": 0.10645848119233499, + "grad_norm": 10.568279266357422, + "learning_rate": 9.989368346344926e-05, + "loss": 2.3308380126953123, + "step": 750 + }, + { + "epoch": 0.1078779276082328, + "grad_norm": 11.917696952819824, + "learning_rate": 9.989226401703337e-05, + "loss": 2.3733493804931642, + "step": 760 + }, + { + "epoch": 0.10929737402413059, + "grad_norm": 9.960722923278809, + "learning_rate": 9.989098651525906e-05, + "loss": 2.4058095932006838, + "step": 770 + }, + { + "epoch": 0.11071682044002838, + "grad_norm": 11.068999290466309, + "learning_rate": 9.988956706884315e-05, + "loss": 2.4371658325195313, + "step": 780 + }, + { + "epoch": 0.11213626685592619, + "grad_norm": 10.340009689331055, + "learning_rate": 9.988814762242725e-05, + "loss": 2.2587520599365236, + "step": 790 + }, + { + "epoch": 0.11355571327182398, + "grad_norm": 9.941303253173828, + "learning_rate": 9.988672817601136e-05, + "loss": 2.268446350097656, + "step": 800 + }, + { + "epoch": 0.11497515968772179, + "grad_norm": 11.490272521972656, + "learning_rate": 9.988530872959546e-05, + "loss": 2.471067428588867, + "step": 810 + }, + { + "epoch": 0.11639460610361958, + "grad_norm": 10.67241382598877, + "learning_rate": 9.988388928317957e-05, + "loss": 2.3497791290283203, + "step": 820 + }, + { + "epoch": 0.11781405251951739, + "grad_norm": 10.710894584655762, + 
"learning_rate": 9.988246983676367e-05, + "loss": 2.1724626541137697, + "step": 830 + }, + { + "epoch": 0.11923349893541518, + "grad_norm": 10.985452651977539, + "learning_rate": 9.988105039034778e-05, + "loss": 2.1848114013671873, + "step": 840 + }, + { + "epoch": 0.12065294535131299, + "grad_norm": 10.063145637512207, + "learning_rate": 9.987963094393186e-05, + "loss": 2.180558776855469, + "step": 850 + }, + { + "epoch": 0.12207239176721078, + "grad_norm": 11.236614227294922, + "learning_rate": 9.987821149751597e-05, + "loss": 2.282668876647949, + "step": 860 + }, + { + "epoch": 0.12349183818310859, + "grad_norm": 10.98898983001709, + "learning_rate": 9.987679205110007e-05, + "loss": 2.235186767578125, + "step": 870 + }, + { + "epoch": 0.12491128459900638, + "grad_norm": 11.805492401123047, + "learning_rate": 9.987537260468418e-05, + "loss": 2.2264921188354494, + "step": 880 + }, + { + "epoch": 0.1263307310149042, + "grad_norm": 10.717041015625, + "learning_rate": 9.987395315826828e-05, + "loss": 2.1385255813598634, + "step": 890 + }, + { + "epoch": 0.127750177430802, + "grad_norm": 9.613192558288574, + "learning_rate": 9.987253371185238e-05, + "loss": 2.1964336395263673, + "step": 900 + }, + { + "epoch": 0.12916962384669978, + "grad_norm": 10.594833374023438, + "learning_rate": 9.987111426543649e-05, + "loss": 2.050688362121582, + "step": 910 + }, + { + "epoch": 0.13058907026259758, + "grad_norm": 11.596671104431152, + "learning_rate": 9.986969481902059e-05, + "loss": 2.077385139465332, + "step": 920 + }, + { + "epoch": 0.1320085166784954, + "grad_norm": 10.779032707214355, + "learning_rate": 9.98682753726047e-05, + "loss": 2.0280479431152343, + "step": 930 + }, + { + "epoch": 0.1334279630943932, + "grad_norm": 10.522924423217773, + "learning_rate": 9.98668559261888e-05, + "loss": 1.9384689331054688, + "step": 940 + }, + { + "epoch": 0.13484740951029098, + "grad_norm": 9.86844539642334, + "learning_rate": 9.986543647977289e-05, + "loss": 2.0612548828125, + 
"step": 950 + }, + { + "epoch": 0.13626685592618878, + "grad_norm": 12.521405220031738, + "learning_rate": 9.986401703335699e-05, + "loss": 2.139466094970703, + "step": 960 + }, + { + "epoch": 0.1376863023420866, + "grad_norm": 11.292656898498535, + "learning_rate": 9.98625975869411e-05, + "loss": 2.077956199645996, + "step": 970 + }, + { + "epoch": 0.1391057487579844, + "grad_norm": 11.186986923217773, + "learning_rate": 9.98611781405252e-05, + "loss": 2.028730010986328, + "step": 980 + }, + { + "epoch": 0.14052519517388218, + "grad_norm": 10.553022384643555, + "learning_rate": 9.985975869410931e-05, + "loss": 1.9375551223754883, + "step": 990 + }, + { + "epoch": 0.14194464158977999, + "grad_norm": 11.089204788208008, + "learning_rate": 9.98583392476934e-05, + "loss": 2.0689823150634767, + "step": 1000 + }, + { + "epoch": 0.14194464158977999, + "eval_accuracy": 0.42239460799898265, + "eval_loss": 1.9010688066482544, + "eval_runtime": 31.4593, + "eval_samples_per_second": 499.916, + "eval_steps_per_second": 15.639, + "step": 1000 + }, + { + "epoch": 0.1433640880056778, + "grad_norm": 10.988676071166992, + "learning_rate": 9.98569198012775e-05, + "loss": 1.9830604553222657, + "step": 1010 + }, + { + "epoch": 0.1447835344215756, + "grad_norm": 11.2459077835083, + "learning_rate": 9.985550035486161e-05, + "loss": 1.9190074920654296, + "step": 1020 + }, + { + "epoch": 0.14620298083747338, + "grad_norm": 10.437894821166992, + "learning_rate": 9.985408090844571e-05, + "loss": 1.8999460220336915, + "step": 1030 + }, + { + "epoch": 0.14762242725337119, + "grad_norm": 10.94793701171875, + "learning_rate": 9.985266146202982e-05, + "loss": 1.8579456329345703, + "step": 1040 + }, + { + "epoch": 0.149041873669269, + "grad_norm": 11.168233871459961, + "learning_rate": 9.98512420156139e-05, + "loss": 1.8979732513427734, + "step": 1050 + }, + { + "epoch": 0.15046132008516677, + "grad_norm": 10.14195728302002, + "learning_rate": 9.984982256919802e-05, + "loss": 1.7833553314208985, 
+ "step": 1060 + }, + { + "epoch": 0.15188076650106458, + "grad_norm": 9.160737991333008, + "learning_rate": 9.984840312278211e-05, + "loss": 1.8624576568603515, + "step": 1070 + }, + { + "epoch": 0.1533002129169624, + "grad_norm": 11.151049613952637, + "learning_rate": 9.984698367636623e-05, + "loss": 1.8210905075073243, + "step": 1080 + }, + { + "epoch": 0.1547196593328602, + "grad_norm": 10.053725242614746, + "learning_rate": 9.984556422995032e-05, + "loss": 1.7738643646240235, + "step": 1090 + }, + { + "epoch": 0.15613910574875797, + "grad_norm": 10.97727108001709, + "learning_rate": 9.984414478353442e-05, + "loss": 1.866429328918457, + "step": 1100 + }, + { + "epoch": 0.15755855216465578, + "grad_norm": 12.384384155273438, + "learning_rate": 9.984272533711853e-05, + "loss": 1.8680984497070312, + "step": 1110 + }, + { + "epoch": 0.1589779985805536, + "grad_norm": 11.387879371643066, + "learning_rate": 9.984130589070263e-05, + "loss": 1.8034194946289062, + "step": 1120 + }, + { + "epoch": 0.1603974449964514, + "grad_norm": 10.6587495803833, + "learning_rate": 9.983988644428674e-05, + "loss": 1.772690773010254, + "step": 1130 + }, + { + "epoch": 0.16181689141234917, + "grad_norm": 12.721858024597168, + "learning_rate": 9.983846699787084e-05, + "loss": 1.7724496841430664, + "step": 1140 + }, + { + "epoch": 0.16323633782824698, + "grad_norm": 11.116838455200195, + "learning_rate": 9.983704755145493e-05, + "loss": 1.7527042388916017, + "step": 1150 + }, + { + "epoch": 0.1646557842441448, + "grad_norm": 10.033406257629395, + "learning_rate": 9.983562810503903e-05, + "loss": 1.674898338317871, + "step": 1160 + }, + { + "epoch": 0.1660752306600426, + "grad_norm": 11.121773719787598, + "learning_rate": 9.983420865862314e-05, + "loss": 1.741505241394043, + "step": 1170 + }, + { + "epoch": 0.16749467707594037, + "grad_norm": 11.052094459533691, + "learning_rate": 9.983278921220724e-05, + "loss": 1.7749841690063477, + "step": 1180 + }, + { + "epoch": 0.16891412349183818, + 
"grad_norm": 10.183452606201172, + "learning_rate": 9.983136976579135e-05, + "loss": 1.6881484985351562, + "step": 1190 + }, + { + "epoch": 0.170333569907736, + "grad_norm": 11.106999397277832, + "learning_rate": 9.982995031937545e-05, + "loss": 1.814961051940918, + "step": 1200 + }, + { + "epoch": 0.1717530163236338, + "grad_norm": 12.08647632598877, + "learning_rate": 9.982853087295955e-05, + "loss": 1.682515525817871, + "step": 1210 + }, + { + "epoch": 0.17317246273953157, + "grad_norm": 13.744584083557129, + "learning_rate": 9.982711142654366e-05, + "loss": 1.6713733673095703, + "step": 1220 + }, + { + "epoch": 0.17459190915542938, + "grad_norm": 9.970173835754395, + "learning_rate": 9.982569198012775e-05, + "loss": 1.711156463623047, + "step": 1230 + }, + { + "epoch": 0.1760113555713272, + "grad_norm": 11.027495384216309, + "learning_rate": 9.982427253371186e-05, + "loss": 1.759619140625, + "step": 1240 + }, + { + "epoch": 0.177430801987225, + "grad_norm": 10.876315116882324, + "learning_rate": 9.982285308729596e-05, + "loss": 1.618482780456543, + "step": 1250 + }, + { + "epoch": 0.17885024840312277, + "grad_norm": 10.26490592956543, + "learning_rate": 9.982143364088006e-05, + "loss": 1.6674427032470702, + "step": 1260 + }, + { + "epoch": 0.18026969481902058, + "grad_norm": 11.872292518615723, + "learning_rate": 9.982001419446416e-05, + "loss": 1.6325908660888673, + "step": 1270 + }, + { + "epoch": 0.1816891412349184, + "grad_norm": 9.946234703063965, + "learning_rate": 9.981859474804827e-05, + "loss": 1.5453743934631348, + "step": 1280 + }, + { + "epoch": 0.18310858765081617, + "grad_norm": 11.03128719329834, + "learning_rate": 9.981717530163236e-05, + "loss": 1.658684539794922, + "step": 1290 + }, + { + "epoch": 0.18452803406671398, + "grad_norm": 12.145915031433105, + "learning_rate": 9.981575585521648e-05, + "loss": 1.5792274475097656, + "step": 1300 + }, + { + "epoch": 0.18594748048261178, + "grad_norm": 11.820379257202148, + "learning_rate": 
9.981433640880057e-05, + "loss": 1.5301803588867187, + "step": 1310 + }, + { + "epoch": 0.1873669268985096, + "grad_norm": 11.046746253967285, + "learning_rate": 9.981291696238467e-05, + "loss": 1.6124080657958983, + "step": 1320 + }, + { + "epoch": 0.18878637331440737, + "grad_norm": 9.545868873596191, + "learning_rate": 9.981149751596878e-05, + "loss": 1.5502593994140625, + "step": 1330 + }, + { + "epoch": 0.19020581973030518, + "grad_norm": 11.999979019165039, + "learning_rate": 9.981007806955288e-05, + "loss": 1.5360203742980958, + "step": 1340 + }, + { + "epoch": 0.19162526614620298, + "grad_norm": 9.949675559997559, + "learning_rate": 9.980865862313699e-05, + "loss": 1.353858470916748, + "step": 1350 + }, + { + "epoch": 0.1930447125621008, + "grad_norm": 11.573400497436523, + "learning_rate": 9.980723917672107e-05, + "loss": 1.3946660995483398, + "step": 1360 + }, + { + "epoch": 0.19446415897799857, + "grad_norm": 10.249485969543457, + "learning_rate": 9.980581973030518e-05, + "loss": 1.518262004852295, + "step": 1370 + }, + { + "epoch": 0.19588360539389638, + "grad_norm": 10.011629104614258, + "learning_rate": 9.980440028388928e-05, + "loss": 1.5000194549560546, + "step": 1380 + }, + { + "epoch": 0.19730305180979418, + "grad_norm": 12.186440467834473, + "learning_rate": 9.980298083747339e-05, + "loss": 1.554741382598877, + "step": 1390 + }, + { + "epoch": 0.198722498225692, + "grad_norm": 11.845844268798828, + "learning_rate": 9.980156139105749e-05, + "loss": 1.4599843978881837, + "step": 1400 + }, + { + "epoch": 0.20014194464158977, + "grad_norm": 10.98592472076416, + "learning_rate": 9.980014194464159e-05, + "loss": 1.4062080383300781, + "step": 1410 + }, + { + "epoch": 0.20156139105748758, + "grad_norm": 11.54171371459961, + "learning_rate": 9.97987224982257e-05, + "loss": 1.5128003120422364, + "step": 1420 + }, + { + "epoch": 0.20298083747338538, + "grad_norm": 10.248682022094727, + "learning_rate": 9.97973030518098e-05, + "loss": 1.5022719383239747, + 
"step": 1430 + }, + { + "epoch": 0.2044002838892832, + "grad_norm": 8.78536319732666, + "learning_rate": 9.97958836053939e-05, + "loss": 1.4118841171264649, + "step": 1440 + }, + { + "epoch": 0.20581973030518097, + "grad_norm": 9.993626594543457, + "learning_rate": 9.9794464158978e-05, + "loss": 1.3945957183837892, + "step": 1450 + }, + { + "epoch": 0.20723917672107878, + "grad_norm": 11.31412124633789, + "learning_rate": 9.97930447125621e-05, + "loss": 1.26229887008667, + "step": 1460 + }, + { + "epoch": 0.20865862313697658, + "grad_norm": 11.182840347290039, + "learning_rate": 9.97916252661462e-05, + "loss": 1.3171740531921388, + "step": 1470 + }, + { + "epoch": 0.2100780695528744, + "grad_norm": 12.25224781036377, + "learning_rate": 9.979020581973031e-05, + "loss": 1.3310781478881837, + "step": 1480 + }, + { + "epoch": 0.21149751596877217, + "grad_norm": 11.81201457977295, + "learning_rate": 9.978878637331441e-05, + "loss": 1.3043070793151856, + "step": 1490 + }, + { + "epoch": 0.21291696238466998, + "grad_norm": 10.484480857849121, + "learning_rate": 9.978736692689852e-05, + "loss": 1.2629288673400878, + "step": 1500 + }, + { + "epoch": 0.21291696238466998, + "eval_accuracy": 0.5395180263241559, + "eval_loss": 1.438815712928772, + "eval_runtime": 32.1456, + "eval_samples_per_second": 489.242, + "eval_steps_per_second": 15.305, + "step": 1500 + }, + { + "epoch": 0.21433640880056778, + "grad_norm": 10.796157836914062, + "learning_rate": 9.978594748048262e-05, + "loss": 1.3752121925354004, + "step": 1510 + }, + { + "epoch": 0.2157558552164656, + "grad_norm": 10.1256742477417, + "learning_rate": 9.978452803406671e-05, + "loss": 1.3005435943603516, + "step": 1520 + }, + { + "epoch": 0.21717530163236337, + "grad_norm": 11.182530403137207, + "learning_rate": 9.978310858765082e-05, + "loss": 1.3048934936523438, + "step": 1530 + }, + { + "epoch": 0.21859474804826118, + "grad_norm": 10.190278053283691, + "learning_rate": 9.978168914123492e-05, + "loss": 
1.3993605613708495, + "step": 1540 + }, + { + "epoch": 0.22001419446415899, + "grad_norm": 10.497735977172852, + "learning_rate": 9.978026969481903e-05, + "loss": 1.303945541381836, + "step": 1550 + }, + { + "epoch": 0.22143364088005676, + "grad_norm": 10.535606384277344, + "learning_rate": 9.977885024840313e-05, + "loss": 1.2210904121398927, + "step": 1560 + }, + { + "epoch": 0.22285308729595457, + "grad_norm": 11.385029792785645, + "learning_rate": 9.977743080198723e-05, + "loss": 1.3508376121520995, + "step": 1570 + }, + { + "epoch": 0.22427253371185238, + "grad_norm": 9.528643608093262, + "learning_rate": 9.977601135557132e-05, + "loss": 1.2278815269470216, + "step": 1580 + }, + { + "epoch": 0.22569198012775019, + "grad_norm": 13.161009788513184, + "learning_rate": 9.977459190915544e-05, + "loss": 1.254448413848877, + "step": 1590 + }, + { + "epoch": 0.22711142654364797, + "grad_norm": 11.288809776306152, + "learning_rate": 9.977317246273953e-05, + "loss": 1.271047878265381, + "step": 1600 + }, + { + "epoch": 0.22853087295954577, + "grad_norm": 11.30105209350586, + "learning_rate": 9.977175301632364e-05, + "loss": 1.3242988586425781, + "step": 1610 + }, + { + "epoch": 0.22995031937544358, + "grad_norm": 10.600774765014648, + "learning_rate": 9.977033356990774e-05, + "loss": 1.3170942306518554, + "step": 1620 + }, + { + "epoch": 0.2313697657913414, + "grad_norm": 10.652543067932129, + "learning_rate": 9.976891412349184e-05, + "loss": 1.3998719215393067, + "step": 1630 + }, + { + "epoch": 0.23278921220723917, + "grad_norm": 11.354793548583984, + "learning_rate": 9.976749467707595e-05, + "loss": 1.270443820953369, + "step": 1640 + }, + { + "epoch": 0.23420865862313697, + "grad_norm": 9.926568031311035, + "learning_rate": 9.976607523066005e-05, + "loss": 1.117215347290039, + "step": 1650 + }, + { + "epoch": 0.23562810503903478, + "grad_norm": 11.167335510253906, + "learning_rate": 9.976465578424416e-05, + "loss": 1.348717212677002, + "step": 1660 + }, + { + 
"epoch": 0.2370475514549326, + "grad_norm": 11.364425659179688, + "learning_rate": 9.976323633782824e-05, + "loss": 1.2113998413085938, + "step": 1670 + }, + { + "epoch": 0.23846699787083037, + "grad_norm": 10.315034866333008, + "learning_rate": 9.976181689141235e-05, + "loss": 1.2621678352355956, + "step": 1680 + }, + { + "epoch": 0.23988644428672817, + "grad_norm": 11.332146644592285, + "learning_rate": 9.976039744499645e-05, + "loss": 1.2919418334960937, + "step": 1690 + }, + { + "epoch": 0.24130589070262598, + "grad_norm": 9.863037109375, + "learning_rate": 9.975897799858056e-05, + "loss": 1.262222957611084, + "step": 1700 + }, + { + "epoch": 0.2427253371185238, + "grad_norm": 13.898163795471191, + "learning_rate": 9.975755855216467e-05, + "loss": 1.349098300933838, + "step": 1710 + }, + { + "epoch": 0.24414478353442157, + "grad_norm": 9.008386611938477, + "learning_rate": 9.975613910574876e-05, + "loss": 1.1653017044067382, + "step": 1720 + }, + { + "epoch": 0.24556422995031937, + "grad_norm": 9.755669593811035, + "learning_rate": 9.975471965933287e-05, + "loss": 1.304057788848877, + "step": 1730 + }, + { + "epoch": 0.24698367636621718, + "grad_norm": 10.742278099060059, + "learning_rate": 9.975330021291696e-05, + "loss": 1.1656038284301757, + "step": 1740 + }, + { + "epoch": 0.248403122782115, + "grad_norm": 11.937880516052246, + "learning_rate": 9.975188076650107e-05, + "loss": 1.2565963745117188, + "step": 1750 + }, + { + "epoch": 0.24982256919801277, + "grad_norm": 9.80545711517334, + "learning_rate": 9.975046132008517e-05, + "loss": 1.1316876411437988, + "step": 1760 + }, + { + "epoch": 0.2512420156139106, + "grad_norm": 11.162557601928711, + "learning_rate": 9.974904187366927e-05, + "loss": 1.2094581604003907, + "step": 1770 + }, + { + "epoch": 0.2526614620298084, + "grad_norm": 12.278450965881348, + "learning_rate": 9.974762242725337e-05, + "loss": 1.2499947547912598, + "step": 1780 + }, + { + "epoch": 0.2540809084457062, + "grad_norm": 
10.95953369140625, + "learning_rate": 9.974620298083748e-05, + "loss": 1.1540046691894532, + "step": 1790 + }, + { + "epoch": 0.255500354861604, + "grad_norm": 7.865696430206299, + "learning_rate": 9.974478353442159e-05, + "loss": 1.1665989875793457, + "step": 1800 + }, + { + "epoch": 0.25691980127750175, + "grad_norm": 12.1609468460083, + "learning_rate": 9.974336408800569e-05, + "loss": 1.120746898651123, + "step": 1810 + }, + { + "epoch": 0.25833924769339955, + "grad_norm": 9.554359436035156, + "learning_rate": 9.974194464158978e-05, + "loss": 1.3381189346313476, + "step": 1820 + }, + { + "epoch": 0.25975869410929736, + "grad_norm": 9.497129440307617, + "learning_rate": 9.974052519517388e-05, + "loss": 1.1758546829223633, + "step": 1830 + }, + { + "epoch": 0.26117814052519517, + "grad_norm": 10.584992408752441, + "learning_rate": 9.973910574875799e-05, + "loss": 1.0787659645080567, + "step": 1840 + }, + { + "epoch": 0.262597586941093, + "grad_norm": 9.558980941772461, + "learning_rate": 9.973768630234209e-05, + "loss": 0.9334567070007325, + "step": 1850 + }, + { + "epoch": 0.2640170333569908, + "grad_norm": 9.41112995147705, + "learning_rate": 9.97362668559262e-05, + "loss": 1.1376053810119628, + "step": 1860 + }, + { + "epoch": 0.2654364797728886, + "grad_norm": 11.666831970214844, + "learning_rate": 9.973484740951028e-05, + "loss": 1.207914447784424, + "step": 1870 + }, + { + "epoch": 0.2668559261887864, + "grad_norm": 11.217955589294434, + "learning_rate": 9.97334279630944e-05, + "loss": 1.052849578857422, + "step": 1880 + }, + { + "epoch": 0.26827537260468415, + "grad_norm": 8.3615083694458, + "learning_rate": 9.97320085166785e-05, + "loss": 0.9782976150512696, + "step": 1890 + }, + { + "epoch": 0.26969481902058196, + "grad_norm": 10.69944953918457, + "learning_rate": 9.97305890702626e-05, + "loss": 0.9639101982116699, + "step": 1900 + }, + { + "epoch": 0.27111426543647976, + "grad_norm": 11.15194034576416, + "learning_rate": 9.972916962384671e-05, + "loss": 
1.0744239807128906, + "step": 1910 + }, + { + "epoch": 0.27253371185237757, + "grad_norm": 10.363690376281738, + "learning_rate": 9.972775017743081e-05, + "loss": 1.1180108070373536, + "step": 1920 + }, + { + "epoch": 0.2739531582682754, + "grad_norm": 10.816513061523438, + "learning_rate": 9.972633073101491e-05, + "loss": 1.118791103363037, + "step": 1930 + }, + { + "epoch": 0.2753726046841732, + "grad_norm": 8.64388656616211, + "learning_rate": 9.9724911284599e-05, + "loss": 1.1368459701538085, + "step": 1940 + }, + { + "epoch": 0.276792051100071, + "grad_norm": 9.002252578735352, + "learning_rate": 9.972349183818312e-05, + "loss": 1.1344121932983398, + "step": 1950 + }, + { + "epoch": 0.2782114975159688, + "grad_norm": 11.083386421203613, + "learning_rate": 9.972207239176721e-05, + "loss": 1.1827295303344727, + "step": 1960 + }, + { + "epoch": 0.27963094393186655, + "grad_norm": 8.360145568847656, + "learning_rate": 9.972065294535133e-05, + "loss": 0.9954969406127929, + "step": 1970 + }, + { + "epoch": 0.28105039034776436, + "grad_norm": 12.982026100158691, + "learning_rate": 9.971923349893542e-05, + "loss": 0.9865982055664062, + "step": 1980 + }, + { + "epoch": 0.28246983676366216, + "grad_norm": 9.3854341506958, + "learning_rate": 9.971781405251952e-05, + "loss": 0.9238475799560547, + "step": 1990 + }, + { + "epoch": 0.28388928317955997, + "grad_norm": 10.693597793579102, + "learning_rate": 9.971639460610363e-05, + "loss": 0.9660484313964843, + "step": 2000 + }, + { + "epoch": 0.28388928317955997, + "eval_accuracy": 0.6596935206968907, + "eval_loss": 1.0827350616455078, + "eval_runtime": 31.44, + "eval_samples_per_second": 500.222, + "eval_steps_per_second": 15.649, + "step": 2000 + }, + { + "epoch": 0.2853087295954578, + "grad_norm": 11.403952598571777, + "learning_rate": 9.971497515968773e-05, + "loss": 0.987885856628418, + "step": 2010 + }, + { + "epoch": 0.2867281760113556, + "grad_norm": 11.068461418151855, + "learning_rate": 9.971355571327184e-05, + 
"loss": 1.0180384635925293, + "step": 2020 + }, + { + "epoch": 0.2881476224272534, + "grad_norm": 10.536505699157715, + "learning_rate": 9.971213626685592e-05, + "loss": 1.0148059844970703, + "step": 2030 + }, + { + "epoch": 0.2895670688431512, + "grad_norm": 9.358129501342773, + "learning_rate": 9.971071682044003e-05, + "loss": 0.9920819282531739, + "step": 2040 + }, + { + "epoch": 0.29098651525904895, + "grad_norm": 10.33521842956543, + "learning_rate": 9.970929737402413e-05, + "loss": 1.0010162353515626, + "step": 2050 + }, + { + "epoch": 0.29240596167494676, + "grad_norm": 10.490190505981445, + "learning_rate": 9.970787792760824e-05, + "loss": 0.9781021118164063, + "step": 2060 + }, + { + "epoch": 0.29382540809084456, + "grad_norm": 9.507524490356445, + "learning_rate": 9.970645848119234e-05, + "loss": 0.9722440719604493, + "step": 2070 + }, + { + "epoch": 0.29524485450674237, + "grad_norm": 10.77835464477539, + "learning_rate": 9.970503903477644e-05, + "loss": 0.9851055145263672, + "step": 2080 + }, + { + "epoch": 0.2966643009226402, + "grad_norm": 9.847874641418457, + "learning_rate": 9.970361958836055e-05, + "loss": 0.8958380699157715, + "step": 2090 + }, + { + "epoch": 0.298083747338538, + "grad_norm": 11.703569412231445, + "learning_rate": 9.970220014194465e-05, + "loss": 0.9267073631286621, + "step": 2100 + }, + { + "epoch": 0.2995031937544358, + "grad_norm": 6.974740028381348, + "learning_rate": 9.970078069552876e-05, + "loss": 0.787592887878418, + "step": 2110 + }, + { + "epoch": 0.30092264017033354, + "grad_norm": 5.989770889282227, + "learning_rate": 9.969936124911285e-05, + "loss": 0.8309663772583008, + "step": 2120 + }, + { + "epoch": 0.30234208658623135, + "grad_norm": 8.477362632751465, + "learning_rate": 9.969794180269695e-05, + "loss": 0.8926510810852051, + "step": 2130 + }, + { + "epoch": 0.30376153300212916, + "grad_norm": 8.412622451782227, + "learning_rate": 9.969652235628105e-05, + "loss": 0.8592344284057617, + "step": 2140 + }, + { + 
"epoch": 0.30518097941802697, + "grad_norm": 10.356178283691406, + "learning_rate": 9.969510290986516e-05, + "loss": 0.8827583312988281, + "step": 2150 + }, + { + "epoch": 0.3066004258339248, + "grad_norm": 7.666086673736572, + "learning_rate": 9.969368346344926e-05, + "loss": 0.9427967071533203, + "step": 2160 + }, + { + "epoch": 0.3080198722498226, + "grad_norm": 11.2577486038208, + "learning_rate": 9.969226401703337e-05, + "loss": 0.8580154418945313, + "step": 2170 + }, + { + "epoch": 0.3094393186657204, + "grad_norm": 10.915003776550293, + "learning_rate": 9.969084457061746e-05, + "loss": 0.9019613265991211, + "step": 2180 + }, + { + "epoch": 0.3108587650816182, + "grad_norm": 9.683639526367188, + "learning_rate": 9.968942512420156e-05, + "loss": 0.8982448577880859, + "step": 2190 + }, + { + "epoch": 0.31227821149751595, + "grad_norm": 8.5520601272583, + "learning_rate": 9.968800567778567e-05, + "loss": 0.7967979431152343, + "step": 2200 + }, + { + "epoch": 0.31369765791341375, + "grad_norm": 11.931614875793457, + "learning_rate": 9.968658623136977e-05, + "loss": 0.9725972175598144, + "step": 2210 + }, + { + "epoch": 0.31511710432931156, + "grad_norm": 11.004504203796387, + "learning_rate": 9.968516678495388e-05, + "loss": 0.9189895629882813, + "step": 2220 + }, + { + "epoch": 0.31653655074520937, + "grad_norm": 9.460184097290039, + "learning_rate": 9.968374733853797e-05, + "loss": 0.9253165245056152, + "step": 2230 + }, + { + "epoch": 0.3179559971611072, + "grad_norm": 9.675958633422852, + "learning_rate": 9.968232789212208e-05, + "loss": 0.7977495670318604, + "step": 2240 + }, + { + "epoch": 0.319375443577005, + "grad_norm": 8.858159065246582, + "learning_rate": 9.968090844570617e-05, + "loss": 0.9056186676025391, + "step": 2250 + }, + { + "epoch": 0.3207948899929028, + "grad_norm": 10.144878387451172, + "learning_rate": 9.967948899929028e-05, + "loss": 0.9273244857788085, + "step": 2260 + }, + { + "epoch": 0.3222143364088006, + "grad_norm": 9.78799819946289, 
+ "learning_rate": 9.967806955287438e-05, + "loss": 0.8192936897277832, + "step": 2270 + }, + { + "epoch": 0.32363378282469835, + "grad_norm": 8.891179084777832, + "learning_rate": 9.967665010645849e-05, + "loss": 0.8823507308959961, + "step": 2280 + }, + { + "epoch": 0.32505322924059615, + "grad_norm": 9.303411483764648, + "learning_rate": 9.967523066004259e-05, + "loss": 0.8374591827392578, + "step": 2290 + }, + { + "epoch": 0.32647267565649396, + "grad_norm": 8.408880233764648, + "learning_rate": 9.967381121362669e-05, + "loss": 0.849891471862793, + "step": 2300 + }, + { + "epoch": 0.32789212207239177, + "grad_norm": 9.384819030761719, + "learning_rate": 9.96723917672108e-05, + "loss": 0.7750972747802735, + "step": 2310 + }, + { + "epoch": 0.3293115684882896, + "grad_norm": 9.170500755310059, + "learning_rate": 9.96709723207949e-05, + "loss": 0.7687624454498291, + "step": 2320 + }, + { + "epoch": 0.3307310149041874, + "grad_norm": 8.488929748535156, + "learning_rate": 9.966955287437901e-05, + "loss": 0.8498885154724121, + "step": 2330 + }, + { + "epoch": 0.3321504613200852, + "grad_norm": 10.291971206665039, + "learning_rate": 9.966813342796309e-05, + "loss": 0.7807302474975586, + "step": 2340 + }, + { + "epoch": 0.33356990773598294, + "grad_norm": 11.644806861877441, + "learning_rate": 9.96667139815472e-05, + "loss": 0.7917065143585205, + "step": 2350 + }, + { + "epoch": 0.33498935415188075, + "grad_norm": 13.938374519348145, + "learning_rate": 9.96652945351313e-05, + "loss": 0.8718063354492187, + "step": 2360 + }, + { + "epoch": 0.33640880056777855, + "grad_norm": 10.399706840515137, + "learning_rate": 9.966387508871541e-05, + "loss": 0.8703582763671875, + "step": 2370 + }, + { + "epoch": 0.33782824698367636, + "grad_norm": 7.870115756988525, + "learning_rate": 9.966245564229951e-05, + "loss": 0.8549924850463867, + "step": 2380 + }, + { + "epoch": 0.33924769339957417, + "grad_norm": 9.777918815612793, + "learning_rate": 9.96610361958836e-05, + "loss": 
1.0234166145324708, + "step": 2390 + }, + { + "epoch": 0.340667139815472, + "grad_norm": 10.103452682495117, + "learning_rate": 9.965961674946772e-05, + "loss": 0.9040670394897461, + "step": 2400 + }, + { + "epoch": 0.3420865862313698, + "grad_norm": 10.497400283813477, + "learning_rate": 9.965819730305181e-05, + "loss": 0.7955552577972412, + "step": 2410 + }, + { + "epoch": 0.3435060326472676, + "grad_norm": 8.0149564743042, + "learning_rate": 9.965677785663592e-05, + "loss": 0.856791877746582, + "step": 2420 + }, + { + "epoch": 0.34492547906316534, + "grad_norm": 8.111480712890625, + "learning_rate": 9.965535841022002e-05, + "loss": 0.8129085540771485, + "step": 2430 + }, + { + "epoch": 0.34634492547906315, + "grad_norm": 7.93813419342041, + "learning_rate": 9.965393896380412e-05, + "loss": 0.809941291809082, + "step": 2440 + }, + { + "epoch": 0.34776437189496096, + "grad_norm": 10.88427448272705, + "learning_rate": 9.965251951738822e-05, + "loss": 0.7622882843017578, + "step": 2450 + }, + { + "epoch": 0.34918381831085876, + "grad_norm": 9.509648323059082, + "learning_rate": 9.965110007097233e-05, + "loss": 0.7235064029693603, + "step": 2460 + }, + { + "epoch": 0.35060326472675657, + "grad_norm": 10.343646049499512, + "learning_rate": 9.964968062455642e-05, + "loss": 0.7792426586151123, + "step": 2470 + }, + { + "epoch": 0.3520227111426544, + "grad_norm": 11.936261177062988, + "learning_rate": 9.964826117814054e-05, + "loss": 0.8023401260375976, + "step": 2480 + }, + { + "epoch": 0.3534421575585522, + "grad_norm": 8.382633209228516, + "learning_rate": 9.964684173172463e-05, + "loss": 0.7960898399353027, + "step": 2490 + }, + { + "epoch": 0.35486160397445, + "grad_norm": 11.01586627960205, + "learning_rate": 9.964542228530873e-05, + "loss": 0.7975746631622315, + "step": 2500 + }, + { + "epoch": 0.35486160397445, + "eval_accuracy": 0.730527118967381, + "eval_loss": 0.8166059255599976, + "eval_runtime": 32.3274, + "eval_samples_per_second": 486.491, + 
"eval_steps_per_second": 15.219, + "step": 2500 + }, + { + "epoch": 0.35628105039034774, + "grad_norm": 8.113981246948242, + "learning_rate": 9.964400283889284e-05, + "loss": 0.7863178253173828, + "step": 2510 + }, + { + "epoch": 0.35770049680624555, + "grad_norm": 9.127975463867188, + "learning_rate": 9.964258339247694e-05, + "loss": 0.8487259864807128, + "step": 2520 + }, + { + "epoch": 0.35911994322214336, + "grad_norm": 8.597822189331055, + "learning_rate": 9.964116394606105e-05, + "loss": 0.8151129722595215, + "step": 2530 + }, + { + "epoch": 0.36053938963804116, + "grad_norm": 8.069273948669434, + "learning_rate": 9.963974449964513e-05, + "loss": 0.6664574623107911, + "step": 2540 + }, + { + "epoch": 0.36195883605393897, + "grad_norm": 8.314419746398926, + "learning_rate": 9.963832505322924e-05, + "loss": 0.8365516662597656, + "step": 2550 + }, + { + "epoch": 0.3633782824698368, + "grad_norm": 9.172304153442383, + "learning_rate": 9.963690560681334e-05, + "loss": 0.7865428924560547, + "step": 2560 + }, + { + "epoch": 0.3647977288857346, + "grad_norm": 9.639200210571289, + "learning_rate": 9.963548616039745e-05, + "loss": 0.7925633430480957, + "step": 2570 + }, + { + "epoch": 0.36621717530163234, + "grad_norm": 8.856132507324219, + "learning_rate": 9.963406671398155e-05, + "loss": 0.7005198955535888, + "step": 2580 + }, + { + "epoch": 0.36763662171753014, + "grad_norm": 7.9700422286987305, + "learning_rate": 9.963264726756566e-05, + "loss": 0.6712905883789062, + "step": 2590 + }, + { + "epoch": 0.36905606813342795, + "grad_norm": 9.465399742126465, + "learning_rate": 9.963122782114976e-05, + "loss": 0.7288703441619873, + "step": 2600 + }, + { + "epoch": 0.37047551454932576, + "grad_norm": 8.769003868103027, + "learning_rate": 9.962980837473386e-05, + "loss": 0.7671696662902832, + "step": 2610 + }, + { + "epoch": 0.37189496096522356, + "grad_norm": 6.981420040130615, + "learning_rate": 9.962838892831797e-05, + "loss": 0.6548487663269043, + "step": 2620 + }, + { 
+ "epoch": 0.37331440738112137, + "grad_norm": 8.440009117126465, + "learning_rate": 9.962696948190206e-05, + "loss": 0.705223274230957, + "step": 2630 + }, + { + "epoch": 0.3747338537970192, + "grad_norm": 12.392814636230469, + "learning_rate": 9.962555003548617e-05, + "loss": 0.8219353675842285, + "step": 2640 + }, + { + "epoch": 0.376153300212917, + "grad_norm": 9.1260404586792, + "learning_rate": 9.962413058907026e-05, + "loss": 0.7202134132385254, + "step": 2650 + }, + { + "epoch": 0.37757274662881474, + "grad_norm": 9.437945365905762, + "learning_rate": 9.962271114265437e-05, + "loss": 0.7196836471557617, + "step": 2660 + }, + { + "epoch": 0.37899219304471254, + "grad_norm": 8.03176212310791, + "learning_rate": 9.962129169623847e-05, + "loss": 0.6017679214477539, + "step": 2670 + }, + { + "epoch": 0.38041163946061035, + "grad_norm": 11.21246337890625, + "learning_rate": 9.961987224982258e-05, + "loss": 0.8073585510253907, + "step": 2680 + }, + { + "epoch": 0.38183108587650816, + "grad_norm": 8.937601089477539, + "learning_rate": 9.961845280340667e-05, + "loss": 0.5765426635742188, + "step": 2690 + }, + { + "epoch": 0.38325053229240597, + "grad_norm": 10.750785827636719, + "learning_rate": 9.961703335699077e-05, + "loss": 0.850700569152832, + "step": 2700 + }, + { + "epoch": 0.3846699787083038, + "grad_norm": 8.476407051086426, + "learning_rate": 9.961561391057488e-05, + "loss": 0.6841172695159912, + "step": 2710 + }, + { + "epoch": 0.3860894251242016, + "grad_norm": 8.174555778503418, + "learning_rate": 9.961419446415898e-05, + "loss": 0.6521795272827149, + "step": 2720 + }, + { + "epoch": 0.3875088715400994, + "grad_norm": 6.744903564453125, + "learning_rate": 9.961277501774309e-05, + "loss": 0.7194175720214844, + "step": 2730 + }, + { + "epoch": 0.38892831795599714, + "grad_norm": 7.107284069061279, + "learning_rate": 9.961135557132719e-05, + "loss": 0.7437104701995849, + "step": 2740 + }, + { + "epoch": 0.39034776437189495, + "grad_norm": 
12.026649475097656, + "learning_rate": 9.960993612491129e-05, + "loss": 0.7481307029724121, + "step": 2750 + }, + { + "epoch": 0.39176721078779275, + "grad_norm": 10.131022453308105, + "learning_rate": 9.960851667849538e-05, + "loss": 0.6669661521911621, + "step": 2760 + }, + { + "epoch": 0.39318665720369056, + "grad_norm": 7.589590072631836, + "learning_rate": 9.96070972320795e-05, + "loss": 0.5883037567138671, + "step": 2770 + }, + { + "epoch": 0.39460610361958837, + "grad_norm": 8.32777214050293, + "learning_rate": 9.960567778566359e-05, + "loss": 0.6772464752197266, + "step": 2780 + }, + { + "epoch": 0.3960255500354862, + "grad_norm": 6.111226558685303, + "learning_rate": 9.96042583392477e-05, + "loss": 0.6943521976470948, + "step": 2790 + }, + { + "epoch": 0.397444996451384, + "grad_norm": 10.40073299407959, + "learning_rate": 9.96028388928318e-05, + "loss": 0.6597262382507324, + "step": 2800 + }, + { + "epoch": 0.3988644428672818, + "grad_norm": 11.990081787109375, + "learning_rate": 9.96014194464159e-05, + "loss": 0.6846660614013672, + "step": 2810 + }, + { + "epoch": 0.40028388928317954, + "grad_norm": 7.820896625518799, + "learning_rate": 9.960000000000001e-05, + "loss": 0.5972445487976075, + "step": 2820 + }, + { + "epoch": 0.40170333569907735, + "grad_norm": 9.078740119934082, + "learning_rate": 9.95985805535841e-05, + "loss": 0.7440935611724854, + "step": 2830 + }, + { + "epoch": 0.40312278211497515, + "grad_norm": 8.869423866271973, + "learning_rate": 9.959716110716822e-05, + "loss": 0.7406916141510009, + "step": 2840 + }, + { + "epoch": 0.40454222853087296, + "grad_norm": 9.250556945800781, + "learning_rate": 9.95957416607523e-05, + "loss": 0.6444163799285889, + "step": 2850 + }, + { + "epoch": 0.40596167494677077, + "grad_norm": 12.534906387329102, + "learning_rate": 9.959432221433641e-05, + "loss": 0.7008297920227051, + "step": 2860 + }, + { + "epoch": 0.4073811213626686, + "grad_norm": 10.320120811462402, + "learning_rate": 9.959290276792051e-05, + 
"loss": 0.6261786460876465, + "step": 2870 + }, + { + "epoch": 0.4088005677785664, + "grad_norm": 7.483973979949951, + "learning_rate": 9.959148332150462e-05, + "loss": 0.6434149742126465, + "step": 2880 + }, + { + "epoch": 0.41022001419446413, + "grad_norm": 9.007946014404297, + "learning_rate": 9.959006387508872e-05, + "loss": 0.6796345233917236, + "step": 2890 + }, + { + "epoch": 0.41163946061036194, + "grad_norm": 8.191641807556152, + "learning_rate": 9.958864442867281e-05, + "loss": 0.5003190994262695, + "step": 2900 + }, + { + "epoch": 0.41305890702625975, + "grad_norm": 9.307744979858398, + "learning_rate": 9.958722498225693e-05, + "loss": 0.6988365173339843, + "step": 2910 + }, + { + "epoch": 0.41447835344215755, + "grad_norm": 6.16031551361084, + "learning_rate": 9.958580553584102e-05, + "loss": 0.6487136840820312, + "step": 2920 + }, + { + "epoch": 0.41589779985805536, + "grad_norm": 9.785910606384277, + "learning_rate": 9.958438608942513e-05, + "loss": 0.6544306755065918, + "step": 2930 + }, + { + "epoch": 0.41731724627395317, + "grad_norm": 12.08917236328125, + "learning_rate": 9.958296664300923e-05, + "loss": 0.6119012832641602, + "step": 2940 + }, + { + "epoch": 0.418736692689851, + "grad_norm": 10.118932723999023, + "learning_rate": 9.958154719659334e-05, + "loss": 0.5515688896179199, + "step": 2950 + }, + { + "epoch": 0.4201561391057488, + "grad_norm": 10.645463943481445, + "learning_rate": 9.958012775017743e-05, + "loss": 0.6795665740966796, + "step": 2960 + }, + { + "epoch": 0.42157558552164653, + "grad_norm": 8.745086669921875, + "learning_rate": 9.957870830376154e-05, + "loss": 0.6612170219421387, + "step": 2970 + }, + { + "epoch": 0.42299503193754434, + "grad_norm": 7.060173511505127, + "learning_rate": 9.957728885734563e-05, + "loss": 0.635819387435913, + "step": 2980 + }, + { + "epoch": 0.42441447835344215, + "grad_norm": 11.630016326904297, + "learning_rate": 9.957586941092975e-05, + "loss": 0.5891122341156005, + "step": 2990 + }, + { + 
"epoch": 0.42583392476933996, + "grad_norm": 11.667549133300781, + "learning_rate": 9.957444996451386e-05, + "loss": 0.7183985233306884, + "step": 3000 + }, + { + "epoch": 0.42583392476933996, + "eval_accuracy": 0.7175557957652445, + "eval_loss": 0.8312568068504333, + "eval_runtime": 32.7465, + "eval_samples_per_second": 480.265, + "eval_steps_per_second": 15.024, + "step": 3000 + }, + { + "epoch": 0.42725337118523776, + "grad_norm": 10.770739555358887, + "learning_rate": 9.957303051809794e-05, + "loss": 0.606045913696289, + "step": 3010 + }, + { + "epoch": 0.42867281760113557, + "grad_norm": 8.715160369873047, + "learning_rate": 9.957161107168205e-05, + "loss": 0.6968401908874512, + "step": 3020 + }, + { + "epoch": 0.4300922640170334, + "grad_norm": 10.227581977844238, + "learning_rate": 9.957019162526615e-05, + "loss": 0.5089622497558594, + "step": 3030 + }, + { + "epoch": 0.4315117104329312, + "grad_norm": 8.32385540008545, + "learning_rate": 9.956877217885026e-05, + "loss": 0.6402715682983399, + "step": 3040 + }, + { + "epoch": 0.43293115684882894, + "grad_norm": 10.973727226257324, + "learning_rate": 9.956735273243436e-05, + "loss": 0.7282869338989257, + "step": 3050 + }, + { + "epoch": 0.43435060326472674, + "grad_norm": 8.994437217712402, + "learning_rate": 9.956593328601845e-05, + "loss": 0.5776423454284668, + "step": 3060 + }, + { + "epoch": 0.43577004968062455, + "grad_norm": 7.597539901733398, + "learning_rate": 9.956451383960255e-05, + "loss": 0.5537106990814209, + "step": 3070 + }, + { + "epoch": 0.43718949609652236, + "grad_norm": 7.695132732391357, + "learning_rate": 9.956309439318666e-05, + "loss": 0.5561283588409424, + "step": 3080 + }, + { + "epoch": 0.43860894251242016, + "grad_norm": 10.008833885192871, + "learning_rate": 9.956167494677077e-05, + "loss": 0.6571722030639648, + "step": 3090 + }, + { + "epoch": 0.44002838892831797, + "grad_norm": 6.440252304077148, + "learning_rate": 9.956025550035487e-05, + "loss": 0.4972050189971924, + "step": 
3100 + }, + { + "epoch": 0.4414478353442158, + "grad_norm": 11.92957878112793, + "learning_rate": 9.955883605393897e-05, + "loss": 0.6483690738677979, + "step": 3110 + }, + { + "epoch": 0.44286728176011353, + "grad_norm": 8.40812873840332, + "learning_rate": 9.955741660752307e-05, + "loss": 0.602755069732666, + "step": 3120 + }, + { + "epoch": 0.44428672817601134, + "grad_norm": 6.782786846160889, + "learning_rate": 9.955599716110718e-05, + "loss": 0.6320923328399658, + "step": 3130 + }, + { + "epoch": 0.44570617459190914, + "grad_norm": 12.326107025146484, + "learning_rate": 9.955457771469127e-05, + "loss": 0.5779653549194336, + "step": 3140 + }, + { + "epoch": 0.44712562100780695, + "grad_norm": 12.876483917236328, + "learning_rate": 9.955315826827538e-05, + "loss": 0.7216415882110596, + "step": 3150 + }, + { + "epoch": 0.44854506742370476, + "grad_norm": 6.984850883483887, + "learning_rate": 9.955173882185947e-05, + "loss": 0.4415611267089844, + "step": 3160 + }, + { + "epoch": 0.44996451383960256, + "grad_norm": 6.711297512054443, + "learning_rate": 9.955031937544358e-05, + "loss": 0.5505913734436035, + "step": 3170 + }, + { + "epoch": 0.45138396025550037, + "grad_norm": 7.127682685852051, + "learning_rate": 9.954889992902769e-05, + "loss": 0.5563027858734131, + "step": 3180 + }, + { + "epoch": 0.4528034066713982, + "grad_norm": 9.826492309570312, + "learning_rate": 9.954748048261179e-05, + "loss": 0.5204686641693115, + "step": 3190 + }, + { + "epoch": 0.45422285308729593, + "grad_norm": 14.011224746704102, + "learning_rate": 9.95460610361959e-05, + "loss": 0.5676139831542969, + "step": 3200 + }, + { + "epoch": 0.45564229950319374, + "grad_norm": 10.502514839172363, + "learning_rate": 9.954464158977998e-05, + "loss": 0.6593122482299805, + "step": 3210 + }, + { + "epoch": 0.45706174591909154, + "grad_norm": 9.966157913208008, + "learning_rate": 9.95432221433641e-05, + "loss": 0.5757305145263671, + "step": 3220 + }, + { + "epoch": 0.45848119233498935, + 
"grad_norm": 7.551996231079102, + "learning_rate": 9.954180269694819e-05, + "loss": 0.5537711620330811, + "step": 3230 + }, + { + "epoch": 0.45990063875088716, + "grad_norm": 10.630086898803711, + "learning_rate": 9.95403832505323e-05, + "loss": 0.5302771091461181, + "step": 3240 + }, + { + "epoch": 0.46132008516678497, + "grad_norm": 12.471774101257324, + "learning_rate": 9.95389638041164e-05, + "loss": 0.6347667694091796, + "step": 3250 + }, + { + "epoch": 0.4627395315826828, + "grad_norm": 9.668441772460938, + "learning_rate": 9.95375443577005e-05, + "loss": 0.5615960121154785, + "step": 3260 + }, + { + "epoch": 0.4641589779985806, + "grad_norm": 9.092421531677246, + "learning_rate": 9.953612491128461e-05, + "loss": 0.5531889438629151, + "step": 3270 + }, + { + "epoch": 0.46557842441447833, + "grad_norm": 8.55390453338623, + "learning_rate": 9.95347054648687e-05, + "loss": 0.5149998188018798, + "step": 3280 + }, + { + "epoch": 0.46699787083037614, + "grad_norm": 9.092056274414062, + "learning_rate": 9.953328601845282e-05, + "loss": 0.49632701873779295, + "step": 3290 + }, + { + "epoch": 0.46841731724627395, + "grad_norm": 9.66268253326416, + "learning_rate": 9.953186657203691e-05, + "loss": 0.5612505912780762, + "step": 3300 + }, + { + "epoch": 0.46983676366217175, + "grad_norm": 6.583611011505127, + "learning_rate": 9.953044712562102e-05, + "loss": 0.5805669307708741, + "step": 3310 + }, + { + "epoch": 0.47125621007806956, + "grad_norm": 8.160282135009766, + "learning_rate": 9.952902767920511e-05, + "loss": 0.4320365428924561, + "step": 3320 + }, + { + "epoch": 0.47267565649396737, + "grad_norm": 10.05884075164795, + "learning_rate": 9.952760823278922e-05, + "loss": 0.5736487865447998, + "step": 3330 + }, + { + "epoch": 0.4740951029098652, + "grad_norm": 9.000593185424805, + "learning_rate": 9.952618878637332e-05, + "loss": 0.5238205432891846, + "step": 3340 + }, + { + "epoch": 0.4755145493257629, + "grad_norm": 9.076302528381348, + "learning_rate": 
9.952476933995743e-05, + "loss": 0.5925283432006836, + "step": 3350 + }, + { + "epoch": 0.47693399574166073, + "grad_norm": 8.275947570800781, + "learning_rate": 9.952334989354152e-05, + "loss": 0.4787450313568115, + "step": 3360 + }, + { + "epoch": 0.47835344215755854, + "grad_norm": 12.550822257995605, + "learning_rate": 9.952193044712562e-05, + "loss": 0.49250407218933107, + "step": 3370 + }, + { + "epoch": 0.47977288857345635, + "grad_norm": 6.8708176612854, + "learning_rate": 9.952051100070973e-05, + "loss": 0.585394811630249, + "step": 3380 + }, + { + "epoch": 0.48119233498935415, + "grad_norm": 6.129304885864258, + "learning_rate": 9.951909155429383e-05, + "loss": 0.5862763881683349, + "step": 3390 + }, + { + "epoch": 0.48261178140525196, + "grad_norm": 7.1515045166015625, + "learning_rate": 9.951767210787794e-05, + "loss": 0.46143798828125, + "step": 3400 + }, + { + "epoch": 0.48403122782114977, + "grad_norm": 5.421439170837402, + "learning_rate": 9.951625266146204e-05, + "loss": 0.6040849685668945, + "step": 3410 + }, + { + "epoch": 0.4854506742370476, + "grad_norm": 10.418113708496094, + "learning_rate": 9.951483321504614e-05, + "loss": 0.55996732711792, + "step": 3420 + }, + { + "epoch": 0.4868701206529453, + "grad_norm": 9.697559356689453, + "learning_rate": 9.951341376863023e-05, + "loss": 0.5332645893096923, + "step": 3430 + }, + { + "epoch": 0.48828956706884313, + "grad_norm": 9.79345703125, + "learning_rate": 9.951199432221434e-05, + "loss": 0.5983724117279052, + "step": 3440 + }, + { + "epoch": 0.48970901348474094, + "grad_norm": 7.977105617523193, + "learning_rate": 9.951057487579844e-05, + "loss": 0.6096511840820312, + "step": 3450 + }, + { + "epoch": 0.49112845990063875, + "grad_norm": 6.851355075836182, + "learning_rate": 9.950915542938255e-05, + "loss": 0.4748993396759033, + "step": 3460 + }, + { + "epoch": 0.49254790631653655, + "grad_norm": 4.706153392791748, + "learning_rate": 9.950773598296665e-05, + "loss": 0.544727087020874, + "step": 
3470 + }, + { + "epoch": 0.49396735273243436, + "grad_norm": 9.061712265014648, + "learning_rate": 9.950631653655075e-05, + "loss": 0.5076655387878418, + "step": 3480 + }, + { + "epoch": 0.49538679914833217, + "grad_norm": 7.619383335113525, + "learning_rate": 9.950489709013486e-05, + "loss": 0.5011069297790527, + "step": 3490 + }, + { + "epoch": 0.49680624556423, + "grad_norm": 6.629651069641113, + "learning_rate": 9.950347764371896e-05, + "loss": 0.5038942337036133, + "step": 3500 + }, + { + "epoch": 0.49680624556423, + "eval_accuracy": 0.8245056272652127, + "eval_loss": 0.526730477809906, + "eval_runtime": 32.5263, + "eval_samples_per_second": 483.517, + "eval_steps_per_second": 15.126, + "step": 3500 + }, + { + "epoch": 0.4982256919801277, + "grad_norm": 6.535589694976807, + "learning_rate": 9.950205819730307e-05, + "loss": 0.5255190849304199, + "step": 3510 + }, + { + "epoch": 0.49964513839602553, + "grad_norm": 10.481846809387207, + "learning_rate": 9.950063875088715e-05, + "loss": 0.4977625846862793, + "step": 3520 + }, + { + "epoch": 0.5010645848119234, + "grad_norm": 6.455493450164795, + "learning_rate": 9.949921930447126e-05, + "loss": 0.4624650955200195, + "step": 3530 + }, + { + "epoch": 0.5024840312278211, + "grad_norm": 12.190658569335938, + "learning_rate": 9.949779985805536e-05, + "loss": 0.45445499420166013, + "step": 3540 + }, + { + "epoch": 0.5039034776437189, + "grad_norm": 6.512971878051758, + "learning_rate": 9.949638041163947e-05, + "loss": 0.48822684288024903, + "step": 3550 + }, + { + "epoch": 0.5053229240596168, + "grad_norm": 8.259076118469238, + "learning_rate": 9.949496096522357e-05, + "loss": 0.4896749496459961, + "step": 3560 + }, + { + "epoch": 0.5067423704755145, + "grad_norm": 10.809083938598633, + "learning_rate": 9.949354151880766e-05, + "loss": 0.5267855644226074, + "step": 3570 + }, + { + "epoch": 0.5081618168914124, + "grad_norm": 11.164665222167969, + "learning_rate": 9.949212207239178e-05, + "loss": 0.6478964328765869, + 
"step": 3580 + }, + { + "epoch": 0.5095812633073101, + "grad_norm": 10.553145408630371, + "learning_rate": 9.949070262597587e-05, + "loss": 0.5322469711303711, + "step": 3590 + }, + { + "epoch": 0.511000709723208, + "grad_norm": 12.578235626220703, + "learning_rate": 9.948928317955998e-05, + "loss": 0.559388542175293, + "step": 3600 + }, + { + "epoch": 0.5124201561391057, + "grad_norm": 7.2467474937438965, + "learning_rate": 9.948786373314408e-05, + "loss": 0.5477664470672607, + "step": 3610 + }, + { + "epoch": 0.5138396025550035, + "grad_norm": 5.959977626800537, + "learning_rate": 9.948644428672818e-05, + "loss": 0.41798744201660154, + "step": 3620 + }, + { + "epoch": 0.5152590489709014, + "grad_norm": 11.72385025024414, + "learning_rate": 9.948502484031228e-05, + "loss": 0.5879819869995118, + "step": 3630 + }, + { + "epoch": 0.5166784953867991, + "grad_norm": 7.881444454193115, + "learning_rate": 9.948360539389639e-05, + "loss": 0.5005061149597168, + "step": 3640 + }, + { + "epoch": 0.518097941802697, + "grad_norm": 7.005399703979492, + "learning_rate": 9.948218594748048e-05, + "loss": 0.5412337303161621, + "step": 3650 + }, + { + "epoch": 0.5195173882185947, + "grad_norm": 13.495038032531738, + "learning_rate": 9.94807665010646e-05, + "loss": 0.5668565273284912, + "step": 3660 + }, + { + "epoch": 0.5209368346344926, + "grad_norm": 8.42395305633545, + "learning_rate": 9.947934705464869e-05, + "loss": 0.6085368633270264, + "step": 3670 + }, + { + "epoch": 0.5223562810503903, + "grad_norm": 8.754134178161621, + "learning_rate": 9.947792760823279e-05, + "loss": 0.5360457420349121, + "step": 3680 + }, + { + "epoch": 0.5237757274662882, + "grad_norm": 5.868712425231934, + "learning_rate": 9.94765081618169e-05, + "loss": 0.6166606903076172, + "step": 3690 + }, + { + "epoch": 0.525195173882186, + "grad_norm": 4.342434883117676, + "learning_rate": 9.9475088715401e-05, + "loss": 0.4890284061431885, + "step": 3700 + }, + { + "epoch": 0.5266146202980837, + "grad_norm": 
8.200478553771973, + "learning_rate": 9.947366926898511e-05, + "loss": 0.5135448455810547, + "step": 3710 + }, + { + "epoch": 0.5280340667139816, + "grad_norm": 6.076674938201904, + "learning_rate": 9.94722498225692e-05, + "loss": 0.37685840129852294, + "step": 3720 + }, + { + "epoch": 0.5294535131298793, + "grad_norm": 8.206668853759766, + "learning_rate": 9.94708303761533e-05, + "loss": 0.43741750717163086, + "step": 3730 + }, + { + "epoch": 0.5308729595457772, + "grad_norm": 8.284717559814453, + "learning_rate": 9.94694109297374e-05, + "loss": 0.46701841354370116, + "step": 3740 + }, + { + "epoch": 0.5322924059616749, + "grad_norm": 8.111977577209473, + "learning_rate": 9.946799148332151e-05, + "loss": 0.5564829349517822, + "step": 3750 + }, + { + "epoch": 0.5337118523775728, + "grad_norm": 10.037016868591309, + "learning_rate": 9.946657203690561e-05, + "loss": 0.4543320655822754, + "step": 3760 + }, + { + "epoch": 0.5351312987934705, + "grad_norm": 6.1391191482543945, + "learning_rate": 9.946515259048972e-05, + "loss": 0.43409008979797364, + "step": 3770 + }, + { + "epoch": 0.5365507452093683, + "grad_norm": 9.031709671020508, + "learning_rate": 9.946373314407382e-05, + "loss": 0.45609292984008787, + "step": 3780 + }, + { + "epoch": 0.5379701916252662, + "grad_norm": 10.507880210876465, + "learning_rate": 9.946231369765791e-05, + "loss": 0.49566287994384767, + "step": 3790 + }, + { + "epoch": 0.5393896380411639, + "grad_norm": 7.94572114944458, + "learning_rate": 9.946089425124203e-05, + "loss": 0.43464975357055663, + "step": 3800 + }, + { + "epoch": 0.5408090844570618, + "grad_norm": 11.292725563049316, + "learning_rate": 9.945947480482612e-05, + "loss": 0.4976043224334717, + "step": 3810 + }, + { + "epoch": 0.5422285308729595, + "grad_norm": 9.720746040344238, + "learning_rate": 9.945805535841023e-05, + "loss": 0.44420394897460935, + "step": 3820 + }, + { + "epoch": 0.5436479772888574, + "grad_norm": 10.859402656555176, + "learning_rate": 
9.945663591199432e-05, + "loss": 0.47884187698364256, + "step": 3830 + }, + { + "epoch": 0.5450674237047551, + "grad_norm": 10.234602928161621, + "learning_rate": 9.945521646557843e-05, + "loss": 0.4273094654083252, + "step": 3840 + }, + { + "epoch": 0.5464868701206529, + "grad_norm": 10.073461532592773, + "learning_rate": 9.945379701916253e-05, + "loss": 0.4809098243713379, + "step": 3850 + }, + { + "epoch": 0.5479063165365508, + "grad_norm": 8.402386665344238, + "learning_rate": 9.945237757274664e-05, + "loss": 0.5075035572052002, + "step": 3860 + }, + { + "epoch": 0.5493257629524485, + "grad_norm": 8.385801315307617, + "learning_rate": 9.945095812633073e-05, + "loss": 0.42679743766784667, + "step": 3870 + }, + { + "epoch": 0.5507452093683464, + "grad_norm": 8.214275360107422, + "learning_rate": 9.944953867991483e-05, + "loss": 0.42831969261169434, + "step": 3880 + }, + { + "epoch": 0.5521646557842441, + "grad_norm": 6.777364730834961, + "learning_rate": 9.944811923349894e-05, + "loss": 0.37784249782562257, + "step": 3890 + }, + { + "epoch": 0.553584102200142, + "grad_norm": 7.4766011238098145, + "learning_rate": 9.944669978708304e-05, + "loss": 0.5389047622680664, + "step": 3900 + }, + { + "epoch": 0.5550035486160397, + "grad_norm": 7.167613983154297, + "learning_rate": 9.944528034066715e-05, + "loss": 0.4535686492919922, + "step": 3910 + }, + { + "epoch": 0.5564229950319376, + "grad_norm": 3.94936203956604, + "learning_rate": 9.944386089425125e-05, + "loss": 0.392999267578125, + "step": 3920 + }, + { + "epoch": 0.5578424414478353, + "grad_norm": 7.909378528594971, + "learning_rate": 9.944244144783535e-05, + "loss": 0.4675307750701904, + "step": 3930 + }, + { + "epoch": 0.5592618878637331, + "grad_norm": 8.253449440002441, + "learning_rate": 9.944102200141944e-05, + "loss": 0.515011215209961, + "step": 3940 + }, + { + "epoch": 0.560681334279631, + "grad_norm": 5.535346984863281, + "learning_rate": 9.943960255500355e-05, + "loss": 0.35612196922302247, + "step": 
3950 + }, + { + "epoch": 0.5621007806955287, + "grad_norm": 5.621975898742676, + "learning_rate": 9.943818310858765e-05, + "loss": 0.3292267322540283, + "step": 3960 + }, + { + "epoch": 0.5635202271114266, + "grad_norm": 8.432771682739258, + "learning_rate": 9.943676366217176e-05, + "loss": 0.44489707946777346, + "step": 3970 + }, + { + "epoch": 0.5649396735273243, + "grad_norm": 5.422188758850098, + "learning_rate": 9.943534421575586e-05, + "loss": 0.4312278270721436, + "step": 3980 + }, + { + "epoch": 0.5663591199432222, + "grad_norm": 6.463229179382324, + "learning_rate": 9.943392476933996e-05, + "loss": 0.4469761371612549, + "step": 3990 + }, + { + "epoch": 0.5677785663591199, + "grad_norm": 12.039133071899414, + "learning_rate": 9.943250532292407e-05, + "loss": 0.5653008937835693, + "step": 4000 + }, + { + "epoch": 0.5677785663591199, + "eval_accuracy": 0.8540726139759649, + "eval_loss": 0.4309617578983307, + "eval_runtime": 32.698, + "eval_samples_per_second": 480.977, + "eval_steps_per_second": 15.047, + "step": 4000 + }, + { + "epoch": 0.5691980127750177, + "grad_norm": 12.185803413391113, + "learning_rate": 9.943108587650817e-05, + "loss": 0.4458905220031738, + "step": 4010 + }, + { + "epoch": 0.5706174591909156, + "grad_norm": 9.691877365112305, + "learning_rate": 9.942966643009228e-05, + "loss": 0.39486720561981203, + "step": 4020 + }, + { + "epoch": 0.5720369056068133, + "grad_norm": 8.106902122497559, + "learning_rate": 9.942824698367637e-05, + "loss": 0.49444093704223635, + "step": 4030 + }, + { + "epoch": 0.5734563520227112, + "grad_norm": 9.14234447479248, + "learning_rate": 9.942682753726047e-05, + "loss": 0.43038039207458495, + "step": 4040 + }, + { + "epoch": 0.5748757984386089, + "grad_norm": 4.6097588539123535, + "learning_rate": 9.942540809084457e-05, + "loss": 0.4118741512298584, + "step": 4050 + }, + { + "epoch": 0.5762952448545068, + "grad_norm": 6.0909881591796875, + "learning_rate": 9.942398864442868e-05, + "loss": 0.4245272159576416, + 
"step": 4060 + }, + { + "epoch": 0.5777146912704045, + "grad_norm": 10.82681941986084, + "learning_rate": 9.942256919801278e-05, + "loss": 0.43029065132141114, + "step": 4070 + }, + { + "epoch": 0.5791341376863024, + "grad_norm": 7.2398481369018555, + "learning_rate": 9.942114975159689e-05, + "loss": 0.4268779277801514, + "step": 4080 + }, + { + "epoch": 0.5805535841022001, + "grad_norm": 12.160025596618652, + "learning_rate": 9.941973030518099e-05, + "loss": 0.45933380126953127, + "step": 4090 + }, + { + "epoch": 0.5819730305180979, + "grad_norm": 8.116787910461426, + "learning_rate": 9.941831085876508e-05, + "loss": 0.3810285568237305, + "step": 4100 + }, + { + "epoch": 0.5833924769339958, + "grad_norm": 7.5045037269592285, + "learning_rate": 9.94168914123492e-05, + "loss": 0.48673238754272463, + "step": 4110 + }, + { + "epoch": 0.5848119233498935, + "grad_norm": 10.9375581741333, + "learning_rate": 9.941547196593329e-05, + "loss": 0.5465658664703369, + "step": 4120 + }, + { + "epoch": 0.5862313697657914, + "grad_norm": 9.211751937866211, + "learning_rate": 9.94140525195174e-05, + "loss": 0.46831202507019043, + "step": 4130 + }, + { + "epoch": 0.5876508161816891, + "grad_norm": 7.636734485626221, + "learning_rate": 9.941263307310149e-05, + "loss": 0.3928233623504639, + "step": 4140 + }, + { + "epoch": 0.589070262597587, + "grad_norm": 7.125626564025879, + "learning_rate": 9.94112136266856e-05, + "loss": 0.40816364288330076, + "step": 4150 + }, + { + "epoch": 0.5904897090134847, + "grad_norm": 5.0693888664245605, + "learning_rate": 9.94097941802697e-05, + "loss": 0.3931445837020874, + "step": 4160 + }, + { + "epoch": 0.5919091554293825, + "grad_norm": 8.10261058807373, + "learning_rate": 9.94083747338538e-05, + "loss": 0.4498757839202881, + "step": 4170 + }, + { + "epoch": 0.5933286018452804, + "grad_norm": 9.593578338623047, + "learning_rate": 9.94069552874379e-05, + "loss": 0.46671414375305176, + "step": 4180 + }, + { + "epoch": 0.5947480482611781, + 
"grad_norm": 10.025617599487305, + "learning_rate": 9.9405535841022e-05, + "loss": 0.42932772636413574, + "step": 4190 + }, + { + "epoch": 0.596167494677076, + "grad_norm": 9.828198432922363, + "learning_rate": 9.940411639460611e-05, + "loss": 0.4723196506500244, + "step": 4200 + }, + { + "epoch": 0.5975869410929737, + "grad_norm": 7.570648193359375, + "learning_rate": 9.940269694819021e-05, + "loss": 0.4049358367919922, + "step": 4210 + }, + { + "epoch": 0.5990063875088716, + "grad_norm": 6.280502796173096, + "learning_rate": 9.940127750177432e-05, + "loss": 0.4247574806213379, + "step": 4220 + }, + { + "epoch": 0.6004258339247693, + "grad_norm": 8.619515419006348, + "learning_rate": 9.939985805535842e-05, + "loss": 0.3778993606567383, + "step": 4230 + }, + { + "epoch": 0.6018452803406671, + "grad_norm": 7.0030059814453125, + "learning_rate": 9.939843860894251e-05, + "loss": 0.4360033988952637, + "step": 4240 + }, + { + "epoch": 0.603264726756565, + "grad_norm": 6.206148624420166, + "learning_rate": 9.939701916252661e-05, + "loss": 0.4114119529724121, + "step": 4250 + }, + { + "epoch": 0.6046841731724627, + "grad_norm": 4.982306003570557, + "learning_rate": 9.939559971611072e-05, + "loss": 0.36019244194030764, + "step": 4260 + }, + { + "epoch": 0.6061036195883606, + "grad_norm": 7.193652153015137, + "learning_rate": 9.939418026969482e-05, + "loss": 0.47596259117126466, + "step": 4270 + }, + { + "epoch": 0.6075230660042583, + "grad_norm": 9.371147155761719, + "learning_rate": 9.939276082327893e-05, + "loss": 0.42912769317626953, + "step": 4280 + }, + { + "epoch": 0.6089425124201562, + "grad_norm": 8.962141036987305, + "learning_rate": 9.939134137686303e-05, + "loss": 0.42342243194580076, + "step": 4290 + }, + { + "epoch": 0.6103619588360539, + "grad_norm": 7.575186252593994, + "learning_rate": 9.938992193044712e-05, + "loss": 0.4972747802734375, + "step": 4300 + }, + { + "epoch": 0.6117814052519518, + "grad_norm": 6.965094566345215, + "learning_rate": 
9.938850248403124e-05, + "loss": 0.3489841938018799, + "step": 4310 + }, + { + "epoch": 0.6132008516678495, + "grad_norm": 8.466391563415527, + "learning_rate": 9.938708303761533e-05, + "loss": 0.3544389009475708, + "step": 4320 + }, + { + "epoch": 0.6146202980837473, + "grad_norm": 6.5821123123168945, + "learning_rate": 9.938566359119944e-05, + "loss": 0.4732979297637939, + "step": 4330 + }, + { + "epoch": 0.6160397444996452, + "grad_norm": 6.803234100341797, + "learning_rate": 9.938424414478353e-05, + "loss": 0.39069912433624265, + "step": 4340 + }, + { + "epoch": 0.6174591909155429, + "grad_norm": 10.069840431213379, + "learning_rate": 9.938282469836764e-05, + "loss": 0.4592564582824707, + "step": 4350 + }, + { + "epoch": 0.6188786373314408, + "grad_norm": 9.41560173034668, + "learning_rate": 9.938140525195174e-05, + "loss": 0.45508289337158203, + "step": 4360 + }, + { + "epoch": 0.6202980837473385, + "grad_norm": 8.344886779785156, + "learning_rate": 9.937998580553585e-05, + "loss": 0.41198153495788575, + "step": 4370 + }, + { + "epoch": 0.6217175301632364, + "grad_norm": 9.129981994628906, + "learning_rate": 9.937856635911994e-05, + "loss": 0.33535902500152587, + "step": 4380 + }, + { + "epoch": 0.6231369765791341, + "grad_norm": 6.8436455726623535, + "learning_rate": 9.937714691270406e-05, + "loss": 0.3965883493423462, + "step": 4390 + }, + { + "epoch": 0.6245564229950319, + "grad_norm": 6.954466342926025, + "learning_rate": 9.937572746628815e-05, + "loss": 0.3352261304855347, + "step": 4400 + }, + { + "epoch": 0.6259758694109298, + "grad_norm": 8.227835655212402, + "learning_rate": 9.937430801987225e-05, + "loss": 0.4205745220184326, + "step": 4410 + }, + { + "epoch": 0.6273953158268275, + "grad_norm": 8.202418327331543, + "learning_rate": 9.937288857345636e-05, + "loss": 0.3927265405654907, + "step": 4420 + }, + { + "epoch": 0.6288147622427254, + "grad_norm": 9.406537055969238, + "learning_rate": 9.937146912704046e-05, + "loss": 0.4481183052062988, + 
"step": 4430 + }, + { + "epoch": 0.6302342086586231, + "grad_norm": 8.330412864685059, + "learning_rate": 9.937004968062457e-05, + "loss": 0.37030580043792727, + "step": 4440 + }, + { + "epoch": 0.631653655074521, + "grad_norm": 5.601277828216553, + "learning_rate": 9.936863023420865e-05, + "loss": 0.35557353496551514, + "step": 4450 + }, + { + "epoch": 0.6330731014904187, + "grad_norm": 11.551403999328613, + "learning_rate": 9.936721078779276e-05, + "loss": 0.3718759536743164, + "step": 4460 + }, + { + "epoch": 0.6344925479063165, + "grad_norm": 5.961857318878174, + "learning_rate": 9.936579134137686e-05, + "loss": 0.3828912258148193, + "step": 4470 + }, + { + "epoch": 0.6359119943222143, + "grad_norm": 6.173798561096191, + "learning_rate": 9.936437189496097e-05, + "loss": 0.392284631729126, + "step": 4480 + }, + { + "epoch": 0.6373314407381121, + "grad_norm": 8.952240943908691, + "learning_rate": 9.936295244854508e-05, + "loss": 0.41978960037231444, + "step": 4490 + }, + { + "epoch": 0.63875088715401, + "grad_norm": 9.86811637878418, + "learning_rate": 9.936153300212917e-05, + "loss": 0.42105417251586913, + "step": 4500 + }, + { + "epoch": 0.63875088715401, + "eval_accuracy": 0.8225344948178293, + "eval_loss": 0.5203356146812439, + "eval_runtime": 33.0374, + "eval_samples_per_second": 476.036, + "eval_steps_per_second": 14.892, + "step": 4500 + }, + { + "epoch": 0.6401703335699077, + "grad_norm": 10.036981582641602, + "learning_rate": 9.936011355571328e-05, + "loss": 0.41321401596069335, + "step": 4510 + }, + { + "epoch": 0.6415897799858056, + "grad_norm": 6.618304252624512, + "learning_rate": 9.935869410929738e-05, + "loss": 0.43657841682434084, + "step": 4520 + }, + { + "epoch": 0.6430092264017033, + "grad_norm": 9.975127220153809, + "learning_rate": 9.935727466288149e-05, + "loss": 0.3949880838394165, + "step": 4530 + }, + { + "epoch": 0.6444286728176012, + "grad_norm": 8.210672378540039, + "learning_rate": 9.935585521646558e-05, + "loss": 0.4280043125152588, 
+ "step": 4540 + }, + { + "epoch": 0.6458481192334989, + "grad_norm": 12.055879592895508, + "learning_rate": 9.935443577004968e-05, + "loss": 0.39465947151184083, + "step": 4550 + }, + { + "epoch": 0.6472675656493967, + "grad_norm": 7.540829658508301, + "learning_rate": 9.935301632363378e-05, + "loss": 0.3965680837631226, + "step": 4560 + }, + { + "epoch": 0.6486870120652946, + "grad_norm": 9.717781066894531, + "learning_rate": 9.935159687721789e-05, + "loss": 0.40194106101989746, + "step": 4570 + }, + { + "epoch": 0.6501064584811923, + "grad_norm": 10.271167755126953, + "learning_rate": 9.9350177430802e-05, + "loss": 0.4726293087005615, + "step": 4580 + }, + { + "epoch": 0.6515259048970902, + "grad_norm": 7.158174514770508, + "learning_rate": 9.93487579843861e-05, + "loss": 0.40993413925170896, + "step": 4590 + }, + { + "epoch": 0.6529453513129879, + "grad_norm": 10.536994934082031, + "learning_rate": 9.93473385379702e-05, + "loss": 0.4424222469329834, + "step": 4600 + }, + { + "epoch": 0.6543647977288858, + "grad_norm": 7.256109714508057, + "learning_rate": 9.934591909155429e-05, + "loss": 0.388359522819519, + "step": 4610 + }, + { + "epoch": 0.6557842441447835, + "grad_norm": 8.278726577758789, + "learning_rate": 9.93444996451384e-05, + "loss": 0.3513230085372925, + "step": 4620 + }, + { + "epoch": 0.6572036905606813, + "grad_norm": 7.767818927764893, + "learning_rate": 9.93430801987225e-05, + "loss": 0.42050671577453613, + "step": 4630 + }, + { + "epoch": 0.6586231369765791, + "grad_norm": 3.4903321266174316, + "learning_rate": 9.934166075230661e-05, + "loss": 0.3255154609680176, + "step": 4640 + }, + { + "epoch": 0.6600425833924769, + "grad_norm": 8.193768501281738, + "learning_rate": 9.93402413058907e-05, + "loss": 0.34639596939086914, + "step": 4650 + }, + { + "epoch": 0.6614620298083748, + "grad_norm": 6.168176651000977, + "learning_rate": 9.93388218594748e-05, + "loss": 0.3619822025299072, + "step": 4660 + }, + { + "epoch": 0.6628814762242725, + 
"grad_norm": 4.793501853942871, + "learning_rate": 9.933740241305892e-05, + "loss": 0.3441330909729004, + "step": 4670 + }, + { + "epoch": 0.6643009226401704, + "grad_norm": 7.100066184997559, + "learning_rate": 9.933598296664301e-05, + "loss": 0.41966400146484373, + "step": 4680 + }, + { + "epoch": 0.6657203690560681, + "grad_norm": 8.032003402709961, + "learning_rate": 9.933456352022713e-05, + "loss": 0.39086959362030027, + "step": 4690 + }, + { + "epoch": 0.6671398154719659, + "grad_norm": 5.533408164978027, + "learning_rate": 9.933314407381121e-05, + "loss": 0.455733060836792, + "step": 4700 + }, + { + "epoch": 0.6685592618878637, + "grad_norm": 6.478943347930908, + "learning_rate": 9.933172462739532e-05, + "loss": 0.3870114326477051, + "step": 4710 + }, + { + "epoch": 0.6699787083037615, + "grad_norm": 8.963722229003906, + "learning_rate": 9.933030518097942e-05, + "loss": 0.4041899681091309, + "step": 4720 + }, + { + "epoch": 0.6713981547196594, + "grad_norm": 4.072963714599609, + "learning_rate": 9.932888573456353e-05, + "loss": 0.35542023181915283, + "step": 4730 + }, + { + "epoch": 0.6728176011355571, + "grad_norm": 6.834389686584473, + "learning_rate": 9.932746628814763e-05, + "loss": 0.34830470085144044, + "step": 4740 + }, + { + "epoch": 0.674237047551455, + "grad_norm": 7.003122329711914, + "learning_rate": 9.932604684173174e-05, + "loss": 0.3465887069702148, + "step": 4750 + }, + { + "epoch": 0.6756564939673527, + "grad_norm": 8.914156913757324, + "learning_rate": 9.932462739531583e-05, + "loss": 0.44321861267089846, + "step": 4760 + }, + { + "epoch": 0.6770759403832506, + "grad_norm": 7.6024627685546875, + "learning_rate": 9.932320794889993e-05, + "loss": 0.40067334175109864, + "step": 4770 + }, + { + "epoch": 0.6784953867991483, + "grad_norm": 8.667821884155273, + "learning_rate": 9.932178850248404e-05, + "loss": 0.371229887008667, + "step": 4780 + }, + { + "epoch": 0.6799148332150461, + "grad_norm": 9.355796813964844, + "learning_rate": 
9.932036905606814e-05, + "loss": 0.3920291900634766, + "step": 4790 + }, + { + "epoch": 0.681334279630944, + "grad_norm": 6.767845153808594, + "learning_rate": 9.931894960965225e-05, + "loss": 0.3848612070083618, + "step": 4800 + }, + { + "epoch": 0.6827537260468417, + "grad_norm": 8.195937156677246, + "learning_rate": 9.931767210787794e-05, + "loss": 0.5190616607666015, + "step": 4810 + }, + { + "epoch": 0.6841731724627396, + "grad_norm": 6.033681869506836, + "learning_rate": 9.931625266146203e-05, + "loss": 0.39132606983184814, + "step": 4820 + }, + { + "epoch": 0.6855926188786373, + "grad_norm": 8.469270706176758, + "learning_rate": 9.931483321504613e-05, + "loss": 0.3626258850097656, + "step": 4830 + }, + { + "epoch": 0.6870120652945352, + "grad_norm": 4.255542278289795, + "learning_rate": 9.931341376863024e-05, + "loss": 0.31856842041015626, + "step": 4840 + }, + { + "epoch": 0.6884315117104329, + "grad_norm": 9.191469192504883, + "learning_rate": 9.931199432221434e-05, + "loss": 0.3280362367630005, + "step": 4850 + }, + { + "epoch": 0.6898509581263307, + "grad_norm": 8.94046688079834, + "learning_rate": 9.931057487579845e-05, + "loss": 0.39851620197296145, + "step": 4860 + }, + { + "epoch": 0.6912704045422285, + "grad_norm": 7.770534992218018, + "learning_rate": 9.930915542938255e-05, + "loss": 0.33825528621673584, + "step": 4870 + }, + { + "epoch": 0.6926898509581263, + "grad_norm": 6.560062885284424, + "learning_rate": 9.930773598296664e-05, + "loss": 0.35839481353759767, + "step": 4880 + }, + { + "epoch": 0.6941092973740242, + "grad_norm": 9.24365520477295, + "learning_rate": 9.930631653655074e-05, + "loss": 0.39770119190216063, + "step": 4890 + }, + { + "epoch": 0.6955287437899219, + "grad_norm": 11.744332313537598, + "learning_rate": 9.930489709013485e-05, + "loss": 0.4902297019958496, + "step": 4900 + }, + { + "epoch": 0.6969481902058198, + "grad_norm": 7.251524448394775, + "learning_rate": 9.930347764371895e-05, + "loss": 0.40317511558532715, + "step": 
4910 + }, + { + "epoch": 0.6983676366217175, + "grad_norm": 8.896724700927734, + "learning_rate": 9.930205819730306e-05, + "loss": 0.44049978256225586, + "step": 4920 + }, + { + "epoch": 0.6997870830376153, + "grad_norm": 7.477156162261963, + "learning_rate": 9.930063875088716e-05, + "loss": 0.3586245536804199, + "step": 4930 + }, + { + "epoch": 0.7012065294535131, + "grad_norm": 6.159836769104004, + "learning_rate": 9.929921930447126e-05, + "loss": 0.32783629894256594, + "step": 4940 + }, + { + "epoch": 0.7026259758694109, + "grad_norm": 6.85299825668335, + "learning_rate": 9.929779985805537e-05, + "loss": 0.30911822319030763, + "step": 4950 + }, + { + "epoch": 0.7040454222853088, + "grad_norm": 7.820040225982666, + "learning_rate": 9.929638041163946e-05, + "loss": 0.36734838485717775, + "step": 4960 + }, + { + "epoch": 0.7054648687012065, + "grad_norm": 6.66180944442749, + "learning_rate": 9.929496096522358e-05, + "loss": 0.37120904922485354, + "step": 4970 + }, + { + "epoch": 0.7068843151171044, + "grad_norm": 7.3861775398254395, + "learning_rate": 9.929354151880766e-05, + "loss": 0.4064349174499512, + "step": 4980 + }, + { + "epoch": 0.7083037615330021, + "grad_norm": 7.068629741668701, + "learning_rate": 9.929212207239177e-05, + "loss": 0.36406426429748534, + "step": 4990 + }, + { + "epoch": 0.7097232079489, + "grad_norm": 7.482442378997803, + "learning_rate": 9.929070262597587e-05, + "loss": 0.40763154029846194, + "step": 5000 + }, + { + "epoch": 0.7097232079489, + "eval_accuracy": 0.8707954473198957, + "eval_loss": 0.37987253069877625, + "eval_runtime": 33.0642, + "eval_samples_per_second": 475.651, + "eval_steps_per_second": 14.88, + "step": 5000 + }, + { + "epoch": 0.7111426543647977, + "grad_norm": 5.368759632110596, + "learning_rate": 9.928928317955998e-05, + "loss": 0.40792322158813477, + "step": 5010 + }, + { + "epoch": 0.7125621007806955, + "grad_norm": 3.8395280838012695, + "learning_rate": 9.928786373314408e-05, + "loss": 0.45433621406555175, + 
"step": 5020 + }, + { + "epoch": 0.7139815471965933, + "grad_norm": 7.884678840637207, + "learning_rate": 9.928644428672817e-05, + "loss": 0.3092354774475098, + "step": 5030 + }, + { + "epoch": 0.7154009936124911, + "grad_norm": 9.11925983428955, + "learning_rate": 9.928502484031228e-05, + "loss": 0.3887113094329834, + "step": 5040 + }, + { + "epoch": 0.716820440028389, + "grad_norm": 8.5901517868042, + "learning_rate": 9.928360539389638e-05, + "loss": 0.3938072443008423, + "step": 5050 + }, + { + "epoch": 0.7182398864442867, + "grad_norm": 4.011209011077881, + "learning_rate": 9.928218594748049e-05, + "loss": 0.3140719890594482, + "step": 5060 + }, + { + "epoch": 0.7196593328601846, + "grad_norm": 9.04295825958252, + "learning_rate": 9.928076650106459e-05, + "loss": 0.37023751735687255, + "step": 5070 + }, + { + "epoch": 0.7210787792760823, + "grad_norm": 7.336644649505615, + "learning_rate": 9.92793470546487e-05, + "loss": 0.3326029539108276, + "step": 5080 + }, + { + "epoch": 0.7224982256919801, + "grad_norm": 6.824075698852539, + "learning_rate": 9.927792760823278e-05, + "loss": 0.31377925872802737, + "step": 5090 + }, + { + "epoch": 0.7239176721078779, + "grad_norm": 6.152795314788818, + "learning_rate": 9.92765081618169e-05, + "loss": 0.4362512111663818, + "step": 5100 + }, + { + "epoch": 0.7253371185237757, + "grad_norm": 7.997036457061768, + "learning_rate": 9.927508871540099e-05, + "loss": 0.39910459518432617, + "step": 5110 + }, + { + "epoch": 0.7267565649396736, + "grad_norm": 7.5024309158325195, + "learning_rate": 9.92736692689851e-05, + "loss": 0.3690288305282593, + "step": 5120 + }, + { + "epoch": 0.7281760113555713, + "grad_norm": 9.340811729431152, + "learning_rate": 9.92722498225692e-05, + "loss": 0.28037595748901367, + "step": 5130 + }, + { + "epoch": 0.7295954577714692, + "grad_norm": 6.796107292175293, + "learning_rate": 9.92708303761533e-05, + "loss": 0.2862435817718506, + "step": 5140 + }, + { + "epoch": 0.7310149041873669, + "grad_norm": 
6.0283379554748535, + "learning_rate": 9.926941092973741e-05, + "loss": 0.351378345489502, + "step": 5150 + }, + { + "epoch": 0.7324343506032647, + "grad_norm": 6.880161762237549, + "learning_rate": 9.926799148332151e-05, + "loss": 0.3127347230911255, + "step": 5160 + }, + { + "epoch": 0.7338537970191625, + "grad_norm": 7.761416912078857, + "learning_rate": 9.926657203690562e-05, + "loss": 0.3232876777648926, + "step": 5170 + }, + { + "epoch": 0.7352732434350603, + "grad_norm": 8.840635299682617, + "learning_rate": 9.926515259048972e-05, + "loss": 0.36195032596588134, + "step": 5180 + }, + { + "epoch": 0.7366926898509581, + "grad_norm": 10.067350387573242, + "learning_rate": 9.926373314407381e-05, + "loss": 0.33318257331848145, + "step": 5190 + }, + { + "epoch": 0.7381121362668559, + "grad_norm": 4.935089111328125, + "learning_rate": 9.926231369765791e-05, + "loss": 0.3263442039489746, + "step": 5200 + }, + { + "epoch": 0.7395315826827538, + "grad_norm": 6.868301868438721, + "learning_rate": 9.926089425124202e-05, + "loss": 0.4087569236755371, + "step": 5210 + }, + { + "epoch": 0.7409510290986515, + "grad_norm": 7.978097915649414, + "learning_rate": 9.925947480482612e-05, + "loss": 0.33616573810577394, + "step": 5220 + }, + { + "epoch": 0.7423704755145494, + "grad_norm": 11.391094207763672, + "learning_rate": 9.925805535841023e-05, + "loss": 0.33483550548553465, + "step": 5230 + }, + { + "epoch": 0.7437899219304471, + "grad_norm": 5.558361530303955, + "learning_rate": 9.925663591199433e-05, + "loss": 0.38994641304016114, + "step": 5240 + }, + { + "epoch": 0.7452093683463449, + "grad_norm": 2.6022746562957764, + "learning_rate": 9.925521646557842e-05, + "loss": 0.2801194429397583, + "step": 5250 + }, + { + "epoch": 0.7466288147622427, + "grad_norm": 10.395146369934082, + "learning_rate": 9.925379701916253e-05, + "loss": 0.45772466659545896, + "step": 5260 + }, + { + "epoch": 0.7480482611781405, + "grad_norm": 10.162497520446777, + "learning_rate": 
9.925237757274663e-05, + "loss": 0.3906741142272949, + "step": 5270 + }, + { + "epoch": 0.7494677075940384, + "grad_norm": 7.618703365325928, + "learning_rate": 9.925095812633074e-05, + "loss": 0.3549813747406006, + "step": 5280 + }, + { + "epoch": 0.7508871540099361, + "grad_norm": 6.407444953918457, + "learning_rate": 9.924953867991483e-05, + "loss": 0.3040858268737793, + "step": 5290 + }, + { + "epoch": 0.752306600425834, + "grad_norm": 7.738057613372803, + "learning_rate": 9.924811923349894e-05, + "loss": 0.39499850273132325, + "step": 5300 + }, + { + "epoch": 0.7537260468417317, + "grad_norm": 7.237374782562256, + "learning_rate": 9.924669978708304e-05, + "loss": 0.3085558652877808, + "step": 5310 + }, + { + "epoch": 0.7551454932576295, + "grad_norm": 6.442776203155518, + "learning_rate": 9.924528034066715e-05, + "loss": 0.40102262496948243, + "step": 5320 + }, + { + "epoch": 0.7565649396735273, + "grad_norm": 10.280111312866211, + "learning_rate": 9.924386089425126e-05, + "loss": 0.3338863611221313, + "step": 5330 + }, + { + "epoch": 0.7579843860894251, + "grad_norm": 8.590238571166992, + "learning_rate": 9.924244144783534e-05, + "loss": 0.48393831253051756, + "step": 5340 + }, + { + "epoch": 0.759403832505323, + "grad_norm": 4.818009376525879, + "learning_rate": 9.924102200141945e-05, + "loss": 0.31519811153411864, + "step": 5350 + }, + { + "epoch": 0.7608232789212207, + "grad_norm": 7.284486293792725, + "learning_rate": 9.923960255500355e-05, + "loss": 0.3537211179733276, + "step": 5360 + }, + { + "epoch": 0.7622427253371186, + "grad_norm": 8.618793487548828, + "learning_rate": 9.923818310858766e-05, + "loss": 0.34086947441101073, + "step": 5370 + }, + { + "epoch": 0.7636621717530163, + "grad_norm": 8.162178039550781, + "learning_rate": 9.923676366217176e-05, + "loss": 0.38811311721801756, + "step": 5380 + }, + { + "epoch": 0.7650816181689141, + "grad_norm": 7.360818386077881, + "learning_rate": 9.923534421575587e-05, + "loss": 0.30603010654449464, + 
"step": 5390 + }, + { + "epoch": 0.7665010645848119, + "grad_norm": 4.011861801147461, + "learning_rate": 9.923392476933995e-05, + "loss": 0.23683266639709472, + "step": 5400 + }, + { + "epoch": 0.7679205110007097, + "grad_norm": 5.943147659301758, + "learning_rate": 9.923250532292406e-05, + "loss": 0.34063313007354734, + "step": 5410 + }, + { + "epoch": 0.7693399574166075, + "grad_norm": 7.751121997833252, + "learning_rate": 9.923108587650817e-05, + "loss": 0.36524248123168945, + "step": 5420 + }, + { + "epoch": 0.7707594038325053, + "grad_norm": 8.413863182067871, + "learning_rate": 9.922966643009227e-05, + "loss": 0.3002290725708008, + "step": 5430 + }, + { + "epoch": 0.7721788502484032, + "grad_norm": 7.4792280197143555, + "learning_rate": 9.922824698367638e-05, + "loss": 0.2858253240585327, + "step": 5440 + }, + { + "epoch": 0.7735982966643009, + "grad_norm": 4.943634986877441, + "learning_rate": 9.922682753726047e-05, + "loss": 0.3922913074493408, + "step": 5450 + }, + { + "epoch": 0.7750177430801988, + "grad_norm": 9.556757926940918, + "learning_rate": 9.922540809084458e-05, + "loss": 0.32624542713165283, + "step": 5460 + }, + { + "epoch": 0.7764371894960965, + "grad_norm": 6.306029319763184, + "learning_rate": 9.922398864442867e-05, + "loss": 0.32522106170654297, + "step": 5470 + }, + { + "epoch": 0.7778566359119943, + "grad_norm": 9.622481346130371, + "learning_rate": 9.922256919801279e-05, + "loss": 0.32840585708618164, + "step": 5480 + }, + { + "epoch": 0.7792760823278921, + "grad_norm": 6.480415344238281, + "learning_rate": 9.922114975159688e-05, + "loss": 0.31494650840759275, + "step": 5490 + }, + { + "epoch": 0.7806955287437899, + "grad_norm": 9.822346687316895, + "learning_rate": 9.921973030518098e-05, + "loss": 0.3520227909088135, + "step": 5500 + }, + { + "epoch": 0.7806955287437899, + "eval_accuracy": 0.8887263940993196, + "eval_loss": 0.331625759601593, + "eval_runtime": 33.1217, + "eval_samples_per_second": 474.825, + "eval_steps_per_second": 
14.854, + "step": 5500 + }, + { + "epoch": 0.7821149751596878, + "grad_norm": 8.544402122497559, + "learning_rate": 9.921831085876508e-05, + "loss": 0.3386709451675415, + "step": 5510 + }, + { + "epoch": 0.7835344215755855, + "grad_norm": 6.877591133117676, + "learning_rate": 9.921689141234919e-05, + "loss": 0.3577073574066162, + "step": 5520 + }, + { + "epoch": 0.7849538679914834, + "grad_norm": 8.182839393615723, + "learning_rate": 9.92154719659333e-05, + "loss": 0.33861188888549804, + "step": 5530 + }, + { + "epoch": 0.7863733144073811, + "grad_norm": 7.762393474578857, + "learning_rate": 9.92140525195174e-05, + "loss": 0.2913277387619019, + "step": 5540 + }, + { + "epoch": 0.7877927608232789, + "grad_norm": 9.238672256469727, + "learning_rate": 9.92126330731015e-05, + "loss": 0.27555758953094484, + "step": 5550 + }, + { + "epoch": 0.7892122072391767, + "grad_norm": 8.316729545593262, + "learning_rate": 9.921121362668559e-05, + "loss": 0.3221546411514282, + "step": 5560 + }, + { + "epoch": 0.7906316536550745, + "grad_norm": 5.685539245605469, + "learning_rate": 9.92097941802697e-05, + "loss": 0.335821533203125, + "step": 5570 + }, + { + "epoch": 0.7920511000709723, + "grad_norm": 9.121819496154785, + "learning_rate": 9.92083747338538e-05, + "loss": 0.41519789695739745, + "step": 5580 + }, + { + "epoch": 0.7934705464868701, + "grad_norm": 10.83812141418457, + "learning_rate": 9.920695528743791e-05, + "loss": 0.30081839561462403, + "step": 5590 + }, + { + "epoch": 0.794889992902768, + "grad_norm": 3.7030341625213623, + "learning_rate": 9.9205535841022e-05, + "loss": 0.3369245767593384, + "step": 5600 + }, + { + "epoch": 0.7963094393186657, + "grad_norm": 3.8987886905670166, + "learning_rate": 9.92041163946061e-05, + "loss": 0.3294223785400391, + "step": 5610 + }, + { + "epoch": 0.7977288857345636, + "grad_norm": 4.1831207275390625, + "learning_rate": 9.920269694819022e-05, + "loss": 0.2734922170639038, + "step": 5620 + }, + { + "epoch": 0.7991483321504613, + 
"grad_norm": 7.363320827484131, + "learning_rate": 9.920127750177431e-05, + "loss": 0.3629761219024658, + "step": 5630 + }, + { + "epoch": 0.8005677785663591, + "grad_norm": 3.947075366973877, + "learning_rate": 9.919985805535842e-05, + "loss": 0.24655752182006835, + "step": 5640 + }, + { + "epoch": 0.8019872249822569, + "grad_norm": 7.183192253112793, + "learning_rate": 9.919843860894251e-05, + "loss": 0.3074009895324707, + "step": 5650 + }, + { + "epoch": 0.8034066713981547, + "grad_norm": 9.004253387451172, + "learning_rate": 9.919701916252662e-05, + "loss": 0.38861281871795655, + "step": 5660 + }, + { + "epoch": 0.8048261178140526, + "grad_norm": 7.553649425506592, + "learning_rate": 9.919559971611072e-05, + "loss": 0.4247180461883545, + "step": 5670 + }, + { + "epoch": 0.8062455642299503, + "grad_norm": 6.382741928100586, + "learning_rate": 9.919418026969483e-05, + "loss": 0.304930305480957, + "step": 5680 + }, + { + "epoch": 0.8076650106458482, + "grad_norm": 5.102434158325195, + "learning_rate": 9.919276082327893e-05, + "loss": 0.38076980113983155, + "step": 5690 + }, + { + "epoch": 0.8090844570617459, + "grad_norm": 6.131350517272949, + "learning_rate": 9.919134137686302e-05, + "loss": 0.40895967483520507, + "step": 5700 + }, + { + "epoch": 0.8105039034776437, + "grad_norm": 7.717721939086914, + "learning_rate": 9.918992193044713e-05, + "loss": 0.34289727210998533, + "step": 5710 + }, + { + "epoch": 0.8119233498935415, + "grad_norm": 7.452071189880371, + "learning_rate": 9.918850248403123e-05, + "loss": 0.26248266696929934, + "step": 5720 + }, + { + "epoch": 0.8133427963094393, + "grad_norm": 4.934199333190918, + "learning_rate": 9.918708303761534e-05, + "loss": 0.2918365478515625, + "step": 5730 + }, + { + "epoch": 0.8147622427253371, + "grad_norm": 3.497220993041992, + "learning_rate": 9.918566359119944e-05, + "loss": 0.27859480381011964, + "step": 5740 + }, + { + "epoch": 0.8161816891412349, + "grad_norm": 9.320852279663086, + "learning_rate": 
9.918424414478355e-05, + "loss": 0.34371328353881836, + "step": 5750 + }, + { + "epoch": 0.8176011355571328, + "grad_norm": 10.081619262695312, + "learning_rate": 9.918282469836763e-05, + "loss": 0.36181211471557617, + "step": 5760 + }, + { + "epoch": 0.8190205819730305, + "grad_norm": 7.466938018798828, + "learning_rate": 9.918140525195174e-05, + "loss": 0.34078028202056887, + "step": 5770 + }, + { + "epoch": 0.8204400283889283, + "grad_norm": 4.303114414215088, + "learning_rate": 9.917998580553584e-05, + "loss": 0.34729723930358886, + "step": 5780 + }, + { + "epoch": 0.8218594748048261, + "grad_norm": 9.38592529296875, + "learning_rate": 9.917856635911995e-05, + "loss": 0.4285425662994385, + "step": 5790 + }, + { + "epoch": 0.8232789212207239, + "grad_norm": 9.465388298034668, + "learning_rate": 9.917714691270405e-05, + "loss": 0.3501663446426392, + "step": 5800 + }, + { + "epoch": 0.8246983676366217, + "grad_norm": 5.500204086303711, + "learning_rate": 9.917572746628815e-05, + "loss": 0.3102808952331543, + "step": 5810 + }, + { + "epoch": 0.8261178140525195, + "grad_norm": 4.572218894958496, + "learning_rate": 9.917430801987226e-05, + "loss": 0.2433872938156128, + "step": 5820 + }, + { + "epoch": 0.8275372604684174, + "grad_norm": 9.858591079711914, + "learning_rate": 9.917288857345636e-05, + "loss": 0.30695419311523436, + "step": 5830 + }, + { + "epoch": 0.8289567068843151, + "grad_norm": 6.843176364898682, + "learning_rate": 9.917146912704047e-05, + "loss": 0.35634801387786863, + "step": 5840 + }, + { + "epoch": 0.830376153300213, + "grad_norm": 10.634949684143066, + "learning_rate": 9.917004968062456e-05, + "loss": 0.3107039451599121, + "step": 5850 + }, + { + "epoch": 0.8317955997161107, + "grad_norm": 8.44272518157959, + "learning_rate": 9.916863023420866e-05, + "loss": 0.3672316551208496, + "step": 5860 + }, + { + "epoch": 0.8332150461320085, + "grad_norm": 5.4848785400390625, + "learning_rate": 9.916721078779276e-05, + "loss": 0.4015390872955322, + 
"step": 5870 + }, + { + "epoch": 0.8346344925479063, + "grad_norm": 7.271710395812988, + "learning_rate": 9.916579134137687e-05, + "loss": 0.23676373958587646, + "step": 5880 + }, + { + "epoch": 0.8360539389638041, + "grad_norm": 4.376358509063721, + "learning_rate": 9.916437189496097e-05, + "loss": 0.2711988687515259, + "step": 5890 + }, + { + "epoch": 0.837473385379702, + "grad_norm": 6.931346416473389, + "learning_rate": 9.916295244854508e-05, + "loss": 0.2837867021560669, + "step": 5900 + }, + { + "epoch": 0.8388928317955997, + "grad_norm": 7.611521244049072, + "learning_rate": 9.916153300212918e-05, + "loss": 0.315134072303772, + "step": 5910 + }, + { + "epoch": 0.8403122782114976, + "grad_norm": 7.071038722991943, + "learning_rate": 9.916011355571327e-05, + "loss": 0.3368415594100952, + "step": 5920 + }, + { + "epoch": 0.8417317246273953, + "grad_norm": 4.1825056076049805, + "learning_rate": 9.915869410929738e-05, + "loss": 0.3074488162994385, + "step": 5930 + }, + { + "epoch": 0.8431511710432931, + "grad_norm": 6.3160929679870605, + "learning_rate": 9.915727466288148e-05, + "loss": 0.3252119541168213, + "step": 5940 + }, + { + "epoch": 0.8445706174591909, + "grad_norm": 8.007182121276855, + "learning_rate": 9.915585521646559e-05, + "loss": 0.23286638259887696, + "step": 5950 + }, + { + "epoch": 0.8459900638750887, + "grad_norm": 7.93002986907959, + "learning_rate": 9.915443577004968e-05, + "loss": 0.2870266199111938, + "step": 5960 + }, + { + "epoch": 0.8474095102909865, + "grad_norm": 5.426539897918701, + "learning_rate": 9.915301632363379e-05, + "loss": 0.29859611988067625, + "step": 5970 + }, + { + "epoch": 0.8488289567068843, + "grad_norm": 4.294735908508301, + "learning_rate": 9.915159687721788e-05, + "loss": 0.24727118015289307, + "step": 5980 + }, + { + "epoch": 0.8502484031227822, + "grad_norm": 8.501158714294434, + "learning_rate": 9.9150177430802e-05, + "loss": 0.3406102657318115, + "step": 5990 + }, + { + "epoch": 0.8516678495386799, + 
"grad_norm": 8.125472068786621, + "learning_rate": 9.914875798438609e-05, + "loss": 0.3179450273513794, + "step": 6000 + }, + { + "epoch": 0.8516678495386799, + "eval_accuracy": 0.8626565778597317, + "eval_loss": 0.4082823693752289, + "eval_runtime": 33.3539, + "eval_samples_per_second": 471.52, + "eval_steps_per_second": 14.751, + "step": 6000 + }, + { + "epoch": 0.8530872959545777, + "grad_norm": 4.980500221252441, + "learning_rate": 9.914733853797019e-05, + "loss": 0.3588885307312012, + "step": 6010 + }, + { + "epoch": 0.8545067423704755, + "grad_norm": 5.385146617889404, + "learning_rate": 9.91459190915543e-05, + "loss": 0.28512775897979736, + "step": 6020 + }, + { + "epoch": 0.8559261887863733, + "grad_norm": 8.24423599243164, + "learning_rate": 9.91444996451384e-05, + "loss": 0.32922515869140623, + "step": 6030 + }, + { + "epoch": 0.8573456352022711, + "grad_norm": 6.568521499633789, + "learning_rate": 9.914308019872251e-05, + "loss": 0.24458625316619872, + "step": 6040 + }, + { + "epoch": 0.8587650816181689, + "grad_norm": 6.268226146697998, + "learning_rate": 9.914166075230661e-05, + "loss": 0.30663580894470216, + "step": 6050 + }, + { + "epoch": 0.8601845280340668, + "grad_norm": 5.911208152770996, + "learning_rate": 9.91402413058907e-05, + "loss": 0.38018484115600587, + "step": 6060 + }, + { + "epoch": 0.8616039744499645, + "grad_norm": 5.170897483825684, + "learning_rate": 9.91388218594748e-05, + "loss": 0.22591965198516845, + "step": 6070 + }, + { + "epoch": 0.8630234208658624, + "grad_norm": 5.716799736022949, + "learning_rate": 9.913740241305891e-05, + "loss": 0.2626305103302002, + "step": 6080 + }, + { + "epoch": 0.8644428672817601, + "grad_norm": 6.144148349761963, + "learning_rate": 9.913598296664301e-05, + "loss": 0.23459088802337646, + "step": 6090 + }, + { + "epoch": 0.8658623136976579, + "grad_norm": 8.506244659423828, + "learning_rate": 9.913456352022712e-05, + "loss": 0.36330761909484866, + "step": 6100 + }, + { + "epoch": 0.8672817601135557, 
+ "grad_norm": 9.882643699645996, + "learning_rate": 9.913314407381122e-05, + "loss": 0.32826101779937744, + "step": 6110 + }, + { + "epoch": 0.8687012065294535, + "grad_norm": 8.62743091583252, + "learning_rate": 9.913172462739532e-05, + "loss": 0.30355727672576904, + "step": 6120 + }, + { + "epoch": 0.8701206529453513, + "grad_norm": 11.726634979248047, + "learning_rate": 9.913030518097943e-05, + "loss": 0.280806303024292, + "step": 6130 + }, + { + "epoch": 0.8715400993612491, + "grad_norm": 7.7827839851379395, + "learning_rate": 9.912888573456352e-05, + "loss": 0.3389289855957031, + "step": 6140 + }, + { + "epoch": 0.872959545777147, + "grad_norm": 12.07807731628418, + "learning_rate": 9.912746628814764e-05, + "loss": 0.31570281982421877, + "step": 6150 + }, + { + "epoch": 0.8743789921930447, + "grad_norm": 4.949673652648926, + "learning_rate": 9.912604684173173e-05, + "loss": 0.269368839263916, + "step": 6160 + }, + { + "epoch": 0.8757984386089425, + "grad_norm": 6.946098327636719, + "learning_rate": 9.912462739531583e-05, + "loss": 0.33236119747161863, + "step": 6170 + }, + { + "epoch": 0.8772178850248403, + "grad_norm": 7.137246131896973, + "learning_rate": 9.912320794889993e-05, + "loss": 0.3343817710876465, + "step": 6180 + }, + { + "epoch": 0.8786373314407381, + "grad_norm": 4.929990768432617, + "learning_rate": 9.912178850248404e-05, + "loss": 0.23963472843170167, + "step": 6190 + }, + { + "epoch": 0.8800567778566359, + "grad_norm": 10.46869945526123, + "learning_rate": 9.912036905606814e-05, + "loss": 0.2913534641265869, + "step": 6200 + }, + { + "epoch": 0.8814762242725337, + "grad_norm": 7.179393291473389, + "learning_rate": 9.911894960965225e-05, + "loss": 0.27806806564331055, + "step": 6210 + }, + { + "epoch": 0.8828956706884316, + "grad_norm": 5.430668830871582, + "learning_rate": 9.911753016323634e-05, + "loss": 0.2537125587463379, + "step": 6220 + }, + { + "epoch": 0.8843151171043293, + "grad_norm": 7.001239776611328, + "learning_rate": 
9.911611071682044e-05, + "loss": 0.2821568489074707, + "step": 6230 + }, + { + "epoch": 0.8857345635202271, + "grad_norm": 10.218942642211914, + "learning_rate": 9.911469127040455e-05, + "loss": 0.30785112380981444, + "step": 6240 + }, + { + "epoch": 0.8871540099361249, + "grad_norm": 3.9179635047912598, + "learning_rate": 9.911327182398865e-05, + "loss": 0.3376051902770996, + "step": 6250 + }, + { + "epoch": 0.8885734563520227, + "grad_norm": 7.35114049911499, + "learning_rate": 9.911185237757276e-05, + "loss": 0.2029582977294922, + "step": 6260 + }, + { + "epoch": 0.8899929027679205, + "grad_norm": 7.477942943572998, + "learning_rate": 9.911043293115684e-05, + "loss": 0.31639838218688965, + "step": 6270 + }, + { + "epoch": 0.8914123491838183, + "grad_norm": 6.479630470275879, + "learning_rate": 9.910901348474096e-05, + "loss": 0.35874156951904296, + "step": 6280 + }, + { + "epoch": 0.8928317955997161, + "grad_norm": 5.139812469482422, + "learning_rate": 9.910759403832505e-05, + "loss": 0.23642609119415284, + "step": 6290 + }, + { + "epoch": 0.8942512420156139, + "grad_norm": 7.17330265045166, + "learning_rate": 9.910617459190916e-05, + "loss": 0.27939982414245607, + "step": 6300 + }, + { + "epoch": 0.8956706884315118, + "grad_norm": 8.804689407348633, + "learning_rate": 9.910475514549326e-05, + "loss": 0.3722469568252563, + "step": 6310 + }, + { + "epoch": 0.8970901348474095, + "grad_norm": 2.958435297012329, + "learning_rate": 9.910333569907736e-05, + "loss": 0.23576738834381103, + "step": 6320 + }, + { + "epoch": 0.8985095812633073, + "grad_norm": 10.53680419921875, + "learning_rate": 9.910191625266147e-05, + "loss": 0.4027998447418213, + "step": 6330 + }, + { + "epoch": 0.8999290276792051, + "grad_norm": 5.857926368713379, + "learning_rate": 9.910049680624557e-05, + "loss": 0.29457688331604004, + "step": 6340 + }, + { + "epoch": 0.9013484740951029, + "grad_norm": 1.7572773694992065, + "learning_rate": 9.909907735982968e-05, + "loss": 0.2572882890701294, + 
"step": 6350 + }, + { + "epoch": 0.9027679205110007, + "grad_norm": 4.274378299713135, + "learning_rate": 9.909765791341377e-05, + "loss": 0.23681292533874512, + "step": 6360 + }, + { + "epoch": 0.9041873669268985, + "grad_norm": 7.596087455749512, + "learning_rate": 9.909623846699787e-05, + "loss": 0.23812153339385986, + "step": 6370 + }, + { + "epoch": 0.9056068133427964, + "grad_norm": 5.59556770324707, + "learning_rate": 9.909481902058197e-05, + "loss": 0.29871695041656493, + "step": 6380 + }, + { + "epoch": 0.9070262597586941, + "grad_norm": 4.671100616455078, + "learning_rate": 9.909339957416608e-05, + "loss": 0.23768167495727538, + "step": 6390 + }, + { + "epoch": 0.9084457061745919, + "grad_norm": 6.55142068862915, + "learning_rate": 9.909198012775018e-05, + "loss": 0.2650206804275513, + "step": 6400 + }, + { + "epoch": 0.9098651525904897, + "grad_norm": 7.774087429046631, + "learning_rate": 9.909056068133429e-05, + "loss": 0.2898139238357544, + "step": 6410 + }, + { + "epoch": 0.9112845990063875, + "grad_norm": 6.386779308319092, + "learning_rate": 9.908914123491839e-05, + "loss": 0.26163647174835203, + "step": 6420 + }, + { + "epoch": 0.9127040454222853, + "grad_norm": 7.33029317855835, + "learning_rate": 9.908772178850248e-05, + "loss": 0.2447366952896118, + "step": 6430 + }, + { + "epoch": 0.9141234918381831, + "grad_norm": 10.35724925994873, + "learning_rate": 9.90863023420866e-05, + "loss": 0.2560460329055786, + "step": 6440 + }, + { + "epoch": 0.915542938254081, + "grad_norm": 9.2293062210083, + "learning_rate": 9.908488289567069e-05, + "loss": 0.3864759922027588, + "step": 6450 + }, + { + "epoch": 0.9169623846699787, + "grad_norm": 8.472285270690918, + "learning_rate": 9.90834634492548e-05, + "loss": 0.2888746976852417, + "step": 6460 + }, + { + "epoch": 0.9183818310858765, + "grad_norm": 6.22374153137207, + "learning_rate": 9.90820440028389e-05, + "loss": 0.2505399942398071, + "step": 6470 + }, + { + "epoch": 0.9198012775017743, + "grad_norm": 
7.827479839324951, + "learning_rate": 9.9080624556423e-05, + "loss": 0.2327653408050537, + "step": 6480 + }, + { + "epoch": 0.9212207239176721, + "grad_norm": 7.873356819152832, + "learning_rate": 9.90792051100071e-05, + "loss": 0.2565167903900146, + "step": 6490 + }, + { + "epoch": 0.9226401703335699, + "grad_norm": 4.665884494781494, + "learning_rate": 9.90777856635912e-05, + "loss": 0.2404710292816162, + "step": 6500 + }, + { + "epoch": 0.9226401703335699, + "eval_accuracy": 0.9011890379601959, + "eval_loss": 0.29011303186416626, + "eval_runtime": 34.6022, + "eval_samples_per_second": 454.509, + "eval_steps_per_second": 14.219, + "step": 6500 + }, + { + "epoch": 0.9240596167494677, + "grad_norm": 7.10374641418457, + "learning_rate": 9.90763662171753e-05, + "loss": 0.28783435821533204, + "step": 6510 + }, + { + "epoch": 0.9254790631653655, + "grad_norm": 7.5799784660339355, + "learning_rate": 9.907494677075941e-05, + "loss": 0.3219441890716553, + "step": 6520 + }, + { + "epoch": 0.9268985095812633, + "grad_norm": 3.9083335399627686, + "learning_rate": 9.907352732434351e-05, + "loss": 0.2374324083328247, + "step": 6530 + }, + { + "epoch": 0.9283179559971612, + "grad_norm": 9.309243202209473, + "learning_rate": 9.907210787792761e-05, + "loss": 0.2314399242401123, + "step": 6540 + }, + { + "epoch": 0.9297374024130589, + "grad_norm": 5.650235176086426, + "learning_rate": 9.907068843151172e-05, + "loss": 0.2187626600265503, + "step": 6550 + }, + { + "epoch": 0.9311568488289567, + "grad_norm": 5.9835710525512695, + "learning_rate": 9.906926898509582e-05, + "loss": 0.27225399017333984, + "step": 6560 + }, + { + "epoch": 0.9325762952448545, + "grad_norm": 8.403820991516113, + "learning_rate": 9.906784953867993e-05, + "loss": 0.24051570892333984, + "step": 6570 + }, + { + "epoch": 0.9339957416607523, + "grad_norm": 5.456867218017578, + "learning_rate": 9.906643009226401e-05, + "loss": 0.229835844039917, + "step": 6580 + }, + { + "epoch": 0.9354151880766501, + "grad_norm": 
11.34472942352295, + "learning_rate": 9.906501064584812e-05, + "loss": 0.28583712577819825, + "step": 6590 + }, + { + "epoch": 0.9368346344925479, + "grad_norm": 7.0680694580078125, + "learning_rate": 9.906359119943222e-05, + "loss": 0.28688597679138184, + "step": 6600 + }, + { + "epoch": 0.9382540809084458, + "grad_norm": 4.637568950653076, + "learning_rate": 9.906217175301633e-05, + "loss": 0.3234848976135254, + "step": 6610 + }, + { + "epoch": 0.9396735273243435, + "grad_norm": 4.935168743133545, + "learning_rate": 9.906075230660043e-05, + "loss": 0.2546673059463501, + "step": 6620 + }, + { + "epoch": 0.9410929737402413, + "grad_norm": 8.563390731811523, + "learning_rate": 9.905933286018453e-05, + "loss": 0.26501734256744386, + "step": 6630 + }, + { + "epoch": 0.9425124201561391, + "grad_norm": 8.05203914642334, + "learning_rate": 9.905791341376864e-05, + "loss": 0.19906221628189086, + "step": 6640 + }, + { + "epoch": 0.9439318665720369, + "grad_norm": 4.535382270812988, + "learning_rate": 9.905649396735273e-05, + "loss": 0.2355113744735718, + "step": 6650 + }, + { + "epoch": 0.9453513129879347, + "grad_norm": 5.967373371124268, + "learning_rate": 9.905507452093685e-05, + "loss": 0.2591426372528076, + "step": 6660 + }, + { + "epoch": 0.9467707594038325, + "grad_norm": 5.093105792999268, + "learning_rate": 9.905365507452094e-05, + "loss": 0.2508120536804199, + "step": 6670 + }, + { + "epoch": 0.9481902058197303, + "grad_norm": 6.775847911834717, + "learning_rate": 9.905223562810504e-05, + "loss": 0.2802272319793701, + "step": 6680 + }, + { + "epoch": 0.9496096522356281, + "grad_norm": 7.280439376831055, + "learning_rate": 9.905081618168914e-05, + "loss": 0.23689627647399902, + "step": 6690 + }, + { + "epoch": 0.9510290986515259, + "grad_norm": 7.68773078918457, + "learning_rate": 9.904939673527325e-05, + "loss": 0.2927251815795898, + "step": 6700 + }, + { + "epoch": 0.9524485450674237, + "grad_norm": 5.4808831214904785, + "learning_rate": 9.904797728885735e-05, + 
"loss": 0.28672428131103517, + "step": 6710 + }, + { + "epoch": 0.9538679914833215, + "grad_norm": 8.087321281433105, + "learning_rate": 9.904655784244146e-05, + "loss": 0.3129342794418335, + "step": 6720 + }, + { + "epoch": 0.9552874378992193, + "grad_norm": 2.7893686294555664, + "learning_rate": 9.904513839602555e-05, + "loss": 0.22520501613616944, + "step": 6730 + }, + { + "epoch": 0.9567068843151171, + "grad_norm": 10.040759086608887, + "learning_rate": 9.904371894960965e-05, + "loss": 0.2705253601074219, + "step": 6740 + }, + { + "epoch": 0.9581263307310149, + "grad_norm": 3.0198464393615723, + "learning_rate": 9.904229950319376e-05, + "loss": 0.27905032634735105, + "step": 6750 + }, + { + "epoch": 0.9595457771469127, + "grad_norm": 9.044099807739258, + "learning_rate": 9.904088005677786e-05, + "loss": 0.2549771547317505, + "step": 6760 + }, + { + "epoch": 0.9609652235628106, + "grad_norm": 3.4965715408325195, + "learning_rate": 9.903946061036197e-05, + "loss": 0.2617889165878296, + "step": 6770 + }, + { + "epoch": 0.9623846699787083, + "grad_norm": 4.959318161010742, + "learning_rate": 9.903804116394605e-05, + "loss": 0.24190716743469237, + "step": 6780 + }, + { + "epoch": 0.9638041163946061, + "grad_norm": 4.6404314041137695, + "learning_rate": 9.903662171753017e-05, + "loss": 0.29865779876708987, + "step": 6790 + }, + { + "epoch": 0.9652235628105039, + "grad_norm": 6.315147876739502, + "learning_rate": 9.903520227111426e-05, + "loss": 0.2937409162521362, + "step": 6800 + }, + { + "epoch": 0.9666430092264017, + "grad_norm": 6.294488906860352, + "learning_rate": 9.903378282469837e-05, + "loss": 0.28489468097686765, + "step": 6810 + }, + { + "epoch": 0.9680624556422995, + "grad_norm": 6.917492866516113, + "learning_rate": 9.903236337828248e-05, + "loss": 0.18736352920532226, + "step": 6820 + }, + { + "epoch": 0.9694819020581973, + "grad_norm": 6.20442533493042, + "learning_rate": 9.903094393186658e-05, + "loss": 0.24552693367004394, + "step": 6830 + }, + { + 
"epoch": 0.9709013484740951, + "grad_norm": 9.16247844696045, + "learning_rate": 9.902952448545068e-05, + "loss": 0.22968952655792235, + "step": 6840 + }, + { + "epoch": 0.9723207948899929, + "grad_norm": 8.185150146484375, + "learning_rate": 9.902810503903478e-05, + "loss": 0.25458450317382814, + "step": 6850 + }, + { + "epoch": 0.9737402413058907, + "grad_norm": 8.134267807006836, + "learning_rate": 9.902668559261889e-05, + "loss": 0.25451316833496096, + "step": 6860 + }, + { + "epoch": 0.9751596877217885, + "grad_norm": 12.39373779296875, + "learning_rate": 9.902526614620298e-05, + "loss": 0.2887612819671631, + "step": 6870 + }, + { + "epoch": 0.9765791341376863, + "grad_norm": 7.776149272918701, + "learning_rate": 9.90238466997871e-05, + "loss": 0.3695904970169067, + "step": 6880 + }, + { + "epoch": 0.9779985805535841, + "grad_norm": 6.241235256195068, + "learning_rate": 9.902242725337118e-05, + "loss": 0.26552643775939944, + "step": 6890 + }, + { + "epoch": 0.9794180269694819, + "grad_norm": 11.734026908874512, + "learning_rate": 9.902100780695529e-05, + "loss": 0.32755370140075685, + "step": 6900 + }, + { + "epoch": 0.9808374733853797, + "grad_norm": 6.049038887023926, + "learning_rate": 9.90195883605394e-05, + "loss": 0.22059807777404786, + "step": 6910 + }, + { + "epoch": 0.9822569198012775, + "grad_norm": 4.156560897827148, + "learning_rate": 9.901831085876509e-05, + "loss": 0.3507907629013062, + "step": 6920 + }, + { + "epoch": 0.9836763662171752, + "grad_norm": 4.315751552581787, + "learning_rate": 9.901689141234918e-05, + "loss": 0.25436155796051024, + "step": 6930 + }, + { + "epoch": 0.9850958126330731, + "grad_norm": 6.76514196395874, + "learning_rate": 9.90154719659333e-05, + "loss": 0.24831132888793944, + "step": 6940 + }, + { + "epoch": 0.9865152590489709, + "grad_norm": 6.7387261390686035, + "learning_rate": 9.901405251951739e-05, + "loss": 0.23655142784118652, + "step": 6950 + }, + { + "epoch": 0.9879347054648687, + "grad_norm": 
3.8014583587646484, + "learning_rate": 9.901263307310149e-05, + "loss": 0.2415374994277954, + "step": 6960 + }, + { + "epoch": 0.9893541518807665, + "grad_norm": 5.04398775100708, + "learning_rate": 9.90112136266856e-05, + "loss": 0.23744730949401854, + "step": 6970 + }, + { + "epoch": 0.9907735982966643, + "grad_norm": 5.434844017028809, + "learning_rate": 9.90097941802697e-05, + "loss": 0.24512255191802979, + "step": 6980 + }, + { + "epoch": 0.9921930447125621, + "grad_norm": 5.528685092926025, + "learning_rate": 9.900837473385381e-05, + "loss": 0.2296142578125, + "step": 6990 + }, + { + "epoch": 0.99361249112846, + "grad_norm": 5.2856526374816895, + "learning_rate": 9.90069552874379e-05, + "loss": 0.2707331418991089, + "step": 7000 + }, + { + "epoch": 0.99361249112846, + "eval_accuracy": 0.9093914923380174, + "eval_loss": 0.27007216215133667, + "eval_runtime": 33.3907, + "eval_samples_per_second": 470.999, + "eval_steps_per_second": 14.735, + "step": 7000 + }, + { + "epoch": 0.9950319375443577, + "grad_norm": 8.654793739318848, + "learning_rate": 9.9005535841022e-05, + "loss": 0.34286386966705323, + "step": 7010 + }, + { + "epoch": 0.9964513839602555, + "grad_norm": 3.311750888824463, + "learning_rate": 9.90041163946061e-05, + "loss": 0.269917893409729, + "step": 7020 + }, + { + "epoch": 0.9978708303761533, + "grad_norm": 6.643321514129639, + "learning_rate": 9.900269694819021e-05, + "loss": 0.2132892370223999, + "step": 7030 + }, + { + "epoch": 0.9992902767920511, + "grad_norm": 10.397172927856445, + "learning_rate": 9.900127750177431e-05, + "loss": 0.2613171339035034, + "step": 7040 + }, + { + "epoch": 1.000709723207949, + "grad_norm": 6.357808589935303, + "learning_rate": 9.899985805535842e-05, + "loss": 0.2258657455444336, + "step": 7050 + }, + { + "epoch": 1.0021291696238468, + "grad_norm": 6.077082633972168, + "learning_rate": 9.899843860894252e-05, + "loss": 0.20697100162506105, + "step": 7060 + }, + { + "epoch": 1.0035486160397444, + "grad_norm": 
12.1661376953125, + "learning_rate": 9.899701916252661e-05, + "loss": 0.1927890658378601, + "step": 7070 + }, + { + "epoch": 1.0049680624556423, + "grad_norm": 4.968541145324707, + "learning_rate": 9.899559971611073e-05, + "loss": 0.23719356060028077, + "step": 7080 + }, + { + "epoch": 1.0063875088715402, + "grad_norm": 8.79593563079834, + "learning_rate": 9.899418026969482e-05, + "loss": 0.18882639408111573, + "step": 7090 + }, + { + "epoch": 1.0078069552874378, + "grad_norm": 5.142887115478516, + "learning_rate": 9.899276082327893e-05, + "loss": 0.2634677171707153, + "step": 7100 + }, + { + "epoch": 1.0092264017033357, + "grad_norm": 8.761039733886719, + "learning_rate": 9.899134137686302e-05, + "loss": 0.321915602684021, + "step": 7110 + }, + { + "epoch": 1.0106458481192335, + "grad_norm": 3.3865628242492676, + "learning_rate": 9.898992193044713e-05, + "loss": 0.23035690784454346, + "step": 7120 + }, + { + "epoch": 1.0120652945351314, + "grad_norm": 5.229470729827881, + "learning_rate": 9.898850248403123e-05, + "loss": 0.23260829448699952, + "step": 7130 + }, + { + "epoch": 1.013484740951029, + "grad_norm": 6.637743949890137, + "learning_rate": 9.898708303761534e-05, + "loss": 0.29780044555664065, + "step": 7140 + }, + { + "epoch": 1.014904187366927, + "grad_norm": 5.488855838775635, + "learning_rate": 9.898566359119943e-05, + "loss": 0.17786208391189576, + "step": 7150 + }, + { + "epoch": 1.0163236337828248, + "grad_norm": 3.6873295307159424, + "learning_rate": 9.898424414478355e-05, + "loss": 0.16665832996368407, + "step": 7160 + }, + { + "epoch": 1.0177430801987224, + "grad_norm": 3.507009267807007, + "learning_rate": 9.898282469836764e-05, + "loss": 0.2571221351623535, + "step": 7170 + }, + { + "epoch": 1.0191625266146203, + "grad_norm": 3.279927968978882, + "learning_rate": 9.898140525195174e-05, + "loss": 0.2422633171081543, + "step": 7180 + }, + { + "epoch": 1.0205819730305181, + "grad_norm": 7.186861991882324, + "learning_rate": 9.897998580553585e-05, + 
"loss": 0.2877654552459717, + "step": 7190 + }, + { + "epoch": 1.022001419446416, + "grad_norm": 8.821130752563477, + "learning_rate": 9.897856635911995e-05, + "loss": 0.21563093662261962, + "step": 7200 + }, + { + "epoch": 1.0234208658623136, + "grad_norm": 1.849163293838501, + "learning_rate": 9.897714691270406e-05, + "loss": 0.21513009071350098, + "step": 7210 + }, + { + "epoch": 1.0248403122782115, + "grad_norm": 7.898414611816406, + "learning_rate": 9.897572746628814e-05, + "loss": 0.24002442359924317, + "step": 7220 + }, + { + "epoch": 1.0262597586941093, + "grad_norm": 8.41958236694336, + "learning_rate": 9.897430801987225e-05, + "loss": 0.22358598709106445, + "step": 7230 + }, + { + "epoch": 1.027679205110007, + "grad_norm": 5.978959560394287, + "learning_rate": 9.897288857345635e-05, + "loss": 0.24321112632751465, + "step": 7240 + }, + { + "epoch": 1.0290986515259049, + "grad_norm": 7.758601665496826, + "learning_rate": 9.897146912704046e-05, + "loss": 0.2519962310791016, + "step": 7250 + }, + { + "epoch": 1.0305180979418027, + "grad_norm": 6.9067487716674805, + "learning_rate": 9.897004968062456e-05, + "loss": 0.22714946269989014, + "step": 7260 + }, + { + "epoch": 1.0319375443577006, + "grad_norm": 7.974116802215576, + "learning_rate": 9.896863023420866e-05, + "loss": 0.22177364826202392, + "step": 7270 + }, + { + "epoch": 1.0333569907735982, + "grad_norm": 2.706422805786133, + "learning_rate": 9.896721078779277e-05, + "loss": 0.19734153747558594, + "step": 7280 + }, + { + "epoch": 1.034776437189496, + "grad_norm": 10.539275169372559, + "learning_rate": 9.896579134137687e-05, + "loss": 0.2604410648345947, + "step": 7290 + }, + { + "epoch": 1.036195883605394, + "grad_norm": 6.023902893066406, + "learning_rate": 9.896437189496098e-05, + "loss": 0.23188574314117433, + "step": 7300 + }, + { + "epoch": 1.0376153300212918, + "grad_norm": 4.0170512199401855, + "learning_rate": 9.896295244854507e-05, + "loss": 0.20175492763519287, + "step": 7310 + }, + { + 
"epoch": 1.0390347764371894, + "grad_norm": 4.9612579345703125, + "learning_rate": 9.896153300212917e-05, + "loss": 0.2120590925216675, + "step": 7320 + }, + { + "epoch": 1.0404542228530873, + "grad_norm": 4.898397922515869, + "learning_rate": 9.896011355571327e-05, + "loss": 0.22397477626800538, + "step": 7330 + }, + { + "epoch": 1.0418736692689852, + "grad_norm": 7.394660472869873, + "learning_rate": 9.895869410929738e-05, + "loss": 0.2079904556274414, + "step": 7340 + }, + { + "epoch": 1.0432931156848828, + "grad_norm": 3.7839152812957764, + "learning_rate": 9.895727466288148e-05, + "loss": 0.1861090302467346, + "step": 7350 + }, + { + "epoch": 1.0447125621007807, + "grad_norm": 6.4003496170043945, + "learning_rate": 9.895585521646559e-05, + "loss": 0.21509413719177245, + "step": 7360 + }, + { + "epoch": 1.0461320085166785, + "grad_norm": 5.966845989227295, + "learning_rate": 9.895443577004969e-05, + "loss": 0.22056474685668945, + "step": 7370 + }, + { + "epoch": 1.0475514549325764, + "grad_norm": 3.580226182937622, + "learning_rate": 9.895301632363378e-05, + "loss": 0.2572075128555298, + "step": 7380 + }, + { + "epoch": 1.048970901348474, + "grad_norm": 7.922166347503662, + "learning_rate": 9.89515968772179e-05, + "loss": 0.26929004192352296, + "step": 7390 + }, + { + "epoch": 1.050390347764372, + "grad_norm": 8.884166717529297, + "learning_rate": 9.895017743080199e-05, + "loss": 0.23953988552093505, + "step": 7400 + }, + { + "epoch": 1.0518097941802698, + "grad_norm": 13.472792625427246, + "learning_rate": 9.89487579843861e-05, + "loss": 0.26428995132446287, + "step": 7410 + }, + { + "epoch": 1.0532292405961674, + "grad_norm": 5.455354690551758, + "learning_rate": 9.894733853797019e-05, + "loss": 0.22658278942108154, + "step": 7420 + }, + { + "epoch": 1.0546486870120653, + "grad_norm": 12.143173217773438, + "learning_rate": 9.89459190915543e-05, + "loss": 0.2838724136352539, + "step": 7430 + }, + { + "epoch": 1.0560681334279631, + "grad_norm": 
12.741036415100098, + "learning_rate": 9.89444996451384e-05, + "loss": 0.22514543533325196, + "step": 7440 + }, + { + "epoch": 1.057487579843861, + "grad_norm": 3.3944201469421387, + "learning_rate": 9.89430801987225e-05, + "loss": 0.2505282163619995, + "step": 7450 + }, + { + "epoch": 1.0589070262597586, + "grad_norm": 4.490118503570557, + "learning_rate": 9.89416607523066e-05, + "loss": 0.24113750457763672, + "step": 7460 + }, + { + "epoch": 1.0603264726756565, + "grad_norm": 3.8860394954681396, + "learning_rate": 9.89402413058907e-05, + "loss": 0.19650124311447142, + "step": 7470 + }, + { + "epoch": 1.0617459190915544, + "grad_norm": 8.089933395385742, + "learning_rate": 9.893882185947481e-05, + "loss": 0.20081098079681398, + "step": 7480 + }, + { + "epoch": 1.063165365507452, + "grad_norm": 5.854043483734131, + "learning_rate": 9.893740241305891e-05, + "loss": 0.19387896060943605, + "step": 7490 + }, + { + "epoch": 1.0645848119233499, + "grad_norm": 3.3195252418518066, + "learning_rate": 9.893598296664302e-05, + "loss": 0.1918407201766968, + "step": 7500 + }, + { + "epoch": 1.0645848119233499, + "eval_accuracy": 0.9091371526673873, + "eval_loss": 0.25946471095085144, + "eval_runtime": 32.8002, + "eval_samples_per_second": 479.478, + "eval_steps_per_second": 15.0, + "step": 7500 + }, + { + "epoch": 1.0660042583392477, + "grad_norm": 7.044492244720459, + "learning_rate": 9.893456352022712e-05, + "loss": 0.18088626861572266, + "step": 7510 + }, + { + "epoch": 1.0674237047551456, + "grad_norm": 2.1477725505828857, + "learning_rate": 9.893314407381123e-05, + "loss": 0.25041606426239016, + "step": 7520 + }, + { + "epoch": 1.0688431511710432, + "grad_norm": 5.232922077178955, + "learning_rate": 9.893172462739531e-05, + "loss": 0.13164312839508058, + "step": 7530 + }, + { + "epoch": 1.070262597586941, + "grad_norm": 7.097192764282227, + "learning_rate": 9.893030518097942e-05, + "loss": 0.2210529088973999, + "step": 7540 + }, + { + "epoch": 1.071682044002839, + 
"grad_norm": 6.555529594421387, + "learning_rate": 9.892888573456352e-05, + "loss": 0.22583472728729248, + "step": 7550 + }, + { + "epoch": 1.0731014904187366, + "grad_norm": 4.672628879547119, + "learning_rate": 9.892746628814763e-05, + "loss": 0.2420278787612915, + "step": 7560 + }, + { + "epoch": 1.0745209368346345, + "grad_norm": 5.684006690979004, + "learning_rate": 9.892604684173174e-05, + "loss": 0.16603726148605347, + "step": 7570 + }, + { + "epoch": 1.0759403832505323, + "grad_norm": 8.538924217224121, + "learning_rate": 9.892462739531582e-05, + "loss": 0.22756731510162354, + "step": 7580 + }, + { + "epoch": 1.0773598296664302, + "grad_norm": 10.23405647277832, + "learning_rate": 9.892320794889994e-05, + "loss": 0.17195621728897095, + "step": 7590 + }, + { + "epoch": 1.0787792760823278, + "grad_norm": 3.4394562244415283, + "learning_rate": 9.892178850248403e-05, + "loss": 0.1631350874900818, + "step": 7600 + }, + { + "epoch": 1.0801987224982257, + "grad_norm": 9.240316390991211, + "learning_rate": 9.892036905606814e-05, + "loss": 0.2647270917892456, + "step": 7610 + }, + { + "epoch": 1.0816181689141235, + "grad_norm": 11.555622100830078, + "learning_rate": 9.891894960965224e-05, + "loss": 0.26429762840271, + "step": 7620 + }, + { + "epoch": 1.0830376153300212, + "grad_norm": 2.4831769466400146, + "learning_rate": 9.891753016323634e-05, + "loss": 0.29258711338043214, + "step": 7630 + }, + { + "epoch": 1.084457061745919, + "grad_norm": 4.935022830963135, + "learning_rate": 9.891611071682044e-05, + "loss": 0.21570188999176027, + "step": 7640 + }, + { + "epoch": 1.085876508161817, + "grad_norm": 11.602439880371094, + "learning_rate": 9.891469127040455e-05, + "loss": 0.32711737155914306, + "step": 7650 + }, + { + "epoch": 1.0872959545777148, + "grad_norm": 6.064338207244873, + "learning_rate": 9.891327182398866e-05, + "loss": 0.226470947265625, + "step": 7660 + }, + { + "epoch": 1.0887154009936124, + "grad_norm": 5.629254341125488, + "learning_rate": 
9.891185237757276e-05, + "loss": 0.1874476909637451, + "step": 7670 + }, + { + "epoch": 1.0901348474095103, + "grad_norm": 6.994508743286133, + "learning_rate": 9.891043293115685e-05, + "loss": 0.2323138952255249, + "step": 7680 + }, + { + "epoch": 1.0915542938254081, + "grad_norm": 7.654874324798584, + "learning_rate": 9.890901348474095e-05, + "loss": 0.267806077003479, + "step": 7690 + }, + { + "epoch": 1.0929737402413058, + "grad_norm": 2.5339603424072266, + "learning_rate": 9.890759403832506e-05, + "loss": 0.17415390014648438, + "step": 7700 + }, + { + "epoch": 1.0943931866572036, + "grad_norm": 9.036078453063965, + "learning_rate": 9.890617459190916e-05, + "loss": 0.26232335567474363, + "step": 7710 + }, + { + "epoch": 1.0958126330731015, + "grad_norm": 8.1493558883667, + "learning_rate": 9.890475514549327e-05, + "loss": 0.26018438339233396, + "step": 7720 + }, + { + "epoch": 1.0972320794889994, + "grad_norm": 4.394131660461426, + "learning_rate": 9.890333569907735e-05, + "loss": 0.20033717155456543, + "step": 7730 + }, + { + "epoch": 1.098651525904897, + "grad_norm": 7.311230659484863, + "learning_rate": 9.890191625266146e-05, + "loss": 0.2336057662963867, + "step": 7740 + }, + { + "epoch": 1.1000709723207949, + "grad_norm": 3.716153621673584, + "learning_rate": 9.890049680624556e-05, + "loss": 0.21649951934814454, + "step": 7750 + }, + { + "epoch": 1.1014904187366927, + "grad_norm": 5.747766017913818, + "learning_rate": 9.889907735982967e-05, + "loss": 0.21761865615844728, + "step": 7760 + }, + { + "epoch": 1.1029098651525904, + "grad_norm": 2.6889519691467285, + "learning_rate": 9.889765791341378e-05, + "loss": 0.2489168405532837, + "step": 7770 + }, + { + "epoch": 1.1043293115684882, + "grad_norm": 6.918911933898926, + "learning_rate": 9.889623846699787e-05, + "loss": 0.22506451606750488, + "step": 7780 + }, + { + "epoch": 1.105748757984386, + "grad_norm": 6.129018783569336, + "learning_rate": 9.889481902058198e-05, + "loss": 0.22557535171508789, + "step": 
7790 + }, + { + "epoch": 1.107168204400284, + "grad_norm": 6.179121017456055, + "learning_rate": 9.889339957416608e-05, + "loss": 0.20877602100372314, + "step": 7800 + }, + { + "epoch": 1.1085876508161816, + "grad_norm": 4.490073204040527, + "learning_rate": 9.889198012775019e-05, + "loss": 0.24456796646118165, + "step": 7810 + }, + { + "epoch": 1.1100070972320795, + "grad_norm": 11.580991744995117, + "learning_rate": 9.889056068133428e-05, + "loss": 0.2545257806777954, + "step": 7820 + }, + { + "epoch": 1.1114265436479773, + "grad_norm": 5.933578968048096, + "learning_rate": 9.88891412349184e-05, + "loss": 0.20906269550323486, + "step": 7830 + }, + { + "epoch": 1.1128459900638752, + "grad_norm": 8.964847564697266, + "learning_rate": 9.888772178850248e-05, + "loss": 0.21426281929016114, + "step": 7840 + }, + { + "epoch": 1.1142654364797728, + "grad_norm": 3.047978401184082, + "learning_rate": 9.888630234208659e-05, + "loss": 0.20127902030944825, + "step": 7850 + }, + { + "epoch": 1.1156848828956707, + "grad_norm": 11.52719783782959, + "learning_rate": 9.88848828956707e-05, + "loss": 0.23301458358764648, + "step": 7860 + }, + { + "epoch": 1.1171043293115686, + "grad_norm": 4.898934364318848, + "learning_rate": 9.88834634492548e-05, + "loss": 0.26660704612731934, + "step": 7870 + }, + { + "epoch": 1.1185237757274662, + "grad_norm": 6.535075664520264, + "learning_rate": 9.888204400283891e-05, + "loss": 0.2355792284011841, + "step": 7880 + }, + { + "epoch": 1.119943222143364, + "grad_norm": 6.307318687438965, + "learning_rate": 9.888062455642299e-05, + "loss": 0.20682175159454347, + "step": 7890 + }, + { + "epoch": 1.121362668559262, + "grad_norm": 3.9123454093933105, + "learning_rate": 9.88792051100071e-05, + "loss": 0.3205126762390137, + "step": 7900 + }, + { + "epoch": 1.1227821149751598, + "grad_norm": 9.152158737182617, + "learning_rate": 9.88777856635912e-05, + "loss": 0.2413860082626343, + "step": 7910 + }, + { + "epoch": 1.1242015613910574, + "grad_norm": 
9.178197860717773, + "learning_rate": 9.887636621717531e-05, + "loss": 0.32107110023498536, + "step": 7920 + }, + { + "epoch": 1.1256210078069553, + "grad_norm": 8.382686614990234, + "learning_rate": 9.887494677075941e-05, + "loss": 0.26145339012145996, + "step": 7930 + }, + { + "epoch": 1.1270404542228531, + "grad_norm": 6.847768306732178, + "learning_rate": 9.88735273243435e-05, + "loss": 0.21859989166259766, + "step": 7940 + }, + { + "epoch": 1.1284599006387508, + "grad_norm": 3.770111560821533, + "learning_rate": 9.887210787792762e-05, + "loss": 0.13420095443725585, + "step": 7950 + }, + { + "epoch": 1.1298793470546487, + "grad_norm": 7.4002509117126465, + "learning_rate": 9.887068843151171e-05, + "loss": 0.18695064783096313, + "step": 7960 + }, + { + "epoch": 1.1312987934705465, + "grad_norm": 4.0712761878967285, + "learning_rate": 9.886926898509583e-05, + "loss": 0.20656538009643555, + "step": 7970 + }, + { + "epoch": 1.1327182398864444, + "grad_norm": 4.4091291427612305, + "learning_rate": 9.886784953867992e-05, + "loss": 0.28663394451141355, + "step": 7980 + }, + { + "epoch": 1.134137686302342, + "grad_norm": 10.553000450134277, + "learning_rate": 9.886643009226402e-05, + "loss": 0.319093132019043, + "step": 7990 + }, + { + "epoch": 1.1355571327182399, + "grad_norm": 6.1367597579956055, + "learning_rate": 9.886501064584812e-05, + "loss": 0.19342881441116333, + "step": 8000 + }, + { + "epoch": 1.1355571327182399, + "eval_accuracy": 0.9207731925987156, + "eval_loss": 0.24032267928123474, + "eval_runtime": 32.4949, + "eval_samples_per_second": 483.984, + "eval_steps_per_second": 15.141, + "step": 8000 + }, + { + "epoch": 1.1369765791341377, + "grad_norm": 2.405918598175049, + "learning_rate": 9.886359119943223e-05, + "loss": 0.22856481075286866, + "step": 8010 + }, + { + "epoch": 1.1383960255500356, + "grad_norm": 3.4976019859313965, + "learning_rate": 9.886217175301633e-05, + "loss": 0.18118438720703126, + "step": 8020 + }, + { + "epoch": 1.1398154719659332, 
+ "grad_norm": 6.432300567626953, + "learning_rate": 9.886075230660044e-05, + "loss": 0.21989898681640624, + "step": 8030 + }, + { + "epoch": 1.141234918381831, + "grad_norm": 8.299015045166016, + "learning_rate": 9.885933286018453e-05, + "loss": 0.18632423877716064, + "step": 8040 + }, + { + "epoch": 1.142654364797729, + "grad_norm": 4.741350173950195, + "learning_rate": 9.885791341376863e-05, + "loss": 0.3003889799118042, + "step": 8050 + }, + { + "epoch": 1.1440738112136266, + "grad_norm": 2.561021327972412, + "learning_rate": 9.885649396735274e-05, + "loss": 0.20989477634429932, + "step": 8060 + }, + { + "epoch": 1.1454932576295245, + "grad_norm": 4.419784069061279, + "learning_rate": 9.885507452093684e-05, + "loss": 0.20898723602294922, + "step": 8070 + }, + { + "epoch": 1.1469127040454223, + "grad_norm": 4.329728603363037, + "learning_rate": 9.885365507452095e-05, + "loss": 0.191938316822052, + "step": 8080 + }, + { + "epoch": 1.1483321504613202, + "grad_norm": 5.096283912658691, + "learning_rate": 9.885223562810503e-05, + "loss": 0.21612834930419922, + "step": 8090 + }, + { + "epoch": 1.1497515968772178, + "grad_norm": 7.623912811279297, + "learning_rate": 9.885081618168915e-05, + "loss": 0.2056267261505127, + "step": 8100 + }, + { + "epoch": 1.1511710432931157, + "grad_norm": 5.211782455444336, + "learning_rate": 9.884939673527324e-05, + "loss": 0.2458388090133667, + "step": 8110 + }, + { + "epoch": 1.1525904897090136, + "grad_norm": 4.73144006729126, + "learning_rate": 9.884797728885735e-05, + "loss": 0.2795632123947144, + "step": 8120 + }, + { + "epoch": 1.1540099361249112, + "grad_norm": 4.658935546875, + "learning_rate": 9.884655784244145e-05, + "loss": 0.19132717847824096, + "step": 8130 + }, + { + "epoch": 1.155429382540809, + "grad_norm": 2.4226841926574707, + "learning_rate": 9.884513839602555e-05, + "loss": 0.2345660448074341, + "step": 8140 + }, + { + "epoch": 1.156848828956707, + "grad_norm": 4.741151809692383, + "learning_rate": 
9.884371894960966e-05, + "loss": 0.16295211315155028, + "step": 8150 + }, + { + "epoch": 1.1582682753726048, + "grad_norm": 5.364559173583984, + "learning_rate": 9.884229950319376e-05, + "loss": 0.32001848220825196, + "step": 8160 + }, + { + "epoch": 1.1596877217885024, + "grad_norm": 5.700736045837402, + "learning_rate": 9.884088005677787e-05, + "loss": 0.2149799346923828, + "step": 8170 + }, + { + "epoch": 1.1611071682044003, + "grad_norm": 8.003674507141113, + "learning_rate": 9.883946061036197e-05, + "loss": 0.1882821202278137, + "step": 8180 + }, + { + "epoch": 1.1625266146202982, + "grad_norm": 4.5582122802734375, + "learning_rate": 9.883804116394608e-05, + "loss": 0.21344914436340331, + "step": 8190 + }, + { + "epoch": 1.1639460610361958, + "grad_norm": 7.819937229156494, + "learning_rate": 9.883662171753016e-05, + "loss": 0.20212192535400392, + "step": 8200 + }, + { + "epoch": 1.1653655074520937, + "grad_norm": 4.706314563751221, + "learning_rate": 9.883520227111427e-05, + "loss": 0.23133435249328613, + "step": 8210 + }, + { + "epoch": 1.1667849538679915, + "grad_norm": 6.7971343994140625, + "learning_rate": 9.883378282469837e-05, + "loss": 0.2259516477584839, + "step": 8220 + }, + { + "epoch": 1.1682044002838894, + "grad_norm": 6.324117183685303, + "learning_rate": 9.883236337828248e-05, + "loss": 0.2526458024978638, + "step": 8230 + }, + { + "epoch": 1.169623846699787, + "grad_norm": 11.824000358581543, + "learning_rate": 9.883094393186658e-05, + "loss": 0.28786749839782716, + "step": 8240 + }, + { + "epoch": 1.171043293115685, + "grad_norm": 6.5561089515686035, + "learning_rate": 9.882952448545067e-05, + "loss": 0.2411046028137207, + "step": 8250 + }, + { + "epoch": 1.1724627395315828, + "grad_norm": 9.257662773132324, + "learning_rate": 9.882810503903479e-05, + "loss": 0.2078631639480591, + "step": 8260 + }, + { + "epoch": 1.1738821859474804, + "grad_norm": 6.388674736022949, + "learning_rate": 9.882668559261888e-05, + "loss": 0.2299574851989746, + 
"step": 8270 + }, + { + "epoch": 1.1753016323633783, + "grad_norm": 5.7360992431640625, + "learning_rate": 9.8825266146203e-05, + "loss": 0.18881726264953613, + "step": 8280 + }, + { + "epoch": 1.1767210787792761, + "grad_norm": 6.240981578826904, + "learning_rate": 9.882384669978709e-05, + "loss": 0.1505158066749573, + "step": 8290 + }, + { + "epoch": 1.178140525195174, + "grad_norm": 5.832661151885986, + "learning_rate": 9.882242725337119e-05, + "loss": 0.22867400646209718, + "step": 8300 + }, + { + "epoch": 1.1795599716110716, + "grad_norm": 10.773929595947266, + "learning_rate": 9.882100780695529e-05, + "loss": 0.1888264536857605, + "step": 8310 + }, + { + "epoch": 1.1809794180269695, + "grad_norm": 3.489490509033203, + "learning_rate": 9.88195883605394e-05, + "loss": 0.1748473525047302, + "step": 8320 + }, + { + "epoch": 1.1823988644428673, + "grad_norm": 5.332619667053223, + "learning_rate": 9.88181689141235e-05, + "loss": 0.20995078086853028, + "step": 8330 + }, + { + "epoch": 1.183818310858765, + "grad_norm": 4.1643147468566895, + "learning_rate": 9.88167494677076e-05, + "loss": 0.17949424982070922, + "step": 8340 + }, + { + "epoch": 1.1852377572746629, + "grad_norm": 5.263898849487305, + "learning_rate": 9.88153300212917e-05, + "loss": 0.17099075317382811, + "step": 8350 + }, + { + "epoch": 1.1866572036905607, + "grad_norm": 10.222403526306152, + "learning_rate": 9.88139105748758e-05, + "loss": 0.163385272026062, + "step": 8360 + }, + { + "epoch": 1.1880766501064586, + "grad_norm": 4.657668113708496, + "learning_rate": 9.881249112845991e-05, + "loss": 0.2960475444793701, + "step": 8370 + }, + { + "epoch": 1.1894960965223562, + "grad_norm": 4.420619964599609, + "learning_rate": 9.881107168204401e-05, + "loss": 0.1871565818786621, + "step": 8380 + }, + { + "epoch": 1.190915542938254, + "grad_norm": 6.741722583770752, + "learning_rate": 9.880965223562812e-05, + "loss": 0.18152236938476562, + "step": 8390 + }, + { + "epoch": 1.192334989354152, + "grad_norm": 
7.203516483306885, + "learning_rate": 9.88082327892122e-05, + "loss": 0.21214077472686768, + "step": 8400 + }, + { + "epoch": 1.1937544357700496, + "grad_norm": 4.927282810211182, + "learning_rate": 9.880681334279631e-05, + "loss": 0.2104212999343872, + "step": 8410 + }, + { + "epoch": 1.1951738821859474, + "grad_norm": 5.8592023849487305, + "learning_rate": 9.880539389638041e-05, + "loss": 0.2139230728149414, + "step": 8420 + }, + { + "epoch": 1.1965933286018453, + "grad_norm": 7.09868860244751, + "learning_rate": 9.880397444996452e-05, + "loss": 0.1821369171142578, + "step": 8430 + }, + { + "epoch": 1.1980127750177432, + "grad_norm": 3.22680401802063, + "learning_rate": 9.880255500354862e-05, + "loss": 0.20524086952209472, + "step": 8440 + }, + { + "epoch": 1.1994322214336408, + "grad_norm": 6.953636169433594, + "learning_rate": 9.880113555713272e-05, + "loss": 0.12908190488815308, + "step": 8450 + }, + { + "epoch": 1.2008516678495387, + "grad_norm": 3.305361032485962, + "learning_rate": 9.879971611071683e-05, + "loss": 0.21676282882690429, + "step": 8460 + }, + { + "epoch": 1.2022711142654365, + "grad_norm": 5.03612756729126, + "learning_rate": 9.879829666430093e-05, + "loss": 0.21339573860168456, + "step": 8470 + }, + { + "epoch": 1.2036905606813342, + "grad_norm": 8.03529167175293, + "learning_rate": 9.879687721788504e-05, + "loss": 0.22714192867279054, + "step": 8480 + }, + { + "epoch": 1.205110007097232, + "grad_norm": 11.267200469970703, + "learning_rate": 9.879545777146913e-05, + "loss": 0.2318274736404419, + "step": 8490 + }, + { + "epoch": 1.20652945351313, + "grad_norm": 4.298351764678955, + "learning_rate": 9.879403832505323e-05, + "loss": 0.13804138898849488, + "step": 8500 + }, + { + "epoch": 1.20652945351313, + "eval_accuracy": 0.9154320595154829, + "eval_loss": 0.2389156073331833, + "eval_runtime": 32.8287, + "eval_samples_per_second": 479.062, + "eval_steps_per_second": 14.987, + "step": 8500 + }, + { + "epoch": 1.2079488999290278, + "grad_norm": 
7.828441619873047, + "learning_rate": 9.879261887863733e-05, + "loss": 0.22812976837158203, + "step": 8510 + }, + { + "epoch": 1.2093683463449254, + "grad_norm": 6.791322708129883, + "learning_rate": 9.879119943222144e-05, + "loss": 0.2314612865447998, + "step": 8520 + }, + { + "epoch": 1.2107877927608233, + "grad_norm": 2.5891473293304443, + "learning_rate": 9.878977998580554e-05, + "loss": 0.2156294107437134, + "step": 8530 + }, + { + "epoch": 1.2122072391767211, + "grad_norm": 8.005664825439453, + "learning_rate": 9.878836053938965e-05, + "loss": 0.2180927038192749, + "step": 8540 + }, + { + "epoch": 1.2136266855926188, + "grad_norm": 4.849853515625, + "learning_rate": 9.878694109297374e-05, + "loss": 0.2122575521469116, + "step": 8550 + }, + { + "epoch": 1.2150461320085166, + "grad_norm": 2.7616207599639893, + "learning_rate": 9.878552164655784e-05, + "loss": 0.17834146022796632, + "step": 8560 + }, + { + "epoch": 1.2164655784244145, + "grad_norm": 5.352903366088867, + "learning_rate": 9.878410220014195e-05, + "loss": 0.13497724533081054, + "step": 8570 + }, + { + "epoch": 1.2178850248403124, + "grad_norm": 8.255563735961914, + "learning_rate": 9.878268275372605e-05, + "loss": 0.19454526901245117, + "step": 8580 + }, + { + "epoch": 1.21930447125621, + "grad_norm": 3.5060651302337646, + "learning_rate": 9.878126330731016e-05, + "loss": 0.23703739643096924, + "step": 8590 + }, + { + "epoch": 1.2207239176721079, + "grad_norm": 5.917641639709473, + "learning_rate": 9.877984386089426e-05, + "loss": 0.1788935661315918, + "step": 8600 + }, + { + "epoch": 1.2221433640880057, + "grad_norm": 7.5726542472839355, + "learning_rate": 9.877842441447836e-05, + "loss": 0.1879301905632019, + "step": 8610 + }, + { + "epoch": 1.2235628105039034, + "grad_norm": 6.313500881195068, + "learning_rate": 9.877700496806245e-05, + "loss": 0.19519026279449464, + "step": 8620 + }, + { + "epoch": 1.2249822569198012, + "grad_norm": 6.073189735412598, + "learning_rate": 9.877558552164656e-05, + 
"loss": 0.16100149154663085, + "step": 8630 + }, + { + "epoch": 1.226401703335699, + "grad_norm": 9.31675910949707, + "learning_rate": 9.877416607523066e-05, + "loss": 0.24087250232696533, + "step": 8640 + }, + { + "epoch": 1.227821149751597, + "grad_norm": 6.469115734100342, + "learning_rate": 9.877274662881477e-05, + "loss": 0.15760414600372313, + "step": 8650 + }, + { + "epoch": 1.2292405961674946, + "grad_norm": 5.7666192054748535, + "learning_rate": 9.877132718239887e-05, + "loss": 0.2261284589767456, + "step": 8660 + }, + { + "epoch": 1.2306600425833925, + "grad_norm": 7.881688117980957, + "learning_rate": 9.876990773598297e-05, + "loss": 0.22792091369628906, + "step": 8670 + }, + { + "epoch": 1.2320794889992903, + "grad_norm": 4.771458625793457, + "learning_rate": 9.876848828956708e-05, + "loss": 0.21116392612457274, + "step": 8680 + }, + { + "epoch": 1.233498935415188, + "grad_norm": 9.804439544677734, + "learning_rate": 9.876706884315118e-05, + "loss": 0.25815906524658205, + "step": 8690 + }, + { + "epoch": 1.2349183818310858, + "grad_norm": 3.326082229614258, + "learning_rate": 9.876564939673529e-05, + "loss": 0.21468789577484132, + "step": 8700 + }, + { + "epoch": 1.2363378282469837, + "grad_norm": 3.82004714012146, + "learning_rate": 9.876422995031937e-05, + "loss": 0.17646214962005616, + "step": 8710 + }, + { + "epoch": 1.2377572746628815, + "grad_norm": 7.979610443115234, + "learning_rate": 9.876281050390348e-05, + "loss": 0.23217053413391114, + "step": 8720 + }, + { + "epoch": 1.2391767210787792, + "grad_norm": 6.828559398651123, + "learning_rate": 9.876139105748758e-05, + "loss": 0.226235294342041, + "step": 8730 + }, + { + "epoch": 1.240596167494677, + "grad_norm": 7.083154678344727, + "learning_rate": 9.875997161107169e-05, + "loss": 0.2136064052581787, + "step": 8740 + }, + { + "epoch": 1.242015613910575, + "grad_norm": 8.167536735534668, + "learning_rate": 9.875855216465579e-05, + "loss": 0.20408027172088622, + "step": 8750 + }, + { + "epoch": 
1.2434350603264726, + "grad_norm": 7.635597229003906, + "learning_rate": 9.875713271823988e-05, + "loss": 0.2205681324005127, + "step": 8760 + }, + { + "epoch": 1.2448545067423704, + "grad_norm": 6.944504737854004, + "learning_rate": 9.8755713271824e-05, + "loss": 0.14819756746292115, + "step": 8770 + }, + { + "epoch": 1.2462739531582683, + "grad_norm": 7.144880771636963, + "learning_rate": 9.875429382540809e-05, + "loss": 0.25865755081176756, + "step": 8780 + }, + { + "epoch": 1.2476933995741661, + "grad_norm": 4.50839900970459, + "learning_rate": 9.87528743789922e-05, + "loss": 0.19764204025268556, + "step": 8790 + }, + { + "epoch": 1.2491128459900638, + "grad_norm": 3.0644021034240723, + "learning_rate": 9.87514549325763e-05, + "loss": 0.23454864025115968, + "step": 8800 + }, + { + "epoch": 1.2505322924059616, + "grad_norm": 6.562272548675537, + "learning_rate": 9.87500354861604e-05, + "loss": 0.2683814525604248, + "step": 8810 + }, + { + "epoch": 1.2519517388218595, + "grad_norm": 4.825582027435303, + "learning_rate": 9.87486160397445e-05, + "loss": 0.2111285924911499, + "step": 8820 + }, + { + "epoch": 1.2533711852377571, + "grad_norm": 5.02101469039917, + "learning_rate": 9.87471965933286e-05, + "loss": 0.20650248527526854, + "step": 8830 + }, + { + "epoch": 1.254790631653655, + "grad_norm": 6.4850754737854, + "learning_rate": 9.87457771469127e-05, + "loss": 0.18662099838256835, + "step": 8840 + }, + { + "epoch": 1.2562100780695529, + "grad_norm": 6.745723724365234, + "learning_rate": 9.874435770049682e-05, + "loss": 0.12750645875930786, + "step": 8850 + }, + { + "epoch": 1.2576295244854507, + "grad_norm": 10.856019973754883, + "learning_rate": 9.874293825408091e-05, + "loss": 0.22051913738250734, + "step": 8860 + }, + { + "epoch": 1.2590489709013486, + "grad_norm": 7.022629737854004, + "learning_rate": 9.874151880766501e-05, + "loss": 0.2626792907714844, + "step": 8870 + }, + { + "epoch": 1.2604684173172462, + "grad_norm": 8.997479438781738, + 
"learning_rate": 9.874009936124912e-05, + "loss": 0.22494235038757324, + "step": 8880 + }, + { + "epoch": 1.261887863733144, + "grad_norm": 8.640801429748535, + "learning_rate": 9.873867991483322e-05, + "loss": 0.21826319694519042, + "step": 8890 + }, + { + "epoch": 1.2633073101490417, + "grad_norm": 4.579946517944336, + "learning_rate": 9.873726046841733e-05, + "loss": 0.18379125595092774, + "step": 8900 + }, + { + "epoch": 1.2647267565649396, + "grad_norm": 6.971579074859619, + "learning_rate": 9.873584102200143e-05, + "loss": 0.23222970962524414, + "step": 8910 + }, + { + "epoch": 1.2661462029808375, + "grad_norm": 6.197728633880615, + "learning_rate": 9.873442157558552e-05, + "loss": 0.23273870944976807, + "step": 8920 + }, + { + "epoch": 1.2675656493967353, + "grad_norm": 9.468696594238281, + "learning_rate": 9.873300212916962e-05, + "loss": 0.18107137680053711, + "step": 8930 + }, + { + "epoch": 1.2689850958126332, + "grad_norm": 3.7539901733398438, + "learning_rate": 9.873158268275373e-05, + "loss": 0.1382051467895508, + "step": 8940 + }, + { + "epoch": 1.2704045422285308, + "grad_norm": 7.013411521911621, + "learning_rate": 9.873016323633783e-05, + "loss": 0.13840343952178955, + "step": 8950 + }, + { + "epoch": 1.2718239886444287, + "grad_norm": 4.136613845825195, + "learning_rate": 9.872874378992194e-05, + "loss": 0.27057197093963625, + "step": 8960 + }, + { + "epoch": 1.2732434350603263, + "grad_norm": 7.147876262664795, + "learning_rate": 9.872732434350604e-05, + "loss": 0.19125341176986693, + "step": 8970 + }, + { + "epoch": 1.2746628814762242, + "grad_norm": 1.9221298694610596, + "learning_rate": 9.872590489709014e-05, + "loss": 0.22451837062835694, + "step": 8980 + }, + { + "epoch": 1.276082327892122, + "grad_norm": 10.765070915222168, + "learning_rate": 9.872448545067425e-05, + "loss": 0.2057518482208252, + "step": 8990 + }, + { + "epoch": 1.27750177430802, + "grad_norm": 3.960794448852539, + "learning_rate": 9.872306600425834e-05, + "loss": 
0.21558022499084473, + "step": 9000 + }, + { + "epoch": 1.27750177430802, + "eval_accuracy": 0.907420359890634, + "eval_loss": 0.2675907015800476, + "eval_runtime": 32.1907, + "eval_samples_per_second": 488.557, + "eval_steps_per_second": 15.284, + "step": 9000 + }, + { + "epoch": 1.2789212207239178, + "grad_norm": 6.640925884246826, + "learning_rate": 9.872164655784245e-05, + "loss": 0.21932268142700195, + "step": 9010 + }, + { + "epoch": 1.2803406671398154, + "grad_norm": 3.883657455444336, + "learning_rate": 9.872022711142654e-05, + "loss": 0.20566184520721437, + "step": 9020 + }, + { + "epoch": 1.2817601135557133, + "grad_norm": 8.243616104125977, + "learning_rate": 9.871880766501065e-05, + "loss": 0.1661081552505493, + "step": 9030 + }, + { + "epoch": 1.2831795599716112, + "grad_norm": 9.827435493469238, + "learning_rate": 9.871738821859475e-05, + "loss": 0.17904939651489257, + "step": 9040 + }, + { + "epoch": 1.2845990063875088, + "grad_norm": 7.80245304107666, + "learning_rate": 9.871596877217886e-05, + "loss": 0.15805249214172362, + "step": 9050 + }, + { + "epoch": 1.2860184528034067, + "grad_norm": 4.689866542816162, + "learning_rate": 9.871454932576297e-05, + "loss": 0.23644819259643554, + "step": 9060 + }, + { + "epoch": 1.2874378992193045, + "grad_norm": 6.257835865020752, + "learning_rate": 9.871312987934705e-05, + "loss": 0.2536448955535889, + "step": 9070 + }, + { + "epoch": 1.2888573456352024, + "grad_norm": 1.8020100593566895, + "learning_rate": 9.871185237757275e-05, + "loss": 0.1373010277748108, + "step": 9080 + }, + { + "epoch": 1.2902767920511, + "grad_norm": 4.135176658630371, + "learning_rate": 9.871043293115685e-05, + "loss": 0.1967120051383972, + "step": 9090 + }, + { + "epoch": 1.2916962384669979, + "grad_norm": 5.261960506439209, + "learning_rate": 9.870901348474096e-05, + "loss": 0.21039602756500245, + "step": 9100 + }, + { + "epoch": 1.2931156848828957, + "grad_norm": 6.985999584197998, + "learning_rate": 9.870759403832506e-05, + 
"loss": 0.22036538124084473, + "step": 9110 + }, + { + "epoch": 1.2945351312987934, + "grad_norm": 3.4260783195495605, + "learning_rate": 9.870617459190917e-05, + "loss": 0.2039936065673828, + "step": 9120 + }, + { + "epoch": 1.2959545777146912, + "grad_norm": 3.7384250164031982, + "learning_rate": 9.870475514549326e-05, + "loss": 0.20263819694519042, + "step": 9130 + }, + { + "epoch": 1.297374024130589, + "grad_norm": 3.172229528427124, + "learning_rate": 9.870333569907736e-05, + "loss": 0.13130682706832886, + "step": 9140 + }, + { + "epoch": 1.298793470546487, + "grad_norm": 12.370247840881348, + "learning_rate": 9.870191625266146e-05, + "loss": 0.20618796348571777, + "step": 9150 + }, + { + "epoch": 1.3002129169623846, + "grad_norm": 7.193541049957275, + "learning_rate": 9.870049680624557e-05, + "loss": 0.2788748264312744, + "step": 9160 + }, + { + "epoch": 1.3016323633782825, + "grad_norm": 4.76792573928833, + "learning_rate": 9.869907735982967e-05, + "loss": 0.18996012210845947, + "step": 9170 + }, + { + "epoch": 1.3030518097941803, + "grad_norm": 3.7090489864349365, + "learning_rate": 9.869765791341378e-05, + "loss": 0.18860991001129152, + "step": 9180 + }, + { + "epoch": 1.304471256210078, + "grad_norm": 6.190913677215576, + "learning_rate": 9.869623846699788e-05, + "loss": 0.224440860748291, + "step": 9190 + }, + { + "epoch": 1.3058907026259758, + "grad_norm": 3.286689281463623, + "learning_rate": 9.869481902058197e-05, + "loss": 0.20683689117431642, + "step": 9200 + }, + { + "epoch": 1.3073101490418737, + "grad_norm": 4.6291937828063965, + "learning_rate": 9.869339957416608e-05, + "loss": 0.19128093719482422, + "step": 9210 + }, + { + "epoch": 1.3087295954577716, + "grad_norm": 8.739839553833008, + "learning_rate": 9.869198012775018e-05, + "loss": 0.21355061531066893, + "step": 9220 + }, + { + "epoch": 1.3101490418736692, + "grad_norm": 4.578412055969238, + "learning_rate": 9.869056068133429e-05, + "loss": 0.1978748083114624, + "step": 9230 + }, + { + 
"epoch": 1.311568488289567, + "grad_norm": 5.891171932220459, + "learning_rate": 9.868914123491839e-05, + "loss": 0.21060125827789306, + "step": 9240 + }, + { + "epoch": 1.312987934705465, + "grad_norm": 8.383025169372559, + "learning_rate": 9.868772178850249e-05, + "loss": 0.29614646434783937, + "step": 9250 + }, + { + "epoch": 1.3144073811213626, + "grad_norm": 7.3245930671691895, + "learning_rate": 9.868630234208658e-05, + "loss": 0.22820439338684081, + "step": 9260 + }, + { + "epoch": 1.3158268275372604, + "grad_norm": 3.143709182739258, + "learning_rate": 9.86848828956707e-05, + "loss": 0.1735852003097534, + "step": 9270 + }, + { + "epoch": 1.3172462739531583, + "grad_norm": 8.565205574035645, + "learning_rate": 9.868346344925479e-05, + "loss": 0.175143563747406, + "step": 9280 + }, + { + "epoch": 1.3186657203690562, + "grad_norm": 5.662914752960205, + "learning_rate": 9.86820440028389e-05, + "loss": 0.19213972091674805, + "step": 9290 + }, + { + "epoch": 1.3200851667849538, + "grad_norm": 7.872828960418701, + "learning_rate": 9.8680624556423e-05, + "loss": 0.14704231023788453, + "step": 9300 + }, + { + "epoch": 1.3215046132008517, + "grad_norm": 11.20383071899414, + "learning_rate": 9.86792051100071e-05, + "loss": 0.24307498931884766, + "step": 9310 + }, + { + "epoch": 1.3229240596167495, + "grad_norm": 2.9435956478118896, + "learning_rate": 9.867778566359121e-05, + "loss": 0.23251771926879883, + "step": 9320 + }, + { + "epoch": 1.3243435060326472, + "grad_norm": 3.8682780265808105, + "learning_rate": 9.867636621717531e-05, + "loss": 0.21560065746307372, + "step": 9330 + }, + { + "epoch": 1.325762952448545, + "grad_norm": 7.9737420082092285, + "learning_rate": 9.867494677075942e-05, + "loss": 0.1927724599838257, + "step": 9340 + }, + { + "epoch": 1.327182398864443, + "grad_norm": 6.955791473388672, + "learning_rate": 9.86735273243435e-05, + "loss": 0.22344651222229003, + "step": 9350 + }, + { + "epoch": 1.3286018452803408, + "grad_norm": 9.098529815673828, + 
"learning_rate": 9.867210787792761e-05, + "loss": 0.2260176420211792, + "step": 9360 + }, + { + "epoch": 1.3300212916962384, + "grad_norm": 5.625829219818115, + "learning_rate": 9.867068843151171e-05, + "loss": 0.1760912299156189, + "step": 9370 + }, + { + "epoch": 1.3314407381121363, + "grad_norm": 2.4090805053710938, + "learning_rate": 9.866926898509582e-05, + "loss": 0.16904083490371705, + "step": 9380 + }, + { + "epoch": 1.3328601845280341, + "grad_norm": 4.635160446166992, + "learning_rate": 9.866784953867992e-05, + "loss": 0.21562621593475342, + "step": 9390 + }, + { + "epoch": 1.3342796309439318, + "grad_norm": 8.606550216674805, + "learning_rate": 9.866643009226402e-05, + "loss": 0.21092190742492675, + "step": 9400 + }, + { + "epoch": 1.3356990773598296, + "grad_norm": 5.678009033203125, + "learning_rate": 9.866501064584813e-05, + "loss": 0.19930131435394288, + "step": 9410 + }, + { + "epoch": 1.3371185237757275, + "grad_norm": 6.880139350891113, + "learning_rate": 9.866359119943222e-05, + "loss": 0.3152653217315674, + "step": 9420 + }, + { + "epoch": 1.3385379701916253, + "grad_norm": 5.563040733337402, + "learning_rate": 9.866217175301633e-05, + "loss": 0.18800781965255736, + "step": 9430 + }, + { + "epoch": 1.339957416607523, + "grad_norm": 2.5089986324310303, + "learning_rate": 9.866075230660043e-05, + "loss": 0.11295425891876221, + "step": 9440 + }, + { + "epoch": 1.3413768630234209, + "grad_norm": 4.770693302154541, + "learning_rate": 9.865933286018453e-05, + "loss": 0.18411701917648315, + "step": 9450 + }, + { + "epoch": 1.3427963094393187, + "grad_norm": 4.498220920562744, + "learning_rate": 9.865791341376863e-05, + "loss": 0.2168651342391968, + "step": 9460 + }, + { + "epoch": 1.3442157558552164, + "grad_norm": 3.5189125537872314, + "learning_rate": 9.865649396735274e-05, + "loss": 0.23824927806854249, + "step": 9470 + }, + { + "epoch": 1.3456352022711142, + "grad_norm": 5.034974098205566, + "learning_rate": 9.865507452093684e-05, + "loss": 
0.14622821807861328, + "step": 9480 + }, + { + "epoch": 1.347054648687012, + "grad_norm": 2.3215811252593994, + "learning_rate": 9.865365507452095e-05, + "loss": 0.11778559684753417, + "step": 9490 + }, + { + "epoch": 1.34847409510291, + "grad_norm": 4.806303977966309, + "learning_rate": 9.865223562810504e-05, + "loss": 0.12332210540771485, + "step": 9500 + }, + { + "epoch": 1.34847409510291, + "eval_accuracy": 0.9099637565969352, + "eval_loss": 0.2493496835231781, + "eval_runtime": 31.6926, + "eval_samples_per_second": 496.236, + "eval_steps_per_second": 15.524, + "step": 9500 + }, + { + "epoch": 1.3498935415188076, + "grad_norm": 6.961501598358154, + "learning_rate": 9.865081618168914e-05, + "loss": 0.2591987371444702, + "step": 9510 + }, + { + "epoch": 1.3513129879347054, + "grad_norm": 4.2426323890686035, + "learning_rate": 9.864939673527325e-05, + "loss": 0.17831168174743653, + "step": 9520 + }, + { + "epoch": 1.3527324343506033, + "grad_norm": 6.4358625411987305, + "learning_rate": 9.864797728885735e-05, + "loss": 0.2314450740814209, + "step": 9530 + }, + { + "epoch": 1.354151880766501, + "grad_norm": 5.79241943359375, + "learning_rate": 9.864655784244146e-05, + "loss": 0.18896229267120362, + "step": 9540 + }, + { + "epoch": 1.3555713271823988, + "grad_norm": 7.353359699249268, + "learning_rate": 9.864513839602554e-05, + "loss": 0.19705621004104615, + "step": 9550 + }, + { + "epoch": 1.3569907735982967, + "grad_norm": 6.934425354003906, + "learning_rate": 9.864371894960966e-05, + "loss": 0.17384577989578248, + "step": 9560 + }, + { + "epoch": 1.3584102200141945, + "grad_norm": 5.2685394287109375, + "learning_rate": 9.864229950319375e-05, + "loss": 0.2469557285308838, + "step": 9570 + }, + { + "epoch": 1.3598296664300924, + "grad_norm": 6.054180145263672, + "learning_rate": 9.864088005677786e-05, + "loss": 0.2497105598449707, + "step": 9580 + }, + { + "epoch": 1.36124911284599, + "grad_norm": 3.806577444076538, + "learning_rate": 9.863946061036196e-05, + 
"loss": 0.16005023717880248, + "step": 9590 + }, + { + "epoch": 1.362668559261888, + "grad_norm": 9.077430725097656, + "learning_rate": 9.863804116394607e-05, + "loss": 0.24311597347259523, + "step": 9600 + }, + { + "epoch": 1.3640880056777855, + "grad_norm": 5.967398166656494, + "learning_rate": 9.863662171753017e-05, + "loss": 0.2098919153213501, + "step": 9610 + }, + { + "epoch": 1.3655074520936834, + "grad_norm": 5.3782172203063965, + "learning_rate": 9.863520227111427e-05, + "loss": 0.22856371402740477, + "step": 9620 + }, + { + "epoch": 1.3669268985095813, + "grad_norm": 7.211184501647949, + "learning_rate": 9.863378282469838e-05, + "loss": 0.19752051830291747, + "step": 9630 + }, + { + "epoch": 1.3683463449254791, + "grad_norm": 2.611245632171631, + "learning_rate": 9.863236337828247e-05, + "loss": 0.20763750076293946, + "step": 9640 + }, + { + "epoch": 1.369765791341377, + "grad_norm": 7.055820465087891, + "learning_rate": 9.863094393186659e-05, + "loss": 0.18712767362594604, + "step": 9650 + }, + { + "epoch": 1.3711852377572746, + "grad_norm": 7.2558112144470215, + "learning_rate": 9.862952448545067e-05, + "loss": 0.24251337051391603, + "step": 9660 + }, + { + "epoch": 1.3726046841731725, + "grad_norm": 6.948854446411133, + "learning_rate": 9.862810503903478e-05, + "loss": 0.1610349178314209, + "step": 9670 + }, + { + "epoch": 1.3740241305890701, + "grad_norm": 6.58130407333374, + "learning_rate": 9.862668559261888e-05, + "loss": 0.1934449315071106, + "step": 9680 + }, + { + "epoch": 1.375443577004968, + "grad_norm": 3.3496904373168945, + "learning_rate": 9.862526614620299e-05, + "loss": 0.17610930204391478, + "step": 9690 + }, + { + "epoch": 1.3768630234208659, + "grad_norm": 9.198835372924805, + "learning_rate": 9.862384669978709e-05, + "loss": 0.17025632858276368, + "step": 9700 + }, + { + "epoch": 1.3782824698367637, + "grad_norm": 1.7735481262207031, + "learning_rate": 9.862242725337118e-05, + "loss": 0.20825440883636476, + "step": 9710 + }, + { + 
"epoch": 1.3797019162526616, + "grad_norm": 6.809709548950195, + "learning_rate": 9.86210078069553e-05, + "loss": 0.18874866962432862, + "step": 9720 + }, + { + "epoch": 1.3811213626685592, + "grad_norm": 8.268877029418945, + "learning_rate": 9.861958836053939e-05, + "loss": 0.26922762393951416, + "step": 9730 + }, + { + "epoch": 1.382540809084457, + "grad_norm": 2.897256851196289, + "learning_rate": 9.86181689141235e-05, + "loss": 0.24385275840759277, + "step": 9740 + }, + { + "epoch": 1.3839602555003547, + "grad_norm": 3.334864616394043, + "learning_rate": 9.86167494677076e-05, + "loss": 0.16869350671768188, + "step": 9750 + }, + { + "epoch": 1.3853797019162526, + "grad_norm": 7.382256984710693, + "learning_rate": 9.86153300212917e-05, + "loss": 0.18727898597717285, + "step": 9760 + }, + { + "epoch": 1.3867991483321505, + "grad_norm": 3.0756566524505615, + "learning_rate": 9.86139105748758e-05, + "loss": 0.1948513627052307, + "step": 9770 + }, + { + "epoch": 1.3882185947480483, + "grad_norm": 7.820052146911621, + "learning_rate": 9.86124911284599e-05, + "loss": 0.1906062364578247, + "step": 9780 + }, + { + "epoch": 1.3896380411639462, + "grad_norm": 5.2213263511657715, + "learning_rate": 9.8611071682044e-05, + "loss": 0.19792075157165528, + "step": 9790 + }, + { + "epoch": 1.3910574875798438, + "grad_norm": 9.714534759521484, + "learning_rate": 9.860965223562811e-05, + "loss": 0.17712973356246947, + "step": 9800 + }, + { + "epoch": 1.3924769339957417, + "grad_norm": 4.078144073486328, + "learning_rate": 9.860823278921221e-05, + "loss": 0.18135050535202027, + "step": 9810 + }, + { + "epoch": 1.3938963804116393, + "grad_norm": 5.219580173492432, + "learning_rate": 9.860681334279631e-05, + "loss": 0.227278733253479, + "step": 9820 + }, + { + "epoch": 1.3953158268275372, + "grad_norm": 6.879891395568848, + "learning_rate": 9.860539389638042e-05, + "loss": 0.215889835357666, + "step": 9830 + }, + { + "epoch": 1.396735273243435, + "grad_norm": 9.455697059631348, + 
"learning_rate": 9.860397444996452e-05, + "loss": 0.16740819215774536, + "step": 9840 + }, + { + "epoch": 1.398154719659333, + "grad_norm": 4.630984306335449, + "learning_rate": 9.860255500354863e-05, + "loss": 0.22700212001800538, + "step": 9850 + }, + { + "epoch": 1.3995741660752308, + "grad_norm": 6.121819972991943, + "learning_rate": 9.860113555713271e-05, + "loss": 0.220161509513855, + "step": 9860 + }, + { + "epoch": 1.4009936124911284, + "grad_norm": 2.6966371536254883, + "learning_rate": 9.859971611071682e-05, + "loss": 0.18548699617385864, + "step": 9870 + }, + { + "epoch": 1.4024130589070263, + "grad_norm": 4.1472554206848145, + "learning_rate": 9.859829666430092e-05, + "loss": 0.18523939847946166, + "step": 9880 + }, + { + "epoch": 1.4038325053229241, + "grad_norm": 7.051137924194336, + "learning_rate": 9.859687721788503e-05, + "loss": 0.1325202226638794, + "step": 9890 + }, + { + "epoch": 1.4052519517388218, + "grad_norm": 5.540129661560059, + "learning_rate": 9.859545777146913e-05, + "loss": 0.16468173265457153, + "step": 9900 + }, + { + "epoch": 1.4066713981547196, + "grad_norm": 6.817564487457275, + "learning_rate": 9.859403832505323e-05, + "loss": 0.12863141298294067, + "step": 9910 + }, + { + "epoch": 1.4080908445706175, + "grad_norm": 2.415663719177246, + "learning_rate": 9.859261887863734e-05, + "loss": 0.1454537630081177, + "step": 9920 + }, + { + "epoch": 1.4095102909865154, + "grad_norm": 5.63126277923584, + "learning_rate": 9.859119943222143e-05, + "loss": 0.20712642669677733, + "step": 9930 + }, + { + "epoch": 1.410929737402413, + "grad_norm": 3.990525484085083, + "learning_rate": 9.858977998580555e-05, + "loss": 0.14999470710754395, + "step": 9940 + }, + { + "epoch": 1.4123491838183109, + "grad_norm": 4.665277004241943, + "learning_rate": 9.858836053938964e-05, + "loss": 0.1735332727432251, + "step": 9950 + }, + { + "epoch": 1.4137686302342087, + "grad_norm": 6.532275676727295, + "learning_rate": 9.858694109297375e-05, + "loss": 
0.18187229633331298, + "step": 9960 + }, + { + "epoch": 1.4151880766501064, + "grad_norm": 10.086085319519043, + "learning_rate": 9.858552164655784e-05, + "loss": 0.25496907234191896, + "step": 9970 + }, + { + "epoch": 1.4166075230660042, + "grad_norm": 8.85912036895752, + "learning_rate": 9.858410220014195e-05, + "loss": 0.21260628700256348, + "step": 9980 + }, + { + "epoch": 1.418026969481902, + "grad_norm": 3.1774983406066895, + "learning_rate": 9.858268275372605e-05, + "loss": 0.16666808128356933, + "step": 9990 + }, + { + "epoch": 1.4194464158978, + "grad_norm": 8.12264633178711, + "learning_rate": 9.858126330731016e-05, + "loss": 0.13021547794342042, + "step": 10000 + }, + { + "epoch": 1.4194464158978, + "eval_accuracy": 0.9303745151650029, + "eval_loss": 0.2065460979938507, + "eval_runtime": 32.8099, + "eval_samples_per_second": 479.338, + "eval_steps_per_second": 14.995, + "step": 10000 + }, + { + "epoch": 1.4208658623136976, + "grad_norm": 3.760587453842163, + "learning_rate": 9.857984386089427e-05, + "loss": 0.21676597595214844, + "step": 10010 + }, + { + "epoch": 1.4222853087295955, + "grad_norm": 6.741761207580566, + "learning_rate": 9.857842441447835e-05, + "loss": 0.22888615131378173, + "step": 10020 + }, + { + "epoch": 1.4237047551454933, + "grad_norm": 4.405668258666992, + "learning_rate": 9.857700496806246e-05, + "loss": 0.13688948154449462, + "step": 10030 + }, + { + "epoch": 1.425124201561391, + "grad_norm": 5.534117698669434, + "learning_rate": 9.857558552164656e-05, + "loss": 0.14423273801803588, + "step": 10040 + }, + { + "epoch": 1.4265436479772888, + "grad_norm": 5.10047721862793, + "learning_rate": 9.857416607523067e-05, + "loss": 0.2310737133026123, + "step": 10050 + }, + { + "epoch": 1.4279630943931867, + "grad_norm": 3.052246570587158, + "learning_rate": 9.857274662881477e-05, + "loss": 0.20977180004119872, + "step": 10060 + }, + { + "epoch": 1.4293825408090846, + "grad_norm": 9.701653480529785, + "learning_rate": 9.857132718239887e-05, 
+ "loss": 0.22714948654174805, + "step": 10070 + }, + { + "epoch": 1.4308019872249822, + "grad_norm": 2.72581148147583, + "learning_rate": 9.856990773598296e-05, + "loss": 0.2333024263381958, + "step": 10080 + }, + { + "epoch": 1.43222143364088, + "grad_norm": 8.234984397888184, + "learning_rate": 9.856848828956707e-05, + "loss": 0.21033647060394287, + "step": 10090 + }, + { + "epoch": 1.433640880056778, + "grad_norm": 4.618515491485596, + "learning_rate": 9.856706884315118e-05, + "loss": 0.2534619331359863, + "step": 10100 + }, + { + "epoch": 1.4350603264726756, + "grad_norm": 3.2053143978118896, + "learning_rate": 9.856564939673528e-05, + "loss": 0.18584598302841188, + "step": 10110 + }, + { + "epoch": 1.4364797728885734, + "grad_norm": 5.643956661224365, + "learning_rate": 9.856422995031938e-05, + "loss": 0.16008204221725464, + "step": 10120 + }, + { + "epoch": 1.4378992193044713, + "grad_norm": 7.6051201820373535, + "learning_rate": 9.856281050390348e-05, + "loss": 0.19140913486480712, + "step": 10130 + }, + { + "epoch": 1.4393186657203692, + "grad_norm": 8.58385181427002, + "learning_rate": 9.856139105748759e-05, + "loss": 0.22861852645874023, + "step": 10140 + }, + { + "epoch": 1.4407381121362668, + "grad_norm": 3.0554444789886475, + "learning_rate": 9.855997161107168e-05, + "loss": 0.14198927879333495, + "step": 10150 + }, + { + "epoch": 1.4421575585521647, + "grad_norm": 3.255782127380371, + "learning_rate": 9.85585521646558e-05, + "loss": 0.17290072441101073, + "step": 10160 + }, + { + "epoch": 1.4435770049680625, + "grad_norm": 4.403168678283691, + "learning_rate": 9.855713271823988e-05, + "loss": 0.19940041303634642, + "step": 10170 + }, + { + "epoch": 1.4449964513839602, + "grad_norm": 8.145320892333984, + "learning_rate": 9.855571327182399e-05, + "loss": 0.21902050971984863, + "step": 10180 + }, + { + "epoch": 1.446415897799858, + "grad_norm": 5.803956508636475, + "learning_rate": 9.85542938254081e-05, + "loss": 0.21828086376190187, + "step": 10190 + 
}, + { + "epoch": 1.4478353442157559, + "grad_norm": 8.805460929870605, + "learning_rate": 9.85528743789922e-05, + "loss": 0.23348815441131593, + "step": 10200 + }, + { + "epoch": 1.4492547906316537, + "grad_norm": 7.180856704711914, + "learning_rate": 9.855145493257631e-05, + "loss": 0.18313560485839844, + "step": 10210 + }, + { + "epoch": 1.4506742370475514, + "grad_norm": 7.773831844329834, + "learning_rate": 9.85500354861604e-05, + "loss": 0.18291949033737182, + "step": 10220 + }, + { + "epoch": 1.4520936834634492, + "grad_norm": 1.713024616241455, + "learning_rate": 9.85486160397445e-05, + "loss": 0.11751105785369872, + "step": 10230 + }, + { + "epoch": 1.453513129879347, + "grad_norm": 2.2637596130371094, + "learning_rate": 9.85471965933286e-05, + "loss": 0.14805399179458617, + "step": 10240 + }, + { + "epoch": 1.4549325762952448, + "grad_norm": 8.369937896728516, + "learning_rate": 9.854577714691271e-05, + "loss": 0.2501375198364258, + "step": 10250 + }, + { + "epoch": 1.4563520227111426, + "grad_norm": 9.403657913208008, + "learning_rate": 9.854435770049681e-05, + "loss": 0.1835735559463501, + "step": 10260 + }, + { + "epoch": 1.4577714691270405, + "grad_norm": 7.980884075164795, + "learning_rate": 9.854293825408091e-05, + "loss": 0.2255629301071167, + "step": 10270 + }, + { + "epoch": 1.4591909155429383, + "grad_norm": 13.038922309875488, + "learning_rate": 9.854151880766502e-05, + "loss": 0.1810195565223694, + "step": 10280 + }, + { + "epoch": 1.460610361958836, + "grad_norm": 6.806441783905029, + "learning_rate": 9.854009936124912e-05, + "loss": 0.20559656620025635, + "step": 10290 + }, + { + "epoch": 1.4620298083747338, + "grad_norm": 1.5737494230270386, + "learning_rate": 9.853867991483323e-05, + "loss": 0.17797669172286987, + "step": 10300 + }, + { + "epoch": 1.4634492547906317, + "grad_norm": 10.547101020812988, + "learning_rate": 9.853726046841732e-05, + "loss": 0.14445135593414307, + "step": 10310 + }, + { + "epoch": 1.4648687012065293, + 
"grad_norm": 7.028156757354736, + "learning_rate": 9.853584102200144e-05, + "loss": 0.19645894765853883, + "step": 10320 + }, + { + "epoch": 1.4662881476224272, + "grad_norm": 8.557269096374512, + "learning_rate": 9.853442157558552e-05, + "loss": 0.14470189809799194, + "step": 10330 + }, + { + "epoch": 1.467707594038325, + "grad_norm": 3.8612992763519287, + "learning_rate": 9.853300212916963e-05, + "loss": 0.18914811611175536, + "step": 10340 + }, + { + "epoch": 1.469127040454223, + "grad_norm": 1.5628553628921509, + "learning_rate": 9.853158268275373e-05, + "loss": 0.15799893140792848, + "step": 10350 + }, + { + "epoch": 1.4705464868701206, + "grad_norm": 1.3893674612045288, + "learning_rate": 9.853016323633784e-05, + "loss": 0.20945143699645996, + "step": 10360 + }, + { + "epoch": 1.4719659332860184, + "grad_norm": 5.654598712921143, + "learning_rate": 9.852874378992194e-05, + "loss": 0.18789818286895751, + "step": 10370 + }, + { + "epoch": 1.4733853797019163, + "grad_norm": 2.126235008239746, + "learning_rate": 9.852732434350603e-05, + "loss": 0.18574261665344238, + "step": 10380 + }, + { + "epoch": 1.474804826117814, + "grad_norm": 6.465456008911133, + "learning_rate": 9.852590489709014e-05, + "loss": 0.2622290849685669, + "step": 10390 + }, + { + "epoch": 1.4762242725337118, + "grad_norm": 0.5080237984657288, + "learning_rate": 9.852448545067424e-05, + "loss": 0.1537003517150879, + "step": 10400 + }, + { + "epoch": 1.4776437189496097, + "grad_norm": 1.72958505153656, + "learning_rate": 9.852306600425835e-05, + "loss": 0.15624310970306396, + "step": 10410 + }, + { + "epoch": 1.4790631653655075, + "grad_norm": 4.848511695861816, + "learning_rate": 9.852164655784245e-05, + "loss": 0.12883809804916382, + "step": 10420 + }, + { + "epoch": 1.4804826117814054, + "grad_norm": 5.730294227600098, + "learning_rate": 9.852022711142655e-05, + "loss": 0.14428837299346925, + "step": 10430 + }, + { + "epoch": 1.481902058197303, + "grad_norm": 4.0559539794921875, + 
"learning_rate": 9.851880766501064e-05, + "loss": 0.1629919409751892, + "step": 10440 + }, + { + "epoch": 1.483321504613201, + "grad_norm": 4.338459014892578, + "learning_rate": 9.851738821859476e-05, + "loss": 0.17030248641967774, + "step": 10450 + }, + { + "epoch": 1.4847409510290985, + "grad_norm": 10.856430053710938, + "learning_rate": 9.851596877217885e-05, + "loss": 0.23294711112976074, + "step": 10460 + }, + { + "epoch": 1.4861603974449964, + "grad_norm": 5.3764729499816895, + "learning_rate": 9.851454932576296e-05, + "loss": 0.1908231258392334, + "step": 10470 + }, + { + "epoch": 1.4875798438608943, + "grad_norm": 7.5525736808776855, + "learning_rate": 9.851312987934706e-05, + "loss": 0.1458095669746399, + "step": 10480 + }, + { + "epoch": 1.4889992902767921, + "grad_norm": 4.017747402191162, + "learning_rate": 9.851171043293116e-05, + "loss": 0.09822410345077515, + "step": 10490 + }, + { + "epoch": 1.49041873669269, + "grad_norm": 3.671755075454712, + "learning_rate": 9.851029098651527e-05, + "loss": 0.2174128770828247, + "step": 10500 + }, + { + "epoch": 1.49041873669269, + "eval_accuracy": 0.9363514974248108, + "eval_loss": 0.18055449426174164, + "eval_runtime": 32.7495, + "eval_samples_per_second": 480.221, + "eval_steps_per_second": 15.023, + "step": 10500 + }, + { + "epoch": 1.4918381831085876, + "grad_norm": 5.814731597900391, + "learning_rate": 9.850887154009937e-05, + "loss": 0.20221278667449952, + "step": 10510 + }, + { + "epoch": 1.4932576295244855, + "grad_norm": 4.894477367401123, + "learning_rate": 9.850745209368348e-05, + "loss": 0.1364034056663513, + "step": 10520 + }, + { + "epoch": 1.4946770759403831, + "grad_norm": 9.05544662475586, + "learning_rate": 9.850603264726756e-05, + "loss": 0.2525052785873413, + "step": 10530 + }, + { + "epoch": 1.496096522356281, + "grad_norm": 4.482929706573486, + "learning_rate": 9.850461320085167e-05, + "loss": 0.16218397617340088, + "step": 10540 + }, + { + "epoch": 1.4975159687721789, + "grad_norm": 
6.634395599365234, + "learning_rate": 9.850319375443577e-05, + "loss": 0.14512306451797485, + "step": 10550 + }, + { + "epoch": 1.4989354151880767, + "grad_norm": 8.131645202636719, + "learning_rate": 9.850177430801988e-05, + "loss": 0.1850733518600464, + "step": 10560 + }, + { + "epoch": 1.5003548616039746, + "grad_norm": 7.16902494430542, + "learning_rate": 9.850035486160398e-05, + "loss": 0.232697057723999, + "step": 10570 + }, + { + "epoch": 1.5017743080198722, + "grad_norm": 9.409531593322754, + "learning_rate": 9.849893541518808e-05, + "loss": 0.13974694013595582, + "step": 10580 + }, + { + "epoch": 1.50319375443577, + "grad_norm": 6.473144054412842, + "learning_rate": 9.849751596877219e-05, + "loss": 0.1807733178138733, + "step": 10590 + }, + { + "epoch": 1.5046132008516677, + "grad_norm": 2.1681149005889893, + "learning_rate": 9.849609652235628e-05, + "loss": 0.12265112400054931, + "step": 10600 + }, + { + "epoch": 1.5060326472675656, + "grad_norm": 5.138197898864746, + "learning_rate": 9.84946770759404e-05, + "loss": 0.14840331077575683, + "step": 10610 + }, + { + "epoch": 1.5074520936834634, + "grad_norm": 7.284664630889893, + "learning_rate": 9.849325762952449e-05, + "loss": 0.14850282669067383, + "step": 10620 + }, + { + "epoch": 1.5088715400993613, + "grad_norm": 3.7971346378326416, + "learning_rate": 9.84918381831086e-05, + "loss": 0.1547774314880371, + "step": 10630 + }, + { + "epoch": 1.5102909865152592, + "grad_norm": 6.039275169372559, + "learning_rate": 9.849041873669269e-05, + "loss": 0.197337806224823, + "step": 10640 + }, + { + "epoch": 1.5117104329311568, + "grad_norm": 3.9703164100646973, + "learning_rate": 9.84889992902768e-05, + "loss": 0.2073758363723755, + "step": 10650 + }, + { + "epoch": 1.5131298793470547, + "grad_norm": 9.968624114990234, + "learning_rate": 9.84875798438609e-05, + "loss": 0.1673255443572998, + "step": 10660 + }, + { + "epoch": 1.5145493257629523, + "grad_norm": 5.294106483459473, + "learning_rate": 
9.8486160397445e-05, + "loss": 0.1461545467376709, + "step": 10670 + }, + { + "epoch": 1.5159687721788502, + "grad_norm": 10.589927673339844, + "learning_rate": 9.84847409510291e-05, + "loss": 0.1678829312324524, + "step": 10680 + }, + { + "epoch": 1.517388218594748, + "grad_norm": 8.75311279296875, + "learning_rate": 9.84833215046132e-05, + "loss": 0.1493905782699585, + "step": 10690 + }, + { + "epoch": 1.518807665010646, + "grad_norm": 5.052854061126709, + "learning_rate": 9.848190205819731e-05, + "loss": 0.16829880475997924, + "step": 10700 + }, + { + "epoch": 1.5202271114265438, + "grad_norm": 10.165739059448242, + "learning_rate": 9.848048261178141e-05, + "loss": 0.1630192756652832, + "step": 10710 + }, + { + "epoch": 1.5216465578424414, + "grad_norm": 4.576249599456787, + "learning_rate": 9.847906316536552e-05, + "loss": 0.18904685974121094, + "step": 10720 + }, + { + "epoch": 1.5230660042583393, + "grad_norm": 6.297980308532715, + "learning_rate": 9.847764371894962e-05, + "loss": 0.20620598793029785, + "step": 10730 + }, + { + "epoch": 1.524485450674237, + "grad_norm": 6.77498197555542, + "learning_rate": 9.847622427253371e-05, + "loss": 0.16875416040420532, + "step": 10740 + }, + { + "epoch": 1.5259048970901348, + "grad_norm": 3.679386854171753, + "learning_rate": 9.847480482611781e-05, + "loss": 0.17838630676269532, + "step": 10750 + }, + { + "epoch": 1.5273243435060326, + "grad_norm": 9.312896728515625, + "learning_rate": 9.847338537970192e-05, + "loss": 0.21157798767089844, + "step": 10760 + }, + { + "epoch": 1.5287437899219305, + "grad_norm": 7.985523223876953, + "learning_rate": 9.847196593328602e-05, + "loss": 0.18047035932540895, + "step": 10770 + }, + { + "epoch": 1.5301632363378284, + "grad_norm": 6.29368257522583, + "learning_rate": 9.847054648687013e-05, + "loss": 0.1568093180656433, + "step": 10780 + }, + { + "epoch": 1.531582682753726, + "grad_norm": 5.2899322509765625, + "learning_rate": 9.846912704045423e-05, + "loss": 0.14504846334457397, + 
"step": 10790 + }, + { + "epoch": 1.5330021291696239, + "grad_norm": 1.8608068227767944, + "learning_rate": 9.846770759403833e-05, + "loss": 0.10261296033859253, + "step": 10800 + }, + { + "epoch": 1.5344215755855215, + "grad_norm": 7.755560398101807, + "learning_rate": 9.846628814762244e-05, + "loss": 0.20737462043762206, + "step": 10810 + }, + { + "epoch": 1.5358410220014194, + "grad_norm": 5.849984645843506, + "learning_rate": 9.846486870120653e-05, + "loss": 0.13056904077529907, + "step": 10820 + }, + { + "epoch": 1.5372604684173172, + "grad_norm": 12.66482162475586, + "learning_rate": 9.846344925479065e-05, + "loss": 0.18910495042800904, + "step": 10830 + }, + { + "epoch": 1.538679914833215, + "grad_norm": 5.568217754364014, + "learning_rate": 9.846202980837473e-05, + "loss": 0.21616907119750978, + "step": 10840 + }, + { + "epoch": 1.540099361249113, + "grad_norm": 7.100687503814697, + "learning_rate": 9.846061036195884e-05, + "loss": 0.2003716230392456, + "step": 10850 + }, + { + "epoch": 1.5415188076650106, + "grad_norm": 5.5214009284973145, + "learning_rate": 9.845919091554294e-05, + "loss": 0.17750124931335448, + "step": 10860 + }, + { + "epoch": 1.5429382540809085, + "grad_norm": 7.188937664031982, + "learning_rate": 9.845777146912705e-05, + "loss": 0.18738465309143065, + "step": 10870 + }, + { + "epoch": 1.544357700496806, + "grad_norm": 6.263291358947754, + "learning_rate": 9.845635202271115e-05, + "loss": 0.14714010953903198, + "step": 10880 + }, + { + "epoch": 1.545777146912704, + "grad_norm": 1.6037124395370483, + "learning_rate": 9.845493257629524e-05, + "loss": 0.16528385877609253, + "step": 10890 + }, + { + "epoch": 1.5471965933286018, + "grad_norm": 6.341423034667969, + "learning_rate": 9.845351312987935e-05, + "loss": 0.16852269172668458, + "step": 10900 + }, + { + "epoch": 1.5486160397444997, + "grad_norm": 1.0601999759674072, + "learning_rate": 9.845209368346345e-05, + "loss": 0.165651535987854, + "step": 10910 + }, + { + "epoch": 
1.5500354861603975, + "grad_norm": 6.944467544555664, + "learning_rate": 9.845067423704756e-05, + "loss": 0.21995656490325927, + "step": 10920 + }, + { + "epoch": 1.5514549325762954, + "grad_norm": 6.1232380867004395, + "learning_rate": 9.844925479063166e-05, + "loss": 0.23545873165130615, + "step": 10930 + }, + { + "epoch": 1.552874378992193, + "grad_norm": 5.78615665435791, + "learning_rate": 9.844783534421576e-05, + "loss": 0.20628550052642822, + "step": 10940 + }, + { + "epoch": 1.5542938254080907, + "grad_norm": 2.3399593830108643, + "learning_rate": 9.844641589779985e-05, + "loss": 0.1314982771873474, + "step": 10950 + }, + { + "epoch": 1.5557132718239886, + "grad_norm": 8.838848114013672, + "learning_rate": 9.844499645138397e-05, + "loss": 0.17209669351577758, + "step": 10960 + }, + { + "epoch": 1.5571327182398864, + "grad_norm": 6.756653308868408, + "learning_rate": 9.844357700496806e-05, + "loss": 0.2233790397644043, + "step": 10970 + }, + { + "epoch": 1.5585521646557843, + "grad_norm": 3.664095163345337, + "learning_rate": 9.844215755855217e-05, + "loss": 0.14182189702987671, + "step": 10980 + }, + { + "epoch": 1.5599716110716821, + "grad_norm": 6.118113040924072, + "learning_rate": 9.844073811213627e-05, + "loss": 0.1605884075164795, + "step": 10990 + }, + { + "epoch": 1.56139105748758, + "grad_norm": 3.3329458236694336, + "learning_rate": 9.843931866572037e-05, + "loss": 0.15648469924926758, + "step": 11000 + }, + { + "epoch": 1.56139105748758, + "eval_accuracy": 0.9343803649774274, + "eval_loss": 0.18083110451698303, + "eval_runtime": 31.9521, + "eval_samples_per_second": 492.205, + "eval_steps_per_second": 15.398, + "step": 11000 + }, + { + "epoch": 1.5628105039034776, + "grad_norm": 2.8265178203582764, + "learning_rate": 9.843789921930448e-05, + "loss": 0.1055110216140747, + "step": 11010 + }, + { + "epoch": 1.5642299503193753, + "grad_norm": 7.40562105178833, + "learning_rate": 9.843647977288858e-05, + "loss": 0.1931678533554077, + "step": 11020 + 
}, + { + "epoch": 1.5656493967352731, + "grad_norm": 5.846470355987549, + "learning_rate": 9.843506032647269e-05, + "loss": 0.16744234561920165, + "step": 11030 + }, + { + "epoch": 1.567068843151171, + "grad_norm": 10.13637924194336, + "learning_rate": 9.843364088005678e-05, + "loss": 0.16841363906860352, + "step": 11040 + }, + { + "epoch": 1.5684882895670689, + "grad_norm": 8.881434440612793, + "learning_rate": 9.843222143364088e-05, + "loss": 0.11868530511856079, + "step": 11050 + }, + { + "epoch": 1.5699077359829667, + "grad_norm": 3.2120912075042725, + "learning_rate": 9.843080198722498e-05, + "loss": 0.25566916465759276, + "step": 11060 + }, + { + "epoch": 1.5713271823988646, + "grad_norm": 8.856307983398438, + "learning_rate": 9.842938254080909e-05, + "loss": 0.16841399669647217, + "step": 11070 + }, + { + "epoch": 1.5727466288147622, + "grad_norm": 5.458991050720215, + "learning_rate": 9.842796309439319e-05, + "loss": 0.1553714632987976, + "step": 11080 + }, + { + "epoch": 1.5741660752306599, + "grad_norm": 7.29731559753418, + "learning_rate": 9.84265436479773e-05, + "loss": 0.12889499664306642, + "step": 11090 + }, + { + "epoch": 1.5755855216465577, + "grad_norm": 4.352165699005127, + "learning_rate": 9.84251242015614e-05, + "loss": 0.17049648761749267, + "step": 11100 + }, + { + "epoch": 1.5770049680624556, + "grad_norm": 3.659630060195923, + "learning_rate": 9.84237047551455e-05, + "loss": 0.11960989236831665, + "step": 11110 + }, + { + "epoch": 1.5784244144783535, + "grad_norm": 9.198236465454102, + "learning_rate": 9.84222853087296e-05, + "loss": 0.13858609199523925, + "step": 11120 + }, + { + "epoch": 1.5798438608942513, + "grad_norm": 4.7100510597229, + "learning_rate": 9.84208658623137e-05, + "loss": 0.15008503198623657, + "step": 11130 + }, + { + "epoch": 1.5812633073101492, + "grad_norm": 7.331428050994873, + "learning_rate": 9.841944641589781e-05, + "loss": 0.1811345934867859, + "step": 11140 + }, + { + "epoch": 1.5826827537260468, + "grad_norm": 
7.792325019836426, + "learning_rate": 9.84180269694819e-05, + "loss": 0.22963361740112304, + "step": 11150 + }, + { + "epoch": 1.5841022001419447, + "grad_norm": 1.6901665925979614, + "learning_rate": 9.841660752306601e-05, + "loss": 0.12061529159545899, + "step": 11160 + }, + { + "epoch": 1.5855216465578423, + "grad_norm": 6.294560432434082, + "learning_rate": 9.84151880766501e-05, + "loss": 0.1813538670539856, + "step": 11170 + }, + { + "epoch": 1.5869410929737402, + "grad_norm": 5.661618232727051, + "learning_rate": 9.841376863023422e-05, + "loss": 0.13598719835281373, + "step": 11180 + }, + { + "epoch": 1.588360539389638, + "grad_norm": 4.586926460266113, + "learning_rate": 9.841234918381831e-05, + "loss": 0.151306414604187, + "step": 11190 + }, + { + "epoch": 1.589779985805536, + "grad_norm": 3.2611052989959717, + "learning_rate": 9.841092973740241e-05, + "loss": 0.202089524269104, + "step": 11200 + }, + { + "epoch": 1.5911994322214338, + "grad_norm": 5.5583109855651855, + "learning_rate": 9.840951029098652e-05, + "loss": 0.13323140144348145, + "step": 11210 + }, + { + "epoch": 1.5926188786373314, + "grad_norm": 2.7712435722351074, + "learning_rate": 9.840809084457062e-05, + "loss": 0.2039250135421753, + "step": 11220 + }, + { + "epoch": 1.5940383250532293, + "grad_norm": 5.573919773101807, + "learning_rate": 9.840667139815473e-05, + "loss": 0.22665846347808838, + "step": 11230 + }, + { + "epoch": 1.595457771469127, + "grad_norm": 4.785495758056641, + "learning_rate": 9.840525195173883e-05, + "loss": 0.13016164302825928, + "step": 11240 + }, + { + "epoch": 1.5968772178850248, + "grad_norm": 5.181567668914795, + "learning_rate": 9.840383250532292e-05, + "loss": 0.1920285105705261, + "step": 11250 + }, + { + "epoch": 1.5982966643009227, + "grad_norm": 6.854187488555908, + "learning_rate": 9.840255500354862e-05, + "loss": 0.17289340496063232, + "step": 11260 + }, + { + "epoch": 1.5997161107168205, + "grad_norm": 5.818141937255859, + "learning_rate": 
9.840113555713272e-05, + "loss": 0.1366284132003784, + "step": 11270 + }, + { + "epoch": 1.6011355571327184, + "grad_norm": 5.610560417175293, + "learning_rate": 9.839971611071682e-05, + "loss": 0.15053837299346923, + "step": 11280 + }, + { + "epoch": 1.602555003548616, + "grad_norm": 3.7539663314819336, + "learning_rate": 9.839829666430093e-05, + "loss": 0.14345501661300658, + "step": 11290 + }, + { + "epoch": 1.6039744499645139, + "grad_norm": 7.876579284667969, + "learning_rate": 9.839687721788503e-05, + "loss": 0.13623604774475098, + "step": 11300 + }, + { + "epoch": 1.6053938963804115, + "grad_norm": 7.193563461303711, + "learning_rate": 9.839545777146914e-05, + "loss": 0.21021018028259278, + "step": 11310 + }, + { + "epoch": 1.6068133427963094, + "grad_norm": 3.236804485321045, + "learning_rate": 9.839403832505323e-05, + "loss": 0.1547287106513977, + "step": 11320 + }, + { + "epoch": 1.6082327892122072, + "grad_norm": 5.831701278686523, + "learning_rate": 9.839261887863733e-05, + "loss": 0.2037062644958496, + "step": 11330 + }, + { + "epoch": 1.609652235628105, + "grad_norm": 11.167473793029785, + "learning_rate": 9.839119943222144e-05, + "loss": 0.23104898929595946, + "step": 11340 + }, + { + "epoch": 1.611071682044003, + "grad_norm": 8.400900840759277, + "learning_rate": 9.838977998580554e-05, + "loss": 0.18747899532318116, + "step": 11350 + }, + { + "epoch": 1.6124911284599006, + "grad_norm": 5.5414042472839355, + "learning_rate": 9.838836053938965e-05, + "loss": 0.20507404804229737, + "step": 11360 + }, + { + "epoch": 1.6139105748757985, + "grad_norm": 5.533061504364014, + "learning_rate": 9.838694109297375e-05, + "loss": 0.17890411615371704, + "step": 11370 + }, + { + "epoch": 1.6153300212916961, + "grad_norm": 2.9510483741760254, + "learning_rate": 9.838552164655785e-05, + "loss": 0.16628677845001222, + "step": 11380 + }, + { + "epoch": 1.616749467707594, + "grad_norm": 5.596954822540283, + "learning_rate": 9.838410220014194e-05, + "loss": 
0.14340368509292603, + "step": 11390 + }, + { + "epoch": 1.6181689141234918, + "grad_norm": 1.025497555732727, + "learning_rate": 9.838268275372605e-05, + "loss": 0.1132912278175354, + "step": 11400 + }, + { + "epoch": 1.6195883605393897, + "grad_norm": 8.293600082397461, + "learning_rate": 9.838126330731015e-05, + "loss": 0.15983034372329713, + "step": 11410 + }, + { + "epoch": 1.6210078069552876, + "grad_norm": 6.942419052124023, + "learning_rate": 9.837984386089426e-05, + "loss": 0.18471511602401733, + "step": 11420 + }, + { + "epoch": 1.6224272533711852, + "grad_norm": 7.051154613494873, + "learning_rate": 9.837842441447836e-05, + "loss": 0.17162368297576905, + "step": 11430 + }, + { + "epoch": 1.623846699787083, + "grad_norm": 4.608026504516602, + "learning_rate": 9.837700496806246e-05, + "loss": 0.17447967529296876, + "step": 11440 + }, + { + "epoch": 1.6252661462029807, + "grad_norm": 2.5280375480651855, + "learning_rate": 9.837558552164657e-05, + "loss": 0.13198750019073485, + "step": 11450 + }, + { + "epoch": 1.6266855926188786, + "grad_norm": 5.921835422515869, + "learning_rate": 9.837416607523067e-05, + "loss": 0.19506406784057617, + "step": 11460 + }, + { + "epoch": 1.6281050390347764, + "grad_norm": 1.4568758010864258, + "learning_rate": 9.837274662881478e-05, + "loss": 0.12564977407455444, + "step": 11470 + }, + { + "epoch": 1.6295244854506743, + "grad_norm": 4.619745254516602, + "learning_rate": 9.837132718239886e-05, + "loss": 0.1366949200630188, + "step": 11480 + }, + { + "epoch": 1.6309439318665722, + "grad_norm": 13.973068237304688, + "learning_rate": 9.836990773598297e-05, + "loss": 0.2520665168762207, + "step": 11490 + }, + { + "epoch": 1.6323633782824698, + "grad_norm": 5.616090297698975, + "learning_rate": 9.836848828956707e-05, + "loss": 0.24036917686462403, + "step": 11500 + }, + { + "epoch": 1.6323633782824698, + "eval_accuracy": 0.938894894131112, + "eval_loss": 0.17282415926456451, + "eval_runtime": 32.6586, + "eval_samples_per_second": 
481.558, + "eval_steps_per_second": 15.065, + "step": 11500 + }, + { + "epoch": 1.6337828246983677, + "grad_norm": 2.5921289920806885, + "learning_rate": 9.836706884315118e-05, + "loss": 0.1288065195083618, + "step": 11510 + }, + { + "epoch": 1.6352022711142653, + "grad_norm": 3.20184326171875, + "learning_rate": 9.836564939673528e-05, + "loss": 0.14583102464675904, + "step": 11520 + }, + { + "epoch": 1.6366217175301632, + "grad_norm": 5.127830505371094, + "learning_rate": 9.836422995031937e-05, + "loss": 0.18197163343429565, + "step": 11530 + }, + { + "epoch": 1.638041163946061, + "grad_norm": 7.125634670257568, + "learning_rate": 9.836281050390349e-05, + "loss": 0.1912643551826477, + "step": 11540 + }, + { + "epoch": 1.639460610361959, + "grad_norm": 2.9785008430480957, + "learning_rate": 9.836139105748758e-05, + "loss": 0.13757799863815307, + "step": 11550 + }, + { + "epoch": 1.6408800567778568, + "grad_norm": 1.8115347623825073, + "learning_rate": 9.83599716110717e-05, + "loss": 0.1510754942893982, + "step": 11560 + }, + { + "epoch": 1.6422995031937544, + "grad_norm": 3.6485488414764404, + "learning_rate": 9.835855216465579e-05, + "loss": 0.17528530359268188, + "step": 11570 + }, + { + "epoch": 1.6437189496096523, + "grad_norm": 5.931766510009766, + "learning_rate": 9.835713271823989e-05, + "loss": 0.20811958312988282, + "step": 11580 + }, + { + "epoch": 1.64513839602555, + "grad_norm": 7.735183238983154, + "learning_rate": 9.835571327182399e-05, + "loss": 0.1395600199699402, + "step": 11590 + }, + { + "epoch": 1.6465578424414478, + "grad_norm": 5.529693603515625, + "learning_rate": 9.83542938254081e-05, + "loss": 0.14511030912399292, + "step": 11600 + }, + { + "epoch": 1.6479772888573456, + "grad_norm": 4.704524993896484, + "learning_rate": 9.83528743789922e-05, + "loss": 0.1279573082923889, + "step": 11610 + }, + { + "epoch": 1.6493967352732435, + "grad_norm": 11.802435874938965, + "learning_rate": 9.83514549325763e-05, + "loss": 0.14364974498748778, + 
"step": 11620 + }, + { + "epoch": 1.6508161816891413, + "grad_norm": 7.839514255523682, + "learning_rate": 9.83500354861604e-05, + "loss": 0.17981865406036376, + "step": 11630 + }, + { + "epoch": 1.652235628105039, + "grad_norm": 6.616874694824219, + "learning_rate": 9.83486160397445e-05, + "loss": 0.2129373550415039, + "step": 11640 + }, + { + "epoch": 1.6536550745209369, + "grad_norm": 2.111496925354004, + "learning_rate": 9.834719659332861e-05, + "loss": 0.21924855709075927, + "step": 11650 + }, + { + "epoch": 1.6550745209368345, + "grad_norm": 10.006966590881348, + "learning_rate": 9.834577714691271e-05, + "loss": 0.17941123247146606, + "step": 11660 + }, + { + "epoch": 1.6564939673527324, + "grad_norm": 5.636976718902588, + "learning_rate": 9.834435770049682e-05, + "loss": 0.166895854473114, + "step": 11670 + }, + { + "epoch": 1.6579134137686302, + "grad_norm": 1.7106539011001587, + "learning_rate": 9.834293825408092e-05, + "loss": 0.16953905820846557, + "step": 11680 + }, + { + "epoch": 1.659332860184528, + "grad_norm": 5.924720764160156, + "learning_rate": 9.834151880766501e-05, + "loss": 0.12511081695556642, + "step": 11690 + }, + { + "epoch": 1.660752306600426, + "grad_norm": 8.140963554382324, + "learning_rate": 9.834009936124911e-05, + "loss": 0.15308539867401122, + "step": 11700 + }, + { + "epoch": 1.6621717530163236, + "grad_norm": 2.5716195106506348, + "learning_rate": 9.833867991483322e-05, + "loss": 0.1372369647026062, + "step": 11710 + }, + { + "epoch": 1.6635911994322214, + "grad_norm": 7.952601909637451, + "learning_rate": 9.833726046841732e-05, + "loss": 0.14670779705047607, + "step": 11720 + }, + { + "epoch": 1.665010645848119, + "grad_norm": 1.4507794380187988, + "learning_rate": 9.833584102200143e-05, + "loss": 0.1868760108947754, + "step": 11730 + }, + { + "epoch": 1.666430092264017, + "grad_norm": 7.695814609527588, + "learning_rate": 9.833442157558553e-05, + "loss": 0.24691624641418458, + "step": 11740 + }, + { + "epoch": 
1.6678495386799148, + "grad_norm": 10.15262508392334, + "learning_rate": 9.833300212916962e-05, + "loss": 0.2450582504272461, + "step": 11750 + }, + { + "epoch": 1.6692689850958127, + "grad_norm": 5.300413131713867, + "learning_rate": 9.833158268275374e-05, + "loss": 0.17981985807418824, + "step": 11760 + }, + { + "epoch": 1.6706884315117105, + "grad_norm": 10.736809730529785, + "learning_rate": 9.833016323633783e-05, + "loss": 0.12192434072494507, + "step": 11770 + }, + { + "epoch": 1.6721078779276084, + "grad_norm": 2.6130592823028564, + "learning_rate": 9.832874378992194e-05, + "loss": 0.1472996473312378, + "step": 11780 + }, + { + "epoch": 1.673527324343506, + "grad_norm": 6.176468849182129, + "learning_rate": 9.832732434350603e-05, + "loss": 0.12378195524215699, + "step": 11790 + }, + { + "epoch": 1.6749467707594037, + "grad_norm": 12.4953031539917, + "learning_rate": 9.832590489709014e-05, + "loss": 0.18659558296203613, + "step": 11800 + }, + { + "epoch": 1.6763662171753015, + "grad_norm": 6.664957046508789, + "learning_rate": 9.832448545067424e-05, + "loss": 0.17845855951309203, + "step": 11810 + }, + { + "epoch": 1.6777856635911994, + "grad_norm": 4.767297267913818, + "learning_rate": 9.832306600425835e-05, + "loss": 0.20129690170288086, + "step": 11820 + }, + { + "epoch": 1.6792051100070973, + "grad_norm": 8.662429809570312, + "learning_rate": 9.832164655784244e-05, + "loss": 0.19204812049865722, + "step": 11830 + }, + { + "epoch": 1.6806245564229951, + "grad_norm": 4.443410873413086, + "learning_rate": 9.832022711142654e-05, + "loss": 0.17241191864013672, + "step": 11840 + }, + { + "epoch": 1.682044002838893, + "grad_norm": 6.706130027770996, + "learning_rate": 9.831880766501065e-05, + "loss": 0.14194031953811645, + "step": 11850 + }, + { + "epoch": 1.6834634492547906, + "grad_norm": 4.810044288635254, + "learning_rate": 9.831738821859475e-05, + "loss": 0.1292971134185791, + "step": 11860 + }, + { + "epoch": 1.6848828956706883, + "grad_norm": 
4.945130348205566, + "learning_rate": 9.831596877217886e-05, + "loss": 0.13104760646820068, + "step": 11870 + }, + { + "epoch": 1.6863023420865861, + "grad_norm": 7.412860870361328, + "learning_rate": 9.831454932576296e-05, + "loss": 0.18914194107055665, + "step": 11880 + }, + { + "epoch": 1.687721788502484, + "grad_norm": 1.9591195583343506, + "learning_rate": 9.831312987934706e-05, + "loss": 0.1756757378578186, + "step": 11890 + }, + { + "epoch": 1.6891412349183819, + "grad_norm": 2.857415199279785, + "learning_rate": 9.831171043293115e-05, + "loss": 0.10278797149658203, + "step": 11900 + }, + { + "epoch": 1.6905606813342797, + "grad_norm": 2.342369556427002, + "learning_rate": 9.831029098651526e-05, + "loss": 0.12141529321670533, + "step": 11910 + }, + { + "epoch": 1.6919801277501776, + "grad_norm": 5.84676456451416, + "learning_rate": 9.830887154009936e-05, + "loss": 0.20085587501525878, + "step": 11920 + }, + { + "epoch": 1.6933995741660752, + "grad_norm": 3.6309845447540283, + "learning_rate": 9.830745209368347e-05, + "loss": 0.15413752794265748, + "step": 11930 + }, + { + "epoch": 1.6948190205819729, + "grad_norm": 2.3892900943756104, + "learning_rate": 9.830603264726757e-05, + "loss": 0.15552257299423217, + "step": 11940 + }, + { + "epoch": 1.6962384669978707, + "grad_norm": 0.9857825636863708, + "learning_rate": 9.830461320085167e-05, + "loss": 0.15181114673614501, + "step": 11950 + }, + { + "epoch": 1.6976579134137686, + "grad_norm": 6.49855375289917, + "learning_rate": 9.830319375443578e-05, + "loss": 0.17083282470703126, + "step": 11960 + }, + { + "epoch": 1.6990773598296665, + "grad_norm": 1.0913960933685303, + "learning_rate": 9.830177430801988e-05, + "loss": 0.2133202314376831, + "step": 11970 + }, + { + "epoch": 1.7004968062455643, + "grad_norm": 4.437821388244629, + "learning_rate": 9.830035486160399e-05, + "loss": 0.0879701018333435, + "step": 11980 + }, + { + "epoch": 1.7019162526614622, + "grad_norm": 4.715758800506592, + "learning_rate": 
9.829893541518807e-05, + "loss": 0.15447641611099244, + "step": 11990 + }, + { + "epoch": 1.7033356990773598, + "grad_norm": 8.367589950561523, + "learning_rate": 9.829751596877218e-05, + "loss": 0.17715357542037963, + "step": 12000 + }, + { + "epoch": 1.7033356990773598, + "eval_accuracy": 0.9378775354485916, + "eval_loss": 0.17906926572322845, + "eval_runtime": 33.4925, + "eval_samples_per_second": 469.568, + "eval_steps_per_second": 14.69, + "step": 12000 + }, + { + "epoch": 1.7047551454932577, + "grad_norm": 8.013254165649414, + "learning_rate": 9.829609652235628e-05, + "loss": 0.1866832494735718, + "step": 12010 + }, + { + "epoch": 1.7061745919091553, + "grad_norm": 7.372905731201172, + "learning_rate": 9.829467707594039e-05, + "loss": 0.124139404296875, + "step": 12020 + }, + { + "epoch": 1.7075940383250532, + "grad_norm": 6.6865739822387695, + "learning_rate": 9.829325762952449e-05, + "loss": 0.12705342769622802, + "step": 12030 + }, + { + "epoch": 1.709013484740951, + "grad_norm": 4.504441738128662, + "learning_rate": 9.82918381831086e-05, + "loss": 0.1867109179496765, + "step": 12040 + }, + { + "epoch": 1.710432931156849, + "grad_norm": 1.8893638849258423, + "learning_rate": 9.82904187366927e-05, + "loss": 0.14493658542633056, + "step": 12050 + }, + { + "epoch": 1.7118523775727468, + "grad_norm": 1.72226083278656, + "learning_rate": 9.828899929027679e-05, + "loss": 0.1554844617843628, + "step": 12060 + }, + { + "epoch": 1.7132718239886444, + "grad_norm": 5.362784385681152, + "learning_rate": 9.82875798438609e-05, + "loss": 0.18286285400390626, + "step": 12070 + }, + { + "epoch": 1.7146912704045423, + "grad_norm": 9.535138130187988, + "learning_rate": 9.8286160397445e-05, + "loss": 0.1454553484916687, + "step": 12080 + }, + { + "epoch": 1.71611071682044, + "grad_norm": 5.757817268371582, + "learning_rate": 9.828474095102911e-05, + "loss": 0.14671599864959717, + "step": 12090 + }, + { + "epoch": 1.7175301632363378, + "grad_norm": 5.000237464904785, + 
"learning_rate": 9.82833215046132e-05, + "loss": 0.21178703308105468, + "step": 12100 + }, + { + "epoch": 1.7189496096522356, + "grad_norm": 5.827192306518555, + "learning_rate": 9.82819020581973e-05, + "loss": 0.21477718353271485, + "step": 12110 + }, + { + "epoch": 1.7203690560681335, + "grad_norm": 3.8673248291015625, + "learning_rate": 9.82804826117814e-05, + "loss": 0.2367461919784546, + "step": 12120 + }, + { + "epoch": 1.7217885024840314, + "grad_norm": 4.519773006439209, + "learning_rate": 9.827906316536551e-05, + "loss": 0.12398046255111694, + "step": 12130 + }, + { + "epoch": 1.723207948899929, + "grad_norm": 7.634313583374023, + "learning_rate": 9.827764371894961e-05, + "loss": 0.12134796380996704, + "step": 12140 + }, + { + "epoch": 1.7246273953158269, + "grad_norm": 7.9592766761779785, + "learning_rate": 9.827622427253371e-05, + "loss": 0.18058866262435913, + "step": 12150 + }, + { + "epoch": 1.7260468417317245, + "grad_norm": 6.438409805297852, + "learning_rate": 9.827480482611782e-05, + "loss": 0.17642263174057007, + "step": 12160 + }, + { + "epoch": 1.7274662881476224, + "grad_norm": 5.818785667419434, + "learning_rate": 9.827338537970192e-05, + "loss": 0.13319342136383056, + "step": 12170 + }, + { + "epoch": 1.7288857345635202, + "grad_norm": 5.536925315856934, + "learning_rate": 9.827196593328603e-05, + "loss": 0.13135639429092408, + "step": 12180 + }, + { + "epoch": 1.730305180979418, + "grad_norm": 5.665536403656006, + "learning_rate": 9.827054648687013e-05, + "loss": 0.12864874601364135, + "step": 12190 + }, + { + "epoch": 1.731724627395316, + "grad_norm": 5.198805809020996, + "learning_rate": 9.826912704045422e-05, + "loss": 0.09919618964195251, + "step": 12200 + }, + { + "epoch": 1.7331440738112136, + "grad_norm": 3.8186886310577393, + "learning_rate": 9.826770759403832e-05, + "loss": 0.15075846910476684, + "step": 12210 + }, + { + "epoch": 1.7345635202271115, + "grad_norm": 4.91066837310791, + "learning_rate": 9.826628814762243e-05, + 
"loss": 0.1283166766166687, + "step": 12220 + }, + { + "epoch": 1.735982966643009, + "grad_norm": 4.604067802429199, + "learning_rate": 9.826486870120653e-05, + "loss": 0.1516009211540222, + "step": 12230 + }, + { + "epoch": 1.737402413058907, + "grad_norm": 0.4906020164489746, + "learning_rate": 9.826344925479064e-05, + "loss": 0.1481213688850403, + "step": 12240 + }, + { + "epoch": 1.7388218594748048, + "grad_norm": 2.69415283203125, + "learning_rate": 9.826202980837474e-05, + "loss": 0.14420045614242555, + "step": 12250 + }, + { + "epoch": 1.7402413058907027, + "grad_norm": 10.119294166564941, + "learning_rate": 9.826061036195884e-05, + "loss": 0.1346837282180786, + "step": 12260 + }, + { + "epoch": 1.7416607523066006, + "grad_norm": 5.118008613586426, + "learning_rate": 9.825919091554295e-05, + "loss": 0.10409802198410034, + "step": 12270 + }, + { + "epoch": 1.7430801987224982, + "grad_norm": 9.627950668334961, + "learning_rate": 9.825777146912704e-05, + "loss": 0.12958219051361083, + "step": 12280 + }, + { + "epoch": 1.744499645138396, + "grad_norm": 7.486164093017578, + "learning_rate": 9.825635202271115e-05, + "loss": 0.15439097881317138, + "step": 12290 + }, + { + "epoch": 1.7459190915542937, + "grad_norm": 4.496451377868652, + "learning_rate": 9.825493257629524e-05, + "loss": 0.14370408058166503, + "step": 12300 + }, + { + "epoch": 1.7473385379701916, + "grad_norm": 1.7741354703903198, + "learning_rate": 9.825351312987935e-05, + "loss": 0.14793674945831298, + "step": 12310 + }, + { + "epoch": 1.7487579843860894, + "grad_norm": 6.230805397033691, + "learning_rate": 9.825209368346345e-05, + "loss": 0.12588064670562743, + "step": 12320 + }, + { + "epoch": 1.7501774308019873, + "grad_norm": 7.041757106781006, + "learning_rate": 9.825067423704756e-05, + "loss": 0.2671244144439697, + "step": 12330 + }, + { + "epoch": 1.7515968772178852, + "grad_norm": 8.067173957824707, + "learning_rate": 9.824925479063167e-05, + "loss": 0.18581972122192383, + "step": 12340 + }, 
+ { + "epoch": 1.7530163236337828, + "grad_norm": 6.106922626495361, + "learning_rate": 9.824783534421575e-05, + "loss": 0.16915748119354249, + "step": 12350 + }, + { + "epoch": 1.7544357700496807, + "grad_norm": 6.7981743812561035, + "learning_rate": 9.824641589779986e-05, + "loss": 0.12603729963302612, + "step": 12360 + }, + { + "epoch": 1.7558552164655783, + "grad_norm": 5.5388360023498535, + "learning_rate": 9.824499645138396e-05, + "loss": 0.1549227714538574, + "step": 12370 + }, + { + "epoch": 1.7572746628814762, + "grad_norm": 6.960907459259033, + "learning_rate": 9.824357700496807e-05, + "loss": 0.18172571659088135, + "step": 12380 + }, + { + "epoch": 1.758694109297374, + "grad_norm": 4.753782272338867, + "learning_rate": 9.824215755855217e-05, + "loss": 0.14021997451782225, + "step": 12390 + }, + { + "epoch": 1.7601135557132719, + "grad_norm": 3.4172661304473877, + "learning_rate": 9.824073811213628e-05, + "loss": 0.13940014839172363, + "step": 12400 + }, + { + "epoch": 1.7615330021291697, + "grad_norm": 2.0530076026916504, + "learning_rate": 9.823931866572036e-05, + "loss": 0.16023153066635132, + "step": 12410 + }, + { + "epoch": 1.7629524485450674, + "grad_norm": 9.870774269104004, + "learning_rate": 9.823789921930447e-05, + "loss": 0.1769045352935791, + "step": 12420 + }, + { + "epoch": 1.7643718949609652, + "grad_norm": 2.381181001663208, + "learning_rate": 9.823647977288859e-05, + "loss": 0.10290155410766602, + "step": 12430 + }, + { + "epoch": 1.7657913413768629, + "grad_norm": 0.6588567495346069, + "learning_rate": 9.823506032647268e-05, + "loss": 0.07668265104293823, + "step": 12440 + }, + { + "epoch": 1.7672107877927608, + "grad_norm": 8.259925842285156, + "learning_rate": 9.82336408800568e-05, + "loss": 0.10816916227340698, + "step": 12450 + }, + { + "epoch": 1.7686302342086586, + "grad_norm": 10.110259056091309, + "learning_rate": 9.823222143364088e-05, + "loss": 0.1543756604194641, + "step": 12460 + }, + { + "epoch": 1.7700496806245565, + 
"grad_norm": 1.5917772054672241, + "learning_rate": 9.823080198722499e-05, + "loss": 0.1755792737007141, + "step": 12470 + }, + { + "epoch": 1.7714691270404543, + "grad_norm": 4.567733287811279, + "learning_rate": 9.822938254080909e-05, + "loss": 0.09556171298027039, + "step": 12480 + }, + { + "epoch": 1.772888573456352, + "grad_norm": 4.524011611938477, + "learning_rate": 9.82279630943932e-05, + "loss": 0.11977797746658325, + "step": 12490 + }, + { + "epoch": 1.7743080198722498, + "grad_norm": 3.390681266784668, + "learning_rate": 9.82265436479773e-05, + "loss": 0.20999493598937988, + "step": 12500 + }, + { + "epoch": 1.7743080198722498, + "eval_accuracy": 0.9404845170725504, + "eval_loss": 0.17900405824184418, + "eval_runtime": 33.0963, + "eval_samples_per_second": 475.19, + "eval_steps_per_second": 14.866, + "step": 12500 + }, + { + "epoch": 1.7757274662881475, + "grad_norm": 6.486291885375977, + "learning_rate": 9.822512420156139e-05, + "loss": 0.15412837266921997, + "step": 12510 + }, + { + "epoch": 1.7771469127040453, + "grad_norm": 8.4727201461792, + "learning_rate": 9.82237047551455e-05, + "loss": 0.1553104877471924, + "step": 12520 + }, + { + "epoch": 1.7785663591199432, + "grad_norm": 7.080015182495117, + "learning_rate": 9.82222853087296e-05, + "loss": 0.17961130142211915, + "step": 12530 + }, + { + "epoch": 1.779985805535841, + "grad_norm": 3.5858380794525146, + "learning_rate": 9.822086586231371e-05, + "loss": 0.16834441423416138, + "step": 12540 + }, + { + "epoch": 1.781405251951739, + "grad_norm": 1.947180986404419, + "learning_rate": 9.821944641589781e-05, + "loss": 0.16140348911285402, + "step": 12550 + }, + { + "epoch": 1.7828246983676366, + "grad_norm": 4.678013801574707, + "learning_rate": 9.82180269694819e-05, + "loss": 0.17220114469528197, + "step": 12560 + }, + { + "epoch": 1.7842441447835344, + "grad_norm": 1.8858182430267334, + "learning_rate": 9.8216607523066e-05, + "loss": 0.11123390197753906, + "step": 12570 + }, + { + "epoch": 
1.785663591199432, + "grad_norm": 8.490455627441406, + "learning_rate": 9.821518807665011e-05, + "loss": 0.21482553482055664, + "step": 12580 + }, + { + "epoch": 1.78708303761533, + "grad_norm": 6.9470415115356445, + "learning_rate": 9.821376863023421e-05, + "loss": 0.22754549980163574, + "step": 12590 + }, + { + "epoch": 1.7885024840312278, + "grad_norm": 7.122620105743408, + "learning_rate": 9.821234918381832e-05, + "loss": 0.2618594169616699, + "step": 12600 + }, + { + "epoch": 1.7899219304471257, + "grad_norm": 4.771125316619873, + "learning_rate": 9.821092973740242e-05, + "loss": 0.1289076805114746, + "step": 12610 + }, + { + "epoch": 1.7913413768630235, + "grad_norm": 1.8268935680389404, + "learning_rate": 9.820951029098652e-05, + "loss": 0.18204834461212158, + "step": 12620 + }, + { + "epoch": 1.7927608232789212, + "grad_norm": 5.549787521362305, + "learning_rate": 9.820809084457063e-05, + "loss": 0.14632033109664916, + "step": 12630 + }, + { + "epoch": 1.794180269694819, + "grad_norm": 4.965446949005127, + "learning_rate": 9.820667139815473e-05, + "loss": 0.14237403869628906, + "step": 12640 + }, + { + "epoch": 1.7955997161107167, + "grad_norm": 3.6704654693603516, + "learning_rate": 9.820525195173884e-05, + "loss": 0.14324573278427125, + "step": 12650 + }, + { + "epoch": 1.7970191625266145, + "grad_norm": 2.443148612976074, + "learning_rate": 9.820383250532292e-05, + "loss": 0.1546507477760315, + "step": 12660 + }, + { + "epoch": 1.7984386089425124, + "grad_norm": 8.586228370666504, + "learning_rate": 9.820241305890703e-05, + "loss": 0.17691378593444823, + "step": 12670 + }, + { + "epoch": 1.7998580553584103, + "grad_norm": 3.938798666000366, + "learning_rate": 9.820099361249113e-05, + "loss": 0.11685086488723755, + "step": 12680 + }, + { + "epoch": 1.8012775017743081, + "grad_norm": 10.324106216430664, + "learning_rate": 9.819957416607524e-05, + "loss": 0.1108386754989624, + "step": 12690 + }, + { + "epoch": 1.802696948190206, + "grad_norm": 
5.7965087890625, + "learning_rate": 9.819815471965934e-05, + "loss": 0.173872172832489, + "step": 12700 + }, + { + "epoch": 1.8041163946061036, + "grad_norm": 6.263943195343018, + "learning_rate": 9.819673527324343e-05, + "loss": 0.12461161613464355, + "step": 12710 + }, + { + "epoch": 1.8055358410220013, + "grad_norm": 3.52416729927063, + "learning_rate": 9.819531582682754e-05, + "loss": 0.1361951231956482, + "step": 12720 + }, + { + "epoch": 1.8069552874378991, + "grad_norm": 3.2541964054107666, + "learning_rate": 9.819389638041164e-05, + "loss": 0.13711843490600586, + "step": 12730 + }, + { + "epoch": 1.808374733853797, + "grad_norm": 2.708355188369751, + "learning_rate": 9.819247693399575e-05, + "loss": 0.16509486436843873, + "step": 12740 + }, + { + "epoch": 1.8097941802696949, + "grad_norm": 8.279736518859863, + "learning_rate": 9.819105748757985e-05, + "loss": 0.15762121677398683, + "step": 12750 + }, + { + "epoch": 1.8112136266855927, + "grad_norm": 4.580092906951904, + "learning_rate": 9.818963804116396e-05, + "loss": 0.1657193422317505, + "step": 12760 + }, + { + "epoch": 1.8126330731014906, + "grad_norm": 6.182056903839111, + "learning_rate": 9.818821859474805e-05, + "loss": 0.09075002670288086, + "step": 12770 + }, + { + "epoch": 1.8140525195173882, + "grad_norm": 2.8882968425750732, + "learning_rate": 9.818679914833216e-05, + "loss": 0.11564161777496337, + "step": 12780 + }, + { + "epoch": 1.8154719659332859, + "grad_norm": 1.9291869401931763, + "learning_rate": 9.818537970191625e-05, + "loss": 0.1788640022277832, + "step": 12790 + }, + { + "epoch": 1.8168914123491837, + "grad_norm": 1.8585617542266846, + "learning_rate": 9.818396025550036e-05, + "loss": 0.14054034948348998, + "step": 12800 + }, + { + "epoch": 1.8183108587650816, + "grad_norm": 3.6257970333099365, + "learning_rate": 9.818254080908446e-05, + "loss": 0.1330336332321167, + "step": 12810 + }, + { + "epoch": 1.8197303051809794, + "grad_norm": 6.263546943664551, + "learning_rate": 
9.818112136266856e-05, + "loss": 0.1774816632270813, + "step": 12820 + }, + { + "epoch": 1.8211497515968773, + "grad_norm": 10.41680908203125, + "learning_rate": 9.817970191625267e-05, + "loss": 0.1763577938079834, + "step": 12830 + }, + { + "epoch": 1.8225691980127752, + "grad_norm": 9.07449722290039, + "learning_rate": 9.817828246983677e-05, + "loss": 0.1599531054496765, + "step": 12840 + }, + { + "epoch": 1.8239886444286728, + "grad_norm": 7.387566089630127, + "learning_rate": 9.817686302342088e-05, + "loss": 0.14263440370559693, + "step": 12850 + }, + { + "epoch": 1.8254080908445705, + "grad_norm": 5.237459659576416, + "learning_rate": 9.817544357700498e-05, + "loss": 0.22102766036987304, + "step": 12860 + }, + { + "epoch": 1.8268275372604683, + "grad_norm": 2.364966630935669, + "learning_rate": 9.817402413058907e-05, + "loss": 0.10828995704650879, + "step": 12870 + }, + { + "epoch": 1.8282469836763662, + "grad_norm": 4.197632789611816, + "learning_rate": 9.817260468417317e-05, + "loss": 0.11210172176361084, + "step": 12880 + }, + { + "epoch": 1.829666430092264, + "grad_norm": 9.747461318969727, + "learning_rate": 9.817118523775728e-05, + "loss": 0.20235188007354737, + "step": 12890 + }, + { + "epoch": 1.831085876508162, + "grad_norm": 1.4320733547210693, + "learning_rate": 9.816976579134138e-05, + "loss": 0.11145485639572143, + "step": 12900 + }, + { + "epoch": 1.8325053229240598, + "grad_norm": 4.429521560668945, + "learning_rate": 9.816834634492549e-05, + "loss": 0.10955873727798462, + "step": 12910 + }, + { + "epoch": 1.8339247693399574, + "grad_norm": 6.954484462738037, + "learning_rate": 9.816692689850959e-05, + "loss": 0.15254650115966797, + "step": 12920 + }, + { + "epoch": 1.8353442157558553, + "grad_norm": 5.583377361297607, + "learning_rate": 9.816550745209368e-05, + "loss": 0.17690763473510743, + "step": 12930 + }, + { + "epoch": 1.836763662171753, + "grad_norm": 5.169642925262451, + "learning_rate": 9.81640880056778e-05, + "loss": 
0.1680360794067383, + "step": 12940 + }, + { + "epoch": 1.8381831085876508, + "grad_norm": 10.711297988891602, + "learning_rate": 9.816266855926189e-05, + "loss": 0.20626237392425537, + "step": 12950 + }, + { + "epoch": 1.8396025550035486, + "grad_norm": 6.396773338317871, + "learning_rate": 9.8161249112846e-05, + "loss": 0.12390644550323486, + "step": 12960 + }, + { + "epoch": 1.8410220014194465, + "grad_norm": 6.008213996887207, + "learning_rate": 9.815982966643009e-05, + "loss": 0.16526665687561035, + "step": 12970 + }, + { + "epoch": 1.8424414478353444, + "grad_norm": 2.8224973678588867, + "learning_rate": 9.81584102200142e-05, + "loss": 0.15004030466079712, + "step": 12980 + }, + { + "epoch": 1.843860894251242, + "grad_norm": 3.8376224040985107, + "learning_rate": 9.81569907735983e-05, + "loss": 0.12394638061523437, + "step": 12990 + }, + { + "epoch": 1.8452803406671399, + "grad_norm": 4.487581253051758, + "learning_rate": 9.81555713271824e-05, + "loss": 0.12469573020935058, + "step": 13000 + }, + { + "epoch": 1.8452803406671399, + "eval_accuracy": 0.9322184777770712, + "eval_loss": 0.20526456832885742, + "eval_runtime": 32.1483, + "eval_samples_per_second": 489.202, + "eval_steps_per_second": 15.304, + "step": 13000 + }, + { + "epoch": 1.8466997870830375, + "grad_norm": 7.591648101806641, + "learning_rate": 9.81541518807665e-05, + "loss": 0.1824552297592163, + "step": 13010 + }, + { + "epoch": 1.8481192334989354, + "grad_norm": 2.9393680095672607, + "learning_rate": 9.81527324343506e-05, + "loss": 0.18779258728027343, + "step": 13020 + }, + { + "epoch": 1.8495386799148332, + "grad_norm": 4.982316493988037, + "learning_rate": 9.815131298793471e-05, + "loss": 0.1856153726577759, + "step": 13030 + }, + { + "epoch": 1.850958126330731, + "grad_norm": 4.3030242919921875, + "learning_rate": 9.814989354151881e-05, + "loss": 0.13149327039718628, + "step": 13040 + }, + { + "epoch": 1.852377572746629, + "grad_norm": 3.1720340251922607, + "learning_rate": 
9.814847409510292e-05, + "loss": 0.17401224374771118, + "step": 13050 + }, + { + "epoch": 1.8537970191625266, + "grad_norm": 5.330498218536377, + "learning_rate": 9.814705464868702e-05, + "loss": 0.18381781578063966, + "step": 13060 + }, + { + "epoch": 1.8552164655784245, + "grad_norm": 3.171062469482422, + "learning_rate": 9.814563520227113e-05, + "loss": 0.09782277941703796, + "step": 13070 + }, + { + "epoch": 1.856635911994322, + "grad_norm": 3.653743267059326, + "learning_rate": 9.814421575585521e-05, + "loss": 0.14549950361251832, + "step": 13080 + }, + { + "epoch": 1.85805535841022, + "grad_norm": 2.782893180847168, + "learning_rate": 9.814279630943932e-05, + "loss": 0.1609262704849243, + "step": 13090 + }, + { + "epoch": 1.8594748048261178, + "grad_norm": 7.247891426086426, + "learning_rate": 9.814137686302342e-05, + "loss": 0.14557520151138306, + "step": 13100 + }, + { + "epoch": 1.8608942512420157, + "grad_norm": 4.025136947631836, + "learning_rate": 9.813995741660753e-05, + "loss": 0.06900943517684936, + "step": 13110 + }, + { + "epoch": 1.8623136976579135, + "grad_norm": 2.248847007751465, + "learning_rate": 9.813853797019163e-05, + "loss": 0.12486515045166016, + "step": 13120 + }, + { + "epoch": 1.8637331440738112, + "grad_norm": 9.784401893615723, + "learning_rate": 9.813711852377573e-05, + "loss": 0.12270998954772949, + "step": 13130 + }, + { + "epoch": 1.865152590489709, + "grad_norm": 4.735940456390381, + "learning_rate": 9.813569907735984e-05, + "loss": 0.2059864282608032, + "step": 13140 + }, + { + "epoch": 1.8665720369056067, + "grad_norm": 5.477226257324219, + "learning_rate": 9.813427963094394e-05, + "loss": 0.10135586261749267, + "step": 13150 + }, + { + "epoch": 1.8679914833215046, + "grad_norm": 5.485146522521973, + "learning_rate": 9.813286018452805e-05, + "loss": 0.18213980197906493, + "step": 13160 + }, + { + "epoch": 1.8694109297374024, + "grad_norm": 4.844747543334961, + "learning_rate": 9.813144073811214e-05, + "loss": 
0.10833338499069214, + "step": 13170 + }, + { + "epoch": 1.8708303761533003, + "grad_norm": 12.112831115722656, + "learning_rate": 9.813002129169624e-05, + "loss": 0.1866260290145874, + "step": 13180 + }, + { + "epoch": 1.8722498225691981, + "grad_norm": 1.797105073928833, + "learning_rate": 9.812860184528034e-05, + "loss": 0.1560835361480713, + "step": 13190 + }, + { + "epoch": 1.8736692689850958, + "grad_norm": 8.335697174072266, + "learning_rate": 9.812718239886445e-05, + "loss": 0.11914796829223633, + "step": 13200 + }, + { + "epoch": 1.8750887154009936, + "grad_norm": 4.479477405548096, + "learning_rate": 9.812576295244855e-05, + "loss": 0.18317773342132568, + "step": 13210 + }, + { + "epoch": 1.8765081618168913, + "grad_norm": 1.5853248834609985, + "learning_rate": 9.812434350603266e-05, + "loss": 0.09048664569854736, + "step": 13220 + }, + { + "epoch": 1.8779276082327891, + "grad_norm": 4.840945243835449, + "learning_rate": 9.812292405961675e-05, + "loss": 0.13578274250030517, + "step": 13230 + }, + { + "epoch": 1.879347054648687, + "grad_norm": 11.123950958251953, + "learning_rate": 9.812150461320085e-05, + "loss": 0.17634526491165162, + "step": 13240 + }, + { + "epoch": 1.8807665010645849, + "grad_norm": 4.322571754455566, + "learning_rate": 9.812008516678496e-05, + "loss": 0.10883429050445556, + "step": 13250 + }, + { + "epoch": 1.8821859474804827, + "grad_norm": 4.164629936218262, + "learning_rate": 9.811866572036906e-05, + "loss": 0.15946507453918457, + "step": 13260 + }, + { + "epoch": 1.8836053938963804, + "grad_norm": 4.701801300048828, + "learning_rate": 9.811724627395317e-05, + "loss": 0.15585731267929076, + "step": 13270 + }, + { + "epoch": 1.8850248403122782, + "grad_norm": 6.6244916915893555, + "learning_rate": 9.811582682753726e-05, + "loss": 0.1586725354194641, + "step": 13280 + }, + { + "epoch": 1.8864442867281759, + "grad_norm": 5.30622673034668, + "learning_rate": 9.811440738112137e-05, + "loss": 0.16929301023483276, + "step": 13290 + }, + 
{ + "epoch": 1.8878637331440737, + "grad_norm": 7.866292476654053, + "learning_rate": 9.811298793470546e-05, + "loss": 0.1626114845275879, + "step": 13300 + }, + { + "epoch": 1.8892831795599716, + "grad_norm": 3.1928579807281494, + "learning_rate": 9.811156848828957e-05, + "loss": 0.11974685192108155, + "step": 13310 + }, + { + "epoch": 1.8907026259758695, + "grad_norm": 3.165278196334839, + "learning_rate": 9.811014904187367e-05, + "loss": 0.17966209650039672, + "step": 13320 + }, + { + "epoch": 1.8921220723917673, + "grad_norm": 7.965559959411621, + "learning_rate": 9.810872959545777e-05, + "loss": 0.1445131778717041, + "step": 13330 + }, + { + "epoch": 1.893541518807665, + "grad_norm": 7.0571722984313965, + "learning_rate": 9.810745209368347e-05, + "loss": 0.1508271336555481, + "step": 13340 + }, + { + "epoch": 1.8949609652235628, + "grad_norm": 6.5066351890563965, + "learning_rate": 9.810603264726757e-05, + "loss": 0.2136392116546631, + "step": 13350 + }, + { + "epoch": 1.8963804116394605, + "grad_norm": 5.8861517906188965, + "learning_rate": 9.810461320085168e-05, + "loss": 0.11962813138961792, + "step": 13360 + }, + { + "epoch": 1.8977998580553583, + "grad_norm": 12.299768447875977, + "learning_rate": 9.810319375443577e-05, + "loss": 0.14256292581558228, + "step": 13370 + }, + { + "epoch": 1.8992193044712562, + "grad_norm": 10.79692554473877, + "learning_rate": 9.810177430801988e-05, + "loss": 0.16675705909729005, + "step": 13380 + }, + { + "epoch": 1.900638750887154, + "grad_norm": 4.968460559844971, + "learning_rate": 9.810035486160398e-05, + "loss": 0.18271161317825318, + "step": 13390 + }, + { + "epoch": 1.902058197303052, + "grad_norm": 6.083104133605957, + "learning_rate": 9.809893541518809e-05, + "loss": 0.19613151550292968, + "step": 13400 + }, + { + "epoch": 1.9034776437189496, + "grad_norm": 7.929781913757324, + "learning_rate": 9.809751596877218e-05, + "loss": 0.12828643321990968, + "step": 13410 + }, + { + "epoch": 1.9048970901348474, + 
"grad_norm": 10.386966705322266, + "learning_rate": 9.809609652235629e-05, + "loss": 0.1059008240699768, + "step": 13420 + }, + { + "epoch": 1.906316536550745, + "grad_norm": 9.958741188049316, + "learning_rate": 9.809467707594038e-05, + "loss": 0.17238779067993165, + "step": 13430 + }, + { + "epoch": 1.907735982966643, + "grad_norm": 7.629611492156982, + "learning_rate": 9.80932576295245e-05, + "loss": 0.11009730100631714, + "step": 13440 + }, + { + "epoch": 1.9091554293825408, + "grad_norm": 4.110402584075928, + "learning_rate": 9.809183818310859e-05, + "loss": 0.15767955780029297, + "step": 13450 + }, + { + "epoch": 1.9105748757984387, + "grad_norm": 5.907031059265137, + "learning_rate": 9.809041873669269e-05, + "loss": 0.11883927583694458, + "step": 13460 + }, + { + "epoch": 1.9119943222143365, + "grad_norm": 6.367669105529785, + "learning_rate": 9.80889992902768e-05, + "loss": 0.15383024215698243, + "step": 13470 + }, + { + "epoch": 1.9134137686302342, + "grad_norm": 11.253113746643066, + "learning_rate": 9.80875798438609e-05, + "loss": 0.18761264085769652, + "step": 13480 + }, + { + "epoch": 1.914833215046132, + "grad_norm": 8.148927688598633, + "learning_rate": 9.808616039744501e-05, + "loss": 0.1913072109222412, + "step": 13490 + }, + { + "epoch": 1.9162526614620297, + "grad_norm": 5.086034774780273, + "learning_rate": 9.808474095102911e-05, + "loss": 0.1331562876701355, + "step": 13500 + }, + { + "epoch": 1.9162526614620297, + "eval_accuracy": 0.9270680994468112, + "eval_loss": 0.20431001484394073, + "eval_runtime": 33.1047, + "eval_samples_per_second": 475.068, + "eval_steps_per_second": 14.862, + "step": 13500 + }, + { + "epoch": 1.9176721078779275, + "grad_norm": 8.143988609313965, + "learning_rate": 9.80833215046132e-05, + "loss": 0.16751954555511475, + "step": 13510 + }, + { + "epoch": 1.9190915542938254, + "grad_norm": 8.666000366210938, + "learning_rate": 9.80819020581973e-05, + "loss": 0.10578331947326661, + "step": 13520 + }, + { + "epoch": 
1.9205110007097232, + "grad_norm": 2.205212116241455, + "learning_rate": 9.808048261178141e-05, + "loss": 0.16295469999313356, + "step": 13530 + }, + { + "epoch": 1.921930447125621, + "grad_norm": 3.5031938552856445, + "learning_rate": 9.807906316536551e-05, + "loss": 0.19274975061416627, + "step": 13540 + }, + { + "epoch": 1.923349893541519, + "grad_norm": 6.0588884353637695, + "learning_rate": 9.807764371894962e-05, + "loss": 0.1572549819946289, + "step": 13550 + }, + { + "epoch": 1.9247693399574166, + "grad_norm": 5.022733688354492, + "learning_rate": 9.807622427253372e-05, + "loss": 0.1502652645111084, + "step": 13560 + }, + { + "epoch": 1.9261887863733143, + "grad_norm": 6.909353733062744, + "learning_rate": 9.807480482611782e-05, + "loss": 0.19446460008621216, + "step": 13570 + }, + { + "epoch": 1.9276082327892121, + "grad_norm": 4.539268970489502, + "learning_rate": 9.807338537970193e-05, + "loss": 0.1496061086654663, + "step": 13580 + }, + { + "epoch": 1.92902767920511, + "grad_norm": 5.273926258087158, + "learning_rate": 9.807196593328602e-05, + "loss": 0.1780215859413147, + "step": 13590 + }, + { + "epoch": 1.9304471256210078, + "grad_norm": 4.610520362854004, + "learning_rate": 9.807054648687014e-05, + "loss": 0.12462868690490722, + "step": 13600 + }, + { + "epoch": 1.9318665720369057, + "grad_norm": 7.675487041473389, + "learning_rate": 9.806912704045422e-05, + "loss": 0.17334070205688476, + "step": 13610 + }, + { + "epoch": 1.9332860184528036, + "grad_norm": 7.004896640777588, + "learning_rate": 9.806770759403833e-05, + "loss": 0.15332577228546143, + "step": 13620 + }, + { + "epoch": 1.9347054648687012, + "grad_norm": 2.8662800788879395, + "learning_rate": 9.806628814762243e-05, + "loss": 0.12613468170166015, + "step": 13630 + }, + { + "epoch": 1.9361249112845988, + "grad_norm": 3.3417696952819824, + "learning_rate": 9.806486870120654e-05, + "loss": 0.11488528251647949, + "step": 13640 + }, + { + "epoch": 1.9375443577004967, + "grad_norm": 
8.002215385437012, + "learning_rate": 9.806344925479064e-05, + "loss": 0.12292193174362183, + "step": 13650 + }, + { + "epoch": 1.9389638041163946, + "grad_norm": 3.650278091430664, + "learning_rate": 9.806202980837473e-05, + "loss": 0.15752785205841063, + "step": 13660 + }, + { + "epoch": 1.9403832505322924, + "grad_norm": 3.4982657432556152, + "learning_rate": 9.806061036195884e-05, + "loss": 0.13047711849212645, + "step": 13670 + }, + { + "epoch": 1.9418026969481903, + "grad_norm": 7.711712837219238, + "learning_rate": 9.805919091554294e-05, + "loss": 0.144749915599823, + "step": 13680 + }, + { + "epoch": 1.9432221433640882, + "grad_norm": 5.939789772033691, + "learning_rate": 9.805777146912705e-05, + "loss": 0.13807902336120606, + "step": 13690 + }, + { + "epoch": 1.9446415897799858, + "grad_norm": 3.993557929992676, + "learning_rate": 9.805635202271115e-05, + "loss": 0.1018330454826355, + "step": 13700 + }, + { + "epoch": 1.9460610361958834, + "grad_norm": 6.909927845001221, + "learning_rate": 9.805493257629525e-05, + "loss": 0.1758143424987793, + "step": 13710 + }, + { + "epoch": 1.9474804826117813, + "grad_norm": 4.5612993240356445, + "learning_rate": 9.805351312987934e-05, + "loss": 0.13746780157089233, + "step": 13720 + }, + { + "epoch": 1.9488999290276792, + "grad_norm": 0.8813110589981079, + "learning_rate": 9.805209368346346e-05, + "loss": 0.13282377719879152, + "step": 13730 + }, + { + "epoch": 1.950319375443577, + "grad_norm": 4.4625630378723145, + "learning_rate": 9.805067423704755e-05, + "loss": 0.19286319017410278, + "step": 13740 + }, + { + "epoch": 1.951738821859475, + "grad_norm": 6.587796688079834, + "learning_rate": 9.804925479063166e-05, + "loss": 0.1381397008895874, + "step": 13750 + }, + { + "epoch": 1.9531582682753728, + "grad_norm": 7.006091594696045, + "learning_rate": 9.804783534421576e-05, + "loss": 0.10776946544647217, + "step": 13760 + }, + { + "epoch": 1.9545777146912704, + "grad_norm": 6.6057257652282715, + "learning_rate": 
9.804641589779986e-05, + "loss": 0.1551327109336853, + "step": 13770 + }, + { + "epoch": 1.9559971611071683, + "grad_norm": 2.855726480484009, + "learning_rate": 9.804499645138397e-05, + "loss": 0.14515860080718995, + "step": 13780 + }, + { + "epoch": 1.957416607523066, + "grad_norm": 4.859558582305908, + "learning_rate": 9.804357700496807e-05, + "loss": 0.13317285776138305, + "step": 13790 + }, + { + "epoch": 1.9588360539389638, + "grad_norm": 4.010891437530518, + "learning_rate": 9.804215755855218e-05, + "loss": 0.21571955680847169, + "step": 13800 + }, + { + "epoch": 1.9602555003548616, + "grad_norm": 1.5958309173583984, + "learning_rate": 9.804073811213627e-05, + "loss": 0.11179524660110474, + "step": 13810 + }, + { + "epoch": 1.9616749467707595, + "grad_norm": 4.728942394256592, + "learning_rate": 9.803931866572037e-05, + "loss": 0.12224637269973755, + "step": 13820 + }, + { + "epoch": 1.9630943931866573, + "grad_norm": 5.639578342437744, + "learning_rate": 9.803789921930447e-05, + "loss": 0.10692014694213867, + "step": 13830 + }, + { + "epoch": 1.964513839602555, + "grad_norm": 3.7262027263641357, + "learning_rate": 9.803647977288858e-05, + "loss": 0.1023218035697937, + "step": 13840 + }, + { + "epoch": 1.9659332860184529, + "grad_norm": 6.50256872177124, + "learning_rate": 9.803506032647268e-05, + "loss": 0.12723206281661986, + "step": 13850 + }, + { + "epoch": 1.9673527324343505, + "grad_norm": 2.4793450832366943, + "learning_rate": 9.803364088005679e-05, + "loss": 0.18150064945220948, + "step": 13860 + }, + { + "epoch": 1.9687721788502484, + "grad_norm": 8.015069961547852, + "learning_rate": 9.803222143364089e-05, + "loss": 0.13160840272903443, + "step": 13870 + }, + { + "epoch": 1.9701916252661462, + "grad_norm": 2.3164284229278564, + "learning_rate": 9.803080198722498e-05, + "loss": 0.13569587469100952, + "step": 13880 + }, + { + "epoch": 1.971611071682044, + "grad_norm": 5.398233413696289, + "learning_rate": 9.80293825408091e-05, + "loss": 
0.10830456018447876, + "step": 13890 + }, + { + "epoch": 1.973030518097942, + "grad_norm": 4.58472204208374, + "learning_rate": 9.802796309439319e-05, + "loss": 0.12152203321456909, + "step": 13900 + }, + { + "epoch": 1.9744499645138396, + "grad_norm": 3.399158239364624, + "learning_rate": 9.80265436479773e-05, + "loss": 0.09602898955345154, + "step": 13910 + }, + { + "epoch": 1.9758694109297374, + "grad_norm": 5.37898063659668, + "learning_rate": 9.802512420156139e-05, + "loss": 0.16220704317092896, + "step": 13920 + }, + { + "epoch": 1.977288857345635, + "grad_norm": 8.282011985778809, + "learning_rate": 9.80237047551455e-05, + "loss": 0.1817216157913208, + "step": 13930 + }, + { + "epoch": 1.978708303761533, + "grad_norm": 8.454946517944336, + "learning_rate": 9.80222853087296e-05, + "loss": 0.10207384824752808, + "step": 13940 + }, + { + "epoch": 1.9801277501774308, + "grad_norm": 5.604420185089111, + "learning_rate": 9.80208658623137e-05, + "loss": 0.13896651268005372, + "step": 13950 + }, + { + "epoch": 1.9815471965933287, + "grad_norm": 5.782528400421143, + "learning_rate": 9.80194464158978e-05, + "loss": 0.16523996591567994, + "step": 13960 + }, + { + "epoch": 1.9829666430092265, + "grad_norm": 7.257541656494141, + "learning_rate": 9.80180269694819e-05, + "loss": 0.1670131802558899, + "step": 13970 + }, + { + "epoch": 1.9843860894251242, + "grad_norm": 1.4823135137557983, + "learning_rate": 9.801660752306601e-05, + "loss": 0.09150451421737671, + "step": 13980 + }, + { + "epoch": 1.985805535841022, + "grad_norm": 11.689827919006348, + "learning_rate": 9.801518807665011e-05, + "loss": 0.12286759614944458, + "step": 13990 + }, + { + "epoch": 1.9872249822569197, + "grad_norm": 2.379868268966675, + "learning_rate": 9.801376863023422e-05, + "loss": 0.08730307221412659, + "step": 14000 + }, + { + "epoch": 1.9872249822569197, + "eval_accuracy": 0.9489413111210021, + "eval_loss": 0.14637306332588196, + "eval_runtime": 33.0818, + "eval_samples_per_second": 475.397, + 
"eval_steps_per_second": 14.872, + "step": 14000 + }, + { + "epoch": 1.9886444286728175, + "grad_norm": 3.562831163406372, + "learning_rate": 9.801234918381832e-05, + "loss": 0.10573784112930298, + "step": 14010 + }, + { + "epoch": 1.9900638750887154, + "grad_norm": 1.7032339572906494, + "learning_rate": 9.801092973740241e-05, + "loss": 0.1144748330116272, + "step": 14020 + }, + { + "epoch": 1.9914833215046133, + "grad_norm": 9.984017372131348, + "learning_rate": 9.800951029098651e-05, + "loss": 0.2368067979812622, + "step": 14030 + }, + { + "epoch": 1.9929027679205111, + "grad_norm": 4.510107517242432, + "learning_rate": 9.800809084457062e-05, + "loss": 0.11444370746612549, + "step": 14040 + }, + { + "epoch": 1.9943222143364088, + "grad_norm": 2.9397714138031006, + "learning_rate": 9.800667139815472e-05, + "loss": 0.08882022500038148, + "step": 14050 + }, + { + "epoch": 1.9957416607523066, + "grad_norm": 5.492639064788818, + "learning_rate": 9.800525195173883e-05, + "loss": 0.13332669734954833, + "step": 14060 + }, + { + "epoch": 1.9971611071682043, + "grad_norm": 6.94230318069458, + "learning_rate": 9.800383250532293e-05, + "loss": 0.1107181191444397, + "step": 14070 + }, + { + "epoch": 1.9985805535841021, + "grad_norm": 1.4583178758621216, + "learning_rate": 9.800241305890703e-05, + "loss": 0.1853145956993103, + "step": 14080 + }, + { + "epoch": 2.0, + "grad_norm": 3.6740102767944336, + "learning_rate": 9.800099361249114e-05, + "loss": 0.1035921812057495, + "step": 14090 + }, + { + "epoch": 2.001419446415898, + "grad_norm": 7.763698101043701, + "learning_rate": 9.799957416607523e-05, + "loss": 0.11998735666275025, + "step": 14100 + }, + { + "epoch": 2.0028388928317957, + "grad_norm": 9.761672019958496, + "learning_rate": 9.799815471965935e-05, + "loss": 0.130437171459198, + "step": 14110 + }, + { + "epoch": 2.0042583392476936, + "grad_norm": 6.725173473358154, + "learning_rate": 9.799673527324344e-05, + "loss": 0.1438794732093811, + "step": 14120 + }, + { + 
"epoch": 2.005677785663591, + "grad_norm": 2.627002477645874, + "learning_rate": 9.799531582682754e-05, + "loss": 0.13035544157028198, + "step": 14130 + }, + { + "epoch": 2.007097232079489, + "grad_norm": 1.8587443828582764, + "learning_rate": 9.799389638041164e-05, + "loss": 0.10760440826416015, + "step": 14140 + }, + { + "epoch": 2.0085166784953867, + "grad_norm": 5.432860851287842, + "learning_rate": 9.799247693399575e-05, + "loss": 0.08797118067741394, + "step": 14150 + }, + { + "epoch": 2.0099361249112846, + "grad_norm": 8.000253677368164, + "learning_rate": 9.799105748757985e-05, + "loss": 0.13834741115570068, + "step": 14160 + }, + { + "epoch": 2.0113555713271825, + "grad_norm": 4.846225738525391, + "learning_rate": 9.798963804116396e-05, + "loss": 0.1457647442817688, + "step": 14170 + }, + { + "epoch": 2.0127750177430803, + "grad_norm": 11.00196361541748, + "learning_rate": 9.798821859474805e-05, + "loss": 0.1213072657585144, + "step": 14180 + }, + { + "epoch": 2.014194464158978, + "grad_norm": 10.398648262023926, + "learning_rate": 9.798679914833215e-05, + "loss": 0.13774160146713257, + "step": 14190 + }, + { + "epoch": 2.0156139105748756, + "grad_norm": 2.693225145339966, + "learning_rate": 9.798537970191626e-05, + "loss": 0.1436489462852478, + "step": 14200 + }, + { + "epoch": 2.0170333569907735, + "grad_norm": 2.0098676681518555, + "learning_rate": 9.798396025550036e-05, + "loss": 0.11806844472885132, + "step": 14210 + }, + { + "epoch": 2.0184528034066713, + "grad_norm": 3.5687620639801025, + "learning_rate": 9.798254080908447e-05, + "loss": 0.11548566818237305, + "step": 14220 + }, + { + "epoch": 2.019872249822569, + "grad_norm": 4.691004276275635, + "learning_rate": 9.798112136266855e-05, + "loss": 0.11631312370300292, + "step": 14230 + }, + { + "epoch": 2.021291696238467, + "grad_norm": 5.144685745239258, + "learning_rate": 9.797970191625267e-05, + "loss": 0.08912101984024048, + "step": 14240 + }, + { + "epoch": 2.022711142654365, + "grad_norm": 
10.743430137634277, + "learning_rate": 9.797828246983676e-05, + "loss": 0.10923216342926026, + "step": 14250 + }, + { + "epoch": 2.0241305890702628, + "grad_norm": 1.788232445716858, + "learning_rate": 9.797686302342087e-05, + "loss": 0.10165914297103881, + "step": 14260 + }, + { + "epoch": 2.02555003548616, + "grad_norm": 1.6243984699249268, + "learning_rate": 9.797544357700497e-05, + "loss": 0.07863327860832214, + "step": 14270 + }, + { + "epoch": 2.026969481902058, + "grad_norm": 4.447552680969238, + "learning_rate": 9.797402413058907e-05, + "loss": 0.09306793808937072, + "step": 14280 + }, + { + "epoch": 2.028388928317956, + "grad_norm": 6.648647308349609, + "learning_rate": 9.797260468417318e-05, + "loss": 0.13603001832962036, + "step": 14290 + }, + { + "epoch": 2.029808374733854, + "grad_norm": 6.4532952308654785, + "learning_rate": 9.797118523775728e-05, + "loss": 0.13374946117401124, + "step": 14300 + }, + { + "epoch": 2.0312278211497516, + "grad_norm": 3.549644708633423, + "learning_rate": 9.796976579134139e-05, + "loss": 0.11156256198883056, + "step": 14310 + }, + { + "epoch": 2.0326472675656495, + "grad_norm": 5.188971042633057, + "learning_rate": 9.796834634492548e-05, + "loss": 0.1163739800453186, + "step": 14320 + }, + { + "epoch": 2.0340667139815474, + "grad_norm": 2.5170130729675293, + "learning_rate": 9.796692689850958e-05, + "loss": 0.17147536277770997, + "step": 14330 + }, + { + "epoch": 2.035486160397445, + "grad_norm": 1.3498976230621338, + "learning_rate": 9.796550745209368e-05, + "loss": 0.10244355201721192, + "step": 14340 + }, + { + "epoch": 2.0369056068133427, + "grad_norm": 1.6554956436157227, + "learning_rate": 9.796408800567779e-05, + "loss": 0.10223543643951416, + "step": 14350 + }, + { + "epoch": 2.0383250532292405, + "grad_norm": 7.838418006896973, + "learning_rate": 9.796266855926189e-05, + "loss": 0.11812844276428222, + "step": 14360 + }, + { + "epoch": 2.0397444996451384, + "grad_norm": 1.8078879117965698, + "learning_rate": 
9.7961249112846e-05, + "loss": 0.1252034544944763, + "step": 14370 + }, + { + "epoch": 2.0411639460610362, + "grad_norm": 3.4205777645111084, + "learning_rate": 9.79598296664301e-05, + "loss": 0.10178905725479126, + "step": 14380 + }, + { + "epoch": 2.042583392476934, + "grad_norm": 6.722558498382568, + "learning_rate": 9.79584102200142e-05, + "loss": 0.13192167282104492, + "step": 14390 + }, + { + "epoch": 2.044002838892832, + "grad_norm": 3.837047576904297, + "learning_rate": 9.79569907735983e-05, + "loss": 0.12296985387802124, + "step": 14400 + }, + { + "epoch": 2.0454222853087294, + "grad_norm": 2.1457889080047607, + "learning_rate": 9.79555713271824e-05, + "loss": 0.16315003633499145, + "step": 14410 + }, + { + "epoch": 2.0468417317246272, + "grad_norm": 6.29680871963501, + "learning_rate": 9.795415188076651e-05, + "loss": 0.12061352729797363, + "step": 14420 + }, + { + "epoch": 2.048261178140525, + "grad_norm": 6.541940689086914, + "learning_rate": 9.79527324343506e-05, + "loss": 0.2011786699295044, + "step": 14430 + }, + { + "epoch": 2.049680624556423, + "grad_norm": 4.376636505126953, + "learning_rate": 9.795131298793471e-05, + "loss": 0.10220627784729004, + "step": 14440 + }, + { + "epoch": 2.051100070972321, + "grad_norm": 3.3631985187530518, + "learning_rate": 9.79498935415188e-05, + "loss": 0.12176470756530762, + "step": 14450 + }, + { + "epoch": 2.0525195173882187, + "grad_norm": 3.7540066242218018, + "learning_rate": 9.794847409510292e-05, + "loss": 0.13856956958770753, + "step": 14460 + }, + { + "epoch": 2.0539389638041166, + "grad_norm": 4.199720859527588, + "learning_rate": 9.794705464868701e-05, + "loss": 0.11578547954559326, + "step": 14470 + }, + { + "epoch": 2.055358410220014, + "grad_norm": 2.478891134262085, + "learning_rate": 9.794563520227112e-05, + "loss": 0.11448420286178589, + "step": 14480 + }, + { + "epoch": 2.056777856635912, + "grad_norm": 10.809943199157715, + "learning_rate": 9.794421575585522e-05, + "loss": 0.11974853277206421, + 
"step": 14490 + }, + { + "epoch": 2.0581973030518097, + "grad_norm": 3.9403326511383057, + "learning_rate": 9.794279630943932e-05, + "loss": 0.09595261812210083, + "step": 14500 + }, + { + "epoch": 2.0581973030518097, + "eval_accuracy": 0.9520569720862212, + "eval_loss": 0.1421024352312088, + "eval_runtime": 32.3117, + "eval_samples_per_second": 486.728, + "eval_steps_per_second": 15.227, + "step": 14500 + }, + { + "epoch": 2.0596167494677076, + "grad_norm": 9.631017684936523, + "learning_rate": 9.794137686302343e-05, + "loss": 0.15254437923431396, + "step": 14510 + }, + { + "epoch": 2.0610361958836054, + "grad_norm": 4.611459255218506, + "learning_rate": 9.793995741660753e-05, + "loss": 0.09197093248367309, + "step": 14520 + }, + { + "epoch": 2.0624556422995033, + "grad_norm": 5.0104756355285645, + "learning_rate": 9.793853797019164e-05, + "loss": 0.17470468282699586, + "step": 14530 + }, + { + "epoch": 2.063875088715401, + "grad_norm": 6.290011882781982, + "learning_rate": 9.793711852377572e-05, + "loss": 0.13710517883300782, + "step": 14540 + }, + { + "epoch": 2.065294535131299, + "grad_norm": 5.759206771850586, + "learning_rate": 9.793569907735983e-05, + "loss": 0.08785209059715271, + "step": 14550 + }, + { + "epoch": 2.0667139815471964, + "grad_norm": 3.606126308441162, + "learning_rate": 9.793427963094393e-05, + "loss": 0.1606206178665161, + "step": 14560 + }, + { + "epoch": 2.0681334279630943, + "grad_norm": 1.4751636981964111, + "learning_rate": 9.793286018452804e-05, + "loss": 0.09843673706054687, + "step": 14570 + }, + { + "epoch": 2.069552874378992, + "grad_norm": 6.7842864990234375, + "learning_rate": 9.793144073811215e-05, + "loss": 0.12192797660827637, + "step": 14580 + }, + { + "epoch": 2.07097232079489, + "grad_norm": 0.8541110754013062, + "learning_rate": 9.793002129169624e-05, + "loss": 0.16259843111038208, + "step": 14590 + }, + { + "epoch": 2.072391767210788, + "grad_norm": 1.672593116760254, + "learning_rate": 9.792860184528035e-05, + "loss": 
0.09362624883651734, + "step": 14600 + }, + { + "epoch": 2.0738112136266857, + "grad_norm": 1.834715485572815, + "learning_rate": 9.792718239886444e-05, + "loss": 0.09099584221839904, + "step": 14610 + }, + { + "epoch": 2.0752306600425836, + "grad_norm": 2.21016001701355, + "learning_rate": 9.792576295244856e-05, + "loss": 0.12747323513031006, + "step": 14620 + }, + { + "epoch": 2.076650106458481, + "grad_norm": 2.8152081966400146, + "learning_rate": 9.792434350603265e-05, + "loss": 0.08871068954467773, + "step": 14630 + }, + { + "epoch": 2.078069552874379, + "grad_norm": 10.869599342346191, + "learning_rate": 9.792292405961675e-05, + "loss": 0.09311275482177735, + "step": 14640 + }, + { + "epoch": 2.0794889992902768, + "grad_norm": 7.580860614776611, + "learning_rate": 9.792150461320085e-05, + "loss": 0.10084123611450195, + "step": 14650 + }, + { + "epoch": 2.0809084457061746, + "grad_norm": 4.795779228210449, + "learning_rate": 9.792008516678496e-05, + "loss": 0.11776796579360962, + "step": 14660 + }, + { + "epoch": 2.0823278921220725, + "grad_norm": 8.302618980407715, + "learning_rate": 9.791866572036907e-05, + "loss": 0.1491849184036255, + "step": 14670 + }, + { + "epoch": 2.0837473385379703, + "grad_norm": 0.23616167902946472, + "learning_rate": 9.791724627395317e-05, + "loss": 0.09274361729621887, + "step": 14680 + }, + { + "epoch": 2.085166784953868, + "grad_norm": 4.930098056793213, + "learning_rate": 9.791582682753726e-05, + "loss": 0.10362660884857178, + "step": 14690 + }, + { + "epoch": 2.0865862313697656, + "grad_norm": 5.442007064819336, + "learning_rate": 9.791440738112136e-05, + "loss": 0.16730997562408448, + "step": 14700 + }, + { + "epoch": 2.0880056777856635, + "grad_norm": 2.312178134918213, + "learning_rate": 9.791298793470547e-05, + "loss": 0.09510490894317628, + "step": 14710 + }, + { + "epoch": 2.0894251242015613, + "grad_norm": 4.624721527099609, + "learning_rate": 9.791156848828957e-05, + "loss": 0.11144552230834961, + "step": 14720 + }, + 
{ + "epoch": 2.090844570617459, + "grad_norm": 4.009274482727051, + "learning_rate": 9.791014904187368e-05, + "loss": 0.05063519477844238, + "step": 14730 + }, + { + "epoch": 2.092264017033357, + "grad_norm": 3.2653450965881348, + "learning_rate": 9.790872959545776e-05, + "loss": 0.08952829837799073, + "step": 14740 + }, + { + "epoch": 2.093683463449255, + "grad_norm": 5.824209690093994, + "learning_rate": 9.790731014904188e-05, + "loss": 0.15206855535507202, + "step": 14750 + }, + { + "epoch": 2.095102909865153, + "grad_norm": 9.619600296020508, + "learning_rate": 9.790589070262599e-05, + "loss": 0.09403921961784363, + "step": 14760 + }, + { + "epoch": 2.09652235628105, + "grad_norm": 9.709185600280762, + "learning_rate": 9.790447125621008e-05, + "loss": 0.14637627601623535, + "step": 14770 + }, + { + "epoch": 2.097941802696948, + "grad_norm": 5.918253421783447, + "learning_rate": 9.79030518097942e-05, + "loss": 0.1368915319442749, + "step": 14780 + }, + { + "epoch": 2.099361249112846, + "grad_norm": 4.801339626312256, + "learning_rate": 9.790163236337828e-05, + "loss": 0.12445158958435058, + "step": 14790 + }, + { + "epoch": 2.100780695528744, + "grad_norm": 4.204085826873779, + "learning_rate": 9.790021291696239e-05, + "loss": 0.10883952379226684, + "step": 14800 + }, + { + "epoch": 2.1022001419446417, + "grad_norm": 2.81545352935791, + "learning_rate": 9.789879347054649e-05, + "loss": 0.14513410329818727, + "step": 14810 + }, + { + "epoch": 2.1036195883605395, + "grad_norm": 10.400982856750488, + "learning_rate": 9.78973740241306e-05, + "loss": 0.1663369655609131, + "step": 14820 + }, + { + "epoch": 2.1050390347764374, + "grad_norm": 4.7983078956604, + "learning_rate": 9.78959545777147e-05, + "loss": 0.10346471071243286, + "step": 14830 + }, + { + "epoch": 2.106458481192335, + "grad_norm": 6.536756992340088, + "learning_rate": 9.78945351312988e-05, + "loss": 0.12118889093399048, + "step": 14840 + }, + { + "epoch": 2.1078779276082327, + "grad_norm": 
4.13341760635376, + "learning_rate": 9.78931156848829e-05, + "loss": 0.09681417346000672, + "step": 14850 + }, + { + "epoch": 2.1092973740241305, + "grad_norm": 6.235330581665039, + "learning_rate": 9.7891696238467e-05, + "loss": 0.11153937578201294, + "step": 14860 + }, + { + "epoch": 2.1107168204400284, + "grad_norm": 4.928127765655518, + "learning_rate": 9.789027679205111e-05, + "loss": 0.07672246694564819, + "step": 14870 + }, + { + "epoch": 2.1121362668559263, + "grad_norm": 4.837932109832764, + "learning_rate": 9.788885734563521e-05, + "loss": 0.07635858654975891, + "step": 14880 + }, + { + "epoch": 2.113555713271824, + "grad_norm": 7.02380895614624, + "learning_rate": 9.788743789921932e-05, + "loss": 0.07125227451324463, + "step": 14890 + }, + { + "epoch": 2.114975159687722, + "grad_norm": 5.700672149658203, + "learning_rate": 9.78860184528034e-05, + "loss": 0.19001219272613526, + "step": 14900 + }, + { + "epoch": 2.1163946061036194, + "grad_norm": 8.149482727050781, + "learning_rate": 9.788459900638751e-05, + "loss": 0.13992477655410768, + "step": 14910 + }, + { + "epoch": 2.1178140525195173, + "grad_norm": 2.9586234092712402, + "learning_rate": 9.788317955997161e-05, + "loss": 0.12763415575027465, + "step": 14920 + }, + { + "epoch": 2.119233498935415, + "grad_norm": 8.272931098937988, + "learning_rate": 9.788176011355572e-05, + "loss": 0.14072943925857545, + "step": 14930 + }, + { + "epoch": 2.120652945351313, + "grad_norm": 10.288031578063965, + "learning_rate": 9.788034066713982e-05, + "loss": 0.12365868091583251, + "step": 14940 + }, + { + "epoch": 2.122072391767211, + "grad_norm": 3.203730821609497, + "learning_rate": 9.787892122072392e-05, + "loss": 0.16196365356445314, + "step": 14950 + }, + { + "epoch": 2.1234918381831087, + "grad_norm": 1.575235366821289, + "learning_rate": 9.787750177430803e-05, + "loss": 0.10702955722808838, + "step": 14960 + }, + { + "epoch": 2.1249112845990066, + "grad_norm": 3.2818377017974854, + "learning_rate": 
9.787608232789213e-05, + "loss": 0.109703528881073, + "step": 14970 + }, + { + "epoch": 2.126330731014904, + "grad_norm": 2.6222288608551025, + "learning_rate": 9.787466288147624e-05, + "loss": 0.13249775171279907, + "step": 14980 + }, + { + "epoch": 2.127750177430802, + "grad_norm": 2.1232478618621826, + "learning_rate": 9.787324343506033e-05, + "loss": 0.07887126207351684, + "step": 14990 + }, + { + "epoch": 2.1291696238466997, + "grad_norm": 2.6810293197631836, + "learning_rate": 9.787182398864443e-05, + "loss": 0.07232893705368042, + "step": 15000 + }, + { + "epoch": 2.1291696238466997, + "eval_accuracy": 0.9323456476123864, + "eval_loss": 0.19697453081607819, + "eval_runtime": 33.1486, + "eval_samples_per_second": 474.44, + "eval_steps_per_second": 14.842, + "step": 15000 + }, + { + "epoch": 2.1305890702625976, + "grad_norm": 1.1261463165283203, + "learning_rate": 9.787040454222853e-05, + "loss": 0.15110697746276855, + "step": 15010 + }, + { + "epoch": 2.1320085166784954, + "grad_norm": 7.047489166259766, + "learning_rate": 9.786898509581264e-05, + "loss": 0.12342967987060546, + "step": 15020 + }, + { + "epoch": 2.1334279630943933, + "grad_norm": 2.4421699047088623, + "learning_rate": 9.786756564939674e-05, + "loss": 0.10898158550262452, + "step": 15030 + }, + { + "epoch": 2.134847409510291, + "grad_norm": 13.27920913696289, + "learning_rate": 9.786614620298085e-05, + "loss": 0.17320735454559327, + "step": 15040 + }, + { + "epoch": 2.1362668559261886, + "grad_norm": 2.1594645977020264, + "learning_rate": 9.786472675656495e-05, + "loss": 0.1370407223701477, + "step": 15050 + }, + { + "epoch": 2.1376863023420865, + "grad_norm": 3.3465182781219482, + "learning_rate": 9.786330731014904e-05, + "loss": 0.0927284300327301, + "step": 15060 + }, + { + "epoch": 2.1391057487579843, + "grad_norm": 4.845798015594482, + "learning_rate": 9.786188786373315e-05, + "loss": 0.09592834115028381, + "step": 15070 + }, + { + "epoch": 2.140525195173882, + "grad_norm": 
5.797274112701416, + "learning_rate": 9.786046841731725e-05, + "loss": 0.09021830558776855, + "step": 15080 + }, + { + "epoch": 2.14194464158978, + "grad_norm": 6.726304054260254, + "learning_rate": 9.785904897090136e-05, + "loss": 0.08812606334686279, + "step": 15090 + }, + { + "epoch": 2.143364088005678, + "grad_norm": 11.3377046585083, + "learning_rate": 9.785762952448545e-05, + "loss": 0.17364519834518433, + "step": 15100 + }, + { + "epoch": 2.1447835344215758, + "grad_norm": 3.504915237426758, + "learning_rate": 9.785621007806956e-05, + "loss": 0.11160609722137452, + "step": 15110 + }, + { + "epoch": 2.146202980837473, + "grad_norm": 8.797595024108887, + "learning_rate": 9.785479063165365e-05, + "loss": 0.19877324104309083, + "step": 15120 + }, + { + "epoch": 2.147622427253371, + "grad_norm": 3.8671157360076904, + "learning_rate": 9.785337118523777e-05, + "loss": 0.1070638656616211, + "step": 15130 + }, + { + "epoch": 2.149041873669269, + "grad_norm": 1.9480023384094238, + "learning_rate": 9.785195173882186e-05, + "loss": 0.08838028907775879, + "step": 15140 + }, + { + "epoch": 2.1504613200851668, + "grad_norm": 0.8382003903388977, + "learning_rate": 9.785053229240596e-05, + "loss": 0.13476892709732055, + "step": 15150 + }, + { + "epoch": 2.1518807665010646, + "grad_norm": 1.5311458110809326, + "learning_rate": 9.784911284599007e-05, + "loss": 0.1371008038520813, + "step": 15160 + }, + { + "epoch": 2.1533002129169625, + "grad_norm": 4.248318672180176, + "learning_rate": 9.784769339957417e-05, + "loss": 0.142839252948761, + "step": 15170 + }, + { + "epoch": 2.1547196593328604, + "grad_norm": 5.336694717407227, + "learning_rate": 9.784627395315828e-05, + "loss": 0.15205401182174683, + "step": 15180 + }, + { + "epoch": 2.156139105748758, + "grad_norm": 1.6950732469558716, + "learning_rate": 9.784485450674238e-05, + "loss": 0.09157877564430236, + "step": 15190 + }, + { + "epoch": 2.1575585521646556, + "grad_norm": 0.8742321133613586, + "learning_rate": 
9.784343506032649e-05, + "loss": 0.07795000672340394, + "step": 15200 + }, + { + "epoch": 2.1589779985805535, + "grad_norm": 9.622370719909668, + "learning_rate": 9.784201561391057e-05, + "loss": 0.12661195993423463, + "step": 15210 + }, + { + "epoch": 2.1603974449964514, + "grad_norm": 2.450603723526001, + "learning_rate": 9.784059616749468e-05, + "loss": 0.07968658804893494, + "step": 15220 + }, + { + "epoch": 2.1618168914123492, + "grad_norm": 6.467986583709717, + "learning_rate": 9.783917672107878e-05, + "loss": 0.09993529319763184, + "step": 15230 + }, + { + "epoch": 2.163236337828247, + "grad_norm": 4.023931980133057, + "learning_rate": 9.783775727466289e-05, + "loss": 0.13655495643615723, + "step": 15240 + }, + { + "epoch": 2.164655784244145, + "grad_norm": 6.877175807952881, + "learning_rate": 9.783633782824699e-05, + "loss": 0.11687321662902832, + "step": 15250 + }, + { + "epoch": 2.1660752306600424, + "grad_norm": 6.720952033996582, + "learning_rate": 9.783491838183109e-05, + "loss": 0.1210485816001892, + "step": 15260 + }, + { + "epoch": 2.1674946770759402, + "grad_norm": 3.8507208824157715, + "learning_rate": 9.78334989354152e-05, + "loss": 0.132388699054718, + "step": 15270 + }, + { + "epoch": 2.168914123491838, + "grad_norm": 1.8653970956802368, + "learning_rate": 9.78320794889993e-05, + "loss": 0.08510831594467164, + "step": 15280 + }, + { + "epoch": 2.170333569907736, + "grad_norm": 2.0540809631347656, + "learning_rate": 9.78306600425834e-05, + "loss": 0.07614290714263916, + "step": 15290 + }, + { + "epoch": 2.171753016323634, + "grad_norm": 3.400786876678467, + "learning_rate": 9.78292405961675e-05, + "loss": 0.1373605966567993, + "step": 15300 + }, + { + "epoch": 2.1731724627395317, + "grad_norm": 4.475280284881592, + "learning_rate": 9.78278211497516e-05, + "loss": 0.170183527469635, + "step": 15310 + }, + { + "epoch": 2.1745919091554295, + "grad_norm": 1.2852575778961182, + "learning_rate": 9.78264017033357e-05, + "loss": 0.09261202812194824, + 
"step": 15320 + }, + { + "epoch": 2.176011355571327, + "grad_norm": 2.492828369140625, + "learning_rate": 9.782498225691981e-05, + "loss": 0.1506461977958679, + "step": 15330 + }, + { + "epoch": 2.177430801987225, + "grad_norm": 1.1873884201049805, + "learning_rate": 9.78235628105039e-05, + "loss": 0.1407165050506592, + "step": 15340 + }, + { + "epoch": 2.1788502484031227, + "grad_norm": 6.442225933074951, + "learning_rate": 9.782214336408802e-05, + "loss": 0.10227712392807006, + "step": 15350 + }, + { + "epoch": 2.1802696948190206, + "grad_norm": 4.296558856964111, + "learning_rate": 9.782072391767211e-05, + "loss": 0.1007123589515686, + "step": 15360 + }, + { + "epoch": 2.1816891412349184, + "grad_norm": 5.814218044281006, + "learning_rate": 9.781944641589781e-05, + "loss": 0.19718022346496583, + "step": 15370 + }, + { + "epoch": 2.1831085876508163, + "grad_norm": 4.71889066696167, + "learning_rate": 9.78180269694819e-05, + "loss": 0.18047010898590088, + "step": 15380 + }, + { + "epoch": 2.184528034066714, + "grad_norm": 4.318767070770264, + "learning_rate": 9.7816607523066e-05, + "loss": 0.15934972763061522, + "step": 15390 + }, + { + "epoch": 2.1859474804826116, + "grad_norm": 5.206693172454834, + "learning_rate": 9.78151880766501e-05, + "loss": 0.16389219760894774, + "step": 15400 + }, + { + "epoch": 2.1873669268985094, + "grad_norm": 5.830376148223877, + "learning_rate": 9.781376863023421e-05, + "loss": 0.09744818210601806, + "step": 15410 + }, + { + "epoch": 2.1887863733144073, + "grad_norm": 3.7071948051452637, + "learning_rate": 9.781234918381833e-05, + "loss": 0.06997872591018676, + "step": 15420 + }, + { + "epoch": 2.190205819730305, + "grad_norm": 1.3492387533187866, + "learning_rate": 9.781092973740241e-05, + "loss": 0.12530778646469115, + "step": 15430 + }, + { + "epoch": 2.191625266146203, + "grad_norm": 4.588033199310303, + "learning_rate": 9.780951029098652e-05, + "loss": 0.09968525767326356, + "step": 15440 + }, + { + "epoch": 2.193044712562101, + 
"grad_norm": 7.795054912567139, + "learning_rate": 9.780809084457062e-05, + "loss": 0.14231202602386475, + "step": 15450 + }, + { + "epoch": 2.1944641589779987, + "grad_norm": 3.2043259143829346, + "learning_rate": 9.780667139815473e-05, + "loss": 0.13562475442886351, + "step": 15460 + }, + { + "epoch": 2.195883605393896, + "grad_norm": 4.458872318267822, + "learning_rate": 9.780525195173883e-05, + "loss": 0.12291073799133301, + "step": 15470 + }, + { + "epoch": 2.197303051809794, + "grad_norm": 0.49556025862693787, + "learning_rate": 9.780383250532294e-05, + "loss": 0.030131521821022033, + "step": 15480 + }, + { + "epoch": 2.198722498225692, + "grad_norm": 10.009795188903809, + "learning_rate": 9.780241305890702e-05, + "loss": 0.13439586162567138, + "step": 15490 + }, + { + "epoch": 2.2001419446415897, + "grad_norm": 9.650060653686523, + "learning_rate": 9.780099361249113e-05, + "loss": 0.15003018379211425, + "step": 15500 + }, + { + "epoch": 2.2001419446415897, + "eval_accuracy": 0.9476696127678514, + "eval_loss": 0.15076443552970886, + "eval_runtime": 33.5101, + "eval_samples_per_second": 469.322, + "eval_steps_per_second": 14.682, + "step": 15500 + }, + { + "epoch": 2.2015613910574876, + "grad_norm": 3.4228737354278564, + "learning_rate": 9.779957416607524e-05, + "loss": 0.08043778538703919, + "step": 15510 + }, + { + "epoch": 2.2029808374733855, + "grad_norm": 7.456453800201416, + "learning_rate": 9.779815471965934e-05, + "loss": 0.08067357540130615, + "step": 15520 + }, + { + "epoch": 2.2044002838892833, + "grad_norm": 7.92563533782959, + "learning_rate": 9.779673527324345e-05, + "loss": 0.15267107486724854, + "step": 15530 + }, + { + "epoch": 2.2058197303051807, + "grad_norm": 7.132428169250488, + "learning_rate": 9.779531582682753e-05, + "loss": 0.20551769733428954, + "step": 15540 + }, + { + "epoch": 2.2072391767210786, + "grad_norm": 5.588425636291504, + "learning_rate": 9.779389638041165e-05, + "loss": 0.0594519853591919, + "step": 15550 + }, + { + 
"epoch": 2.2086586231369765, + "grad_norm": 0.8327229619026184, + "learning_rate": 9.779247693399574e-05, + "loss": 0.09828418493270874, + "step": 15560 + }, + { + "epoch": 2.2100780695528743, + "grad_norm": 4.466777324676514, + "learning_rate": 9.779105748757985e-05, + "loss": 0.0886389136314392, + "step": 15570 + }, + { + "epoch": 2.211497515968772, + "grad_norm": 6.381712913513184, + "learning_rate": 9.778963804116395e-05, + "loss": 0.11927787065505982, + "step": 15580 + }, + { + "epoch": 2.21291696238467, + "grad_norm": 6.469443321228027, + "learning_rate": 9.778821859474805e-05, + "loss": 0.17326163053512572, + "step": 15590 + }, + { + "epoch": 2.214336408800568, + "grad_norm": 6.632884502410889, + "learning_rate": 9.778679914833216e-05, + "loss": 0.11724759340286255, + "step": 15600 + }, + { + "epoch": 2.215755855216466, + "grad_norm": 3.7693932056427, + "learning_rate": 9.778537970191626e-05, + "loss": 0.12318531274795533, + "step": 15610 + }, + { + "epoch": 2.217175301632363, + "grad_norm": 11.708182334899902, + "learning_rate": 9.778396025550037e-05, + "loss": 0.1665675401687622, + "step": 15620 + }, + { + "epoch": 2.218594748048261, + "grad_norm": 6.708708763122559, + "learning_rate": 9.778254080908447e-05, + "loss": 0.09552123546600341, + "step": 15630 + }, + { + "epoch": 2.220014194464159, + "grad_norm": 3.537140130996704, + "learning_rate": 9.778112136266856e-05, + "loss": 0.17162953615188598, + "step": 15640 + }, + { + "epoch": 2.221433640880057, + "grad_norm": 3.47255802154541, + "learning_rate": 9.777970191625266e-05, + "loss": 0.11431492567062378, + "step": 15650 + }, + { + "epoch": 2.2228530872959547, + "grad_norm": 2.390170097351074, + "learning_rate": 9.777828246983677e-05, + "loss": 0.1374788761138916, + "step": 15660 + }, + { + "epoch": 2.2242725337118525, + "grad_norm": 8.488000869750977, + "learning_rate": 9.777686302342087e-05, + "loss": 0.075135737657547, + "step": 15670 + }, + { + "epoch": 2.2256919801277504, + "grad_norm": 
5.250071048736572, + "learning_rate": 9.777544357700498e-05, + "loss": 0.15566228628158568, + "step": 15680 + }, + { + "epoch": 2.227111426543648, + "grad_norm": 1.0439021587371826, + "learning_rate": 9.777402413058908e-05, + "loss": 0.08581479787826538, + "step": 15690 + }, + { + "epoch": 2.2285308729595457, + "grad_norm": 5.081490993499756, + "learning_rate": 9.777260468417317e-05, + "loss": 0.0691333532333374, + "step": 15700 + }, + { + "epoch": 2.2299503193754435, + "grad_norm": 4.931427478790283, + "learning_rate": 9.777118523775729e-05, + "loss": 0.08706582188606263, + "step": 15710 + }, + { + "epoch": 2.2313697657913414, + "grad_norm": 2.0620617866516113, + "learning_rate": 9.776976579134138e-05, + "loss": 0.09351248145103455, + "step": 15720 + }, + { + "epoch": 2.2327892122072392, + "grad_norm": 11.9086275100708, + "learning_rate": 9.77683463449255e-05, + "loss": 0.167766273021698, + "step": 15730 + }, + { + "epoch": 2.234208658623137, + "grad_norm": 7.802628993988037, + "learning_rate": 9.776692689850958e-05, + "loss": 0.08956191539764405, + "step": 15740 + }, + { + "epoch": 2.235628105039035, + "grad_norm": 6.4769134521484375, + "learning_rate": 9.776550745209369e-05, + "loss": 0.0949668049812317, + "step": 15750 + }, + { + "epoch": 2.2370475514549324, + "grad_norm": 5.48812198638916, + "learning_rate": 9.776408800567779e-05, + "loss": 0.10781463384628295, + "step": 15760 + }, + { + "epoch": 2.2384669978708303, + "grad_norm": 4.095717430114746, + "learning_rate": 9.77626685592619e-05, + "loss": 0.10710879564285278, + "step": 15770 + }, + { + "epoch": 2.239886444286728, + "grad_norm": 7.886163234710693, + "learning_rate": 9.7761249112846e-05, + "loss": 0.16387512683868408, + "step": 15780 + }, + { + "epoch": 2.241305890702626, + "grad_norm": 5.275144577026367, + "learning_rate": 9.775982966643009e-05, + "loss": 0.13074166774749757, + "step": 15790 + }, + { + "epoch": 2.242725337118524, + "grad_norm": 6.263736248016357, + "learning_rate": 
9.77584102200142e-05, + "loss": 0.1308918595314026, + "step": 15800 + }, + { + "epoch": 2.2441447835344217, + "grad_norm": 8.084881782531738, + "learning_rate": 9.77569907735983e-05, + "loss": 0.15410442352294923, + "step": 15810 + }, + { + "epoch": 2.2455642299503196, + "grad_norm": 5.83068323135376, + "learning_rate": 9.775557132718241e-05, + "loss": 0.15612525939941407, + "step": 15820 + }, + { + "epoch": 2.246983676366217, + "grad_norm": 0.40490075945854187, + "learning_rate": 9.775415188076651e-05, + "loss": 0.05670689940452576, + "step": 15830 + }, + { + "epoch": 2.248403122782115, + "grad_norm": 9.664972305297852, + "learning_rate": 9.775273243435062e-05, + "loss": 0.08322632312774658, + "step": 15840 + }, + { + "epoch": 2.2498225691980127, + "grad_norm": 5.599974632263184, + "learning_rate": 9.77513129879347e-05, + "loss": 0.0942413330078125, + "step": 15850 + }, + { + "epoch": 2.2512420156139106, + "grad_norm": 4.52598762512207, + "learning_rate": 9.774989354151881e-05, + "loss": 0.1192929744720459, + "step": 15860 + }, + { + "epoch": 2.2526614620298084, + "grad_norm": 8.435208320617676, + "learning_rate": 9.774847409510291e-05, + "loss": 0.10933787822723388, + "step": 15870 + }, + { + "epoch": 2.2540809084457063, + "grad_norm": 6.769467353820801, + "learning_rate": 9.774705464868702e-05, + "loss": 0.08203907608985901, + "step": 15880 + }, + { + "epoch": 2.255500354861604, + "grad_norm": 7.499700546264648, + "learning_rate": 9.774563520227112e-05, + "loss": 0.1266704320907593, + "step": 15890 + }, + { + "epoch": 2.2569198012775016, + "grad_norm": 4.320639133453369, + "learning_rate": 9.774421575585522e-05, + "loss": 0.13925156593322754, + "step": 15900 + }, + { + "epoch": 2.2583392476933994, + "grad_norm": 5.2828168869018555, + "learning_rate": 9.774279630943933e-05, + "loss": 0.12411700487136841, + "step": 15910 + }, + { + "epoch": 2.2597586941092973, + "grad_norm": 7.704649448394775, + "learning_rate": 9.774137686302343e-05, + "loss": 
0.17451765537261962, + "step": 15920 + }, + { + "epoch": 2.261178140525195, + "grad_norm": 10.570831298828125, + "learning_rate": 9.773995741660754e-05, + "loss": 0.14159404039382933, + "step": 15930 + }, + { + "epoch": 2.262597586941093, + "grad_norm": 1.9156538248062134, + "learning_rate": 9.773853797019163e-05, + "loss": 0.10246649980545045, + "step": 15940 + }, + { + "epoch": 2.264017033356991, + "grad_norm": 10.271675109863281, + "learning_rate": 9.773711852377573e-05, + "loss": 0.1498422145843506, + "step": 15950 + }, + { + "epoch": 2.2654364797728888, + "grad_norm": 2.2951345443725586, + "learning_rate": 9.773569907735983e-05, + "loss": 0.1181708812713623, + "step": 15960 + }, + { + "epoch": 2.2668559261887866, + "grad_norm": 7.073802471160889, + "learning_rate": 9.773427963094394e-05, + "loss": 0.13307657241821289, + "step": 15970 + }, + { + "epoch": 2.268275372604684, + "grad_norm": 3.94195556640625, + "learning_rate": 9.773286018452804e-05, + "loss": 0.06159374713897705, + "step": 15980 + }, + { + "epoch": 2.269694819020582, + "grad_norm": 0.3510136902332306, + "learning_rate": 9.773144073811215e-05, + "loss": 0.05166938900947571, + "step": 15990 + }, + { + "epoch": 2.2711142654364798, + "grad_norm": 2.7349507808685303, + "learning_rate": 9.773002129169624e-05, + "loss": 0.12719658613204957, + "step": 16000 + }, + { + "epoch": 2.2711142654364798, + "eval_accuracy": 0.9534558402746869, + "eval_loss": 0.12974673509597778, + "eval_runtime": 32.9436, + "eval_samples_per_second": 477.391, + "eval_steps_per_second": 14.935, + "step": 16000 + }, + { + "epoch": 2.2725337118523776, + "grad_norm": 3.8580965995788574, + "learning_rate": 9.772860184528034e-05, + "loss": 0.07134815454483032, + "step": 16010 + }, + { + "epoch": 2.2739531582682755, + "grad_norm": 10.57183837890625, + "learning_rate": 9.772718239886445e-05, + "loss": 0.11877801418304443, + "step": 16020 + }, + { + "epoch": 2.2753726046841733, + "grad_norm": 8.526998519897461, + "learning_rate": 
9.772576295244855e-05, + "loss": 0.11683057546615601, + "step": 16030 + }, + { + "epoch": 2.276792051100071, + "grad_norm": 2.470162868499756, + "learning_rate": 9.772434350603266e-05, + "loss": 0.0911303460597992, + "step": 16040 + }, + { + "epoch": 2.2782114975159686, + "grad_norm": 8.097274780273438, + "learning_rate": 9.772292405961675e-05, + "loss": 0.1780623197555542, + "step": 16050 + }, + { + "epoch": 2.2796309439318665, + "grad_norm": 4.431247234344482, + "learning_rate": 9.772150461320086e-05, + "loss": 0.13148776292800904, + "step": 16060 + }, + { + "epoch": 2.2810503903477644, + "grad_norm": 7.473452568054199, + "learning_rate": 9.772008516678495e-05, + "loss": 0.09967323541641235, + "step": 16070 + }, + { + "epoch": 2.282469836763662, + "grad_norm": 2.283681869506836, + "learning_rate": 9.771866572036906e-05, + "loss": 0.06625600457191468, + "step": 16080 + }, + { + "epoch": 2.28388928317956, + "grad_norm": 0.9107749462127686, + "learning_rate": 9.771724627395316e-05, + "loss": 0.07753741145133972, + "step": 16090 + }, + { + "epoch": 2.285308729595458, + "grad_norm": 2.082306146621704, + "learning_rate": 9.771582682753726e-05, + "loss": 0.06911807656288146, + "step": 16100 + }, + { + "epoch": 2.286728176011356, + "grad_norm": 8.424261093139648, + "learning_rate": 9.771440738112137e-05, + "loss": 0.06900658011436463, + "step": 16110 + }, + { + "epoch": 2.2881476224272532, + "grad_norm": 2.821417808532715, + "learning_rate": 9.771298793470547e-05, + "loss": 0.10042606592178345, + "step": 16120 + }, + { + "epoch": 2.289567068843151, + "grad_norm": 4.486814975738525, + "learning_rate": 9.771156848828958e-05, + "loss": 0.1290997862815857, + "step": 16130 + }, + { + "epoch": 2.290986515259049, + "grad_norm": 8.3433198928833, + "learning_rate": 9.771014904187368e-05, + "loss": 0.14453980922698975, + "step": 16140 + }, + { + "epoch": 2.292405961674947, + "grad_norm": 9.422966003417969, + "learning_rate": 9.770872959545777e-05, + "loss": 0.13661658763885498, + 
"step": 16150 + }, + { + "epoch": 2.2938254080908447, + "grad_norm": 6.411171913146973, + "learning_rate": 9.770731014904187e-05, + "loss": 0.09912009239196777, + "step": 16160 + }, + { + "epoch": 2.2952448545067425, + "grad_norm": 4.763072490692139, + "learning_rate": 9.770589070262598e-05, + "loss": 0.10291681289672852, + "step": 16170 + }, + { + "epoch": 2.2966643009226404, + "grad_norm": 5.987633228302002, + "learning_rate": 9.770447125621008e-05, + "loss": 0.15251626968383789, + "step": 16180 + }, + { + "epoch": 2.298083747338538, + "grad_norm": 10.529451370239258, + "learning_rate": 9.770305180979419e-05, + "loss": 0.17285287380218506, + "step": 16190 + }, + { + "epoch": 2.2995031937544357, + "grad_norm": 1.2355297803878784, + "learning_rate": 9.770163236337829e-05, + "loss": 0.16878000497817994, + "step": 16200 + }, + { + "epoch": 2.3009226401703335, + "grad_norm": 2.409059762954712, + "learning_rate": 9.770021291696238e-05, + "loss": 0.08963816165924073, + "step": 16210 + }, + { + "epoch": 2.3023420865862314, + "grad_norm": 0.2473367154598236, + "learning_rate": 9.76987934705465e-05, + "loss": 0.07898592352867126, + "step": 16220 + }, + { + "epoch": 2.3037615330021293, + "grad_norm": 3.4052321910858154, + "learning_rate": 9.769737402413059e-05, + "loss": 0.13420867919921875, + "step": 16230 + }, + { + "epoch": 2.305180979418027, + "grad_norm": 2.8136518001556396, + "learning_rate": 9.76959545777147e-05, + "loss": 0.08897106051445007, + "step": 16240 + }, + { + "epoch": 2.306600425833925, + "grad_norm": 4.1067094802856445, + "learning_rate": 9.76945351312988e-05, + "loss": 0.11498106718063354, + "step": 16250 + }, + { + "epoch": 2.3080198722498224, + "grad_norm": 3.161066770553589, + "learning_rate": 9.76931156848829e-05, + "loss": 0.12238447666168213, + "step": 16260 + }, + { + "epoch": 2.3094393186657203, + "grad_norm": 8.762333869934082, + "learning_rate": 9.7691696238467e-05, + "loss": 0.07079674601554871, + "step": 16270 + }, + { + "epoch": 
2.310858765081618, + "grad_norm": 2.2034451961517334, + "learning_rate": 9.76902767920511e-05, + "loss": 0.1289450168609619, + "step": 16280 + }, + { + "epoch": 2.312278211497516, + "grad_norm": 3.3836324214935303, + "learning_rate": 9.76888573456352e-05, + "loss": 0.08217411041259766, + "step": 16290 + }, + { + "epoch": 2.313697657913414, + "grad_norm": 2.8655858039855957, + "learning_rate": 9.768743789921932e-05, + "loss": 0.09491733908653259, + "step": 16300 + }, + { + "epoch": 2.3151171043293117, + "grad_norm": 4.423978805541992, + "learning_rate": 9.768601845280341e-05, + "loss": 0.13365116119384765, + "step": 16310 + }, + { + "epoch": 2.3165365507452096, + "grad_norm": 8.303816795349121, + "learning_rate": 9.768459900638751e-05, + "loss": 0.15843117237091064, + "step": 16320 + }, + { + "epoch": 2.317955997161107, + "grad_norm": 0.4200175404548645, + "learning_rate": 9.768317955997162e-05, + "loss": 0.12860283851623536, + "step": 16330 + }, + { + "epoch": 2.319375443577005, + "grad_norm": 0.9817140698432922, + "learning_rate": 9.768176011355572e-05, + "loss": 0.0771494209766388, + "step": 16340 + }, + { + "epoch": 2.3207948899929027, + "grad_norm": 5.904425144195557, + "learning_rate": 9.768034066713983e-05, + "loss": 0.09748343229293824, + "step": 16350 + }, + { + "epoch": 2.3222143364088006, + "grad_norm": 11.307563781738281, + "learning_rate": 9.767892122072391e-05, + "loss": 0.16953353881835936, + "step": 16360 + }, + { + "epoch": 2.3236337828246985, + "grad_norm": 4.251320838928223, + "learning_rate": 9.767750177430802e-05, + "loss": 0.10789685249328614, + "step": 16370 + }, + { + "epoch": 2.3250532292405963, + "grad_norm": 3.149813175201416, + "learning_rate": 9.767608232789212e-05, + "loss": 0.09740127325057983, + "step": 16380 + }, + { + "epoch": 2.326472675656494, + "grad_norm": 9.757298469543457, + "learning_rate": 9.767466288147623e-05, + "loss": 0.16251888275146484, + "step": 16390 + }, + { + "epoch": 2.3278921220723916, + "grad_norm": 
3.574176073074341, + "learning_rate": 9.767324343506033e-05, + "loss": 0.08429834246635437, + "step": 16400 + }, + { + "epoch": 2.3293115684882895, + "grad_norm": 3.4276225566864014, + "learning_rate": 9.767182398864443e-05, + "loss": 0.08981868624687195, + "step": 16410 + }, + { + "epoch": 2.3307310149041873, + "grad_norm": 7.491410732269287, + "learning_rate": 9.767040454222854e-05, + "loss": 0.1534734010696411, + "step": 16420 + }, + { + "epoch": 2.332150461320085, + "grad_norm": 7.178809642791748, + "learning_rate": 9.766898509581264e-05, + "loss": 0.13512442111968995, + "step": 16430 + }, + { + "epoch": 2.333569907735983, + "grad_norm": 7.452297687530518, + "learning_rate": 9.766756564939675e-05, + "loss": 0.15903291702270508, + "step": 16440 + }, + { + "epoch": 2.334989354151881, + "grad_norm": 4.820403575897217, + "learning_rate": 9.766614620298084e-05, + "loss": 0.1309017300605774, + "step": 16450 + }, + { + "epoch": 2.3364088005677788, + "grad_norm": 7.638652801513672, + "learning_rate": 9.766472675656494e-05, + "loss": 0.12363828420639038, + "step": 16460 + }, + { + "epoch": 2.337828246983676, + "grad_norm": 9.250051498413086, + "learning_rate": 9.766330731014904e-05, + "loss": 0.15233538150787354, + "step": 16470 + }, + { + "epoch": 2.339247693399574, + "grad_norm": 4.459556579589844, + "learning_rate": 9.766188786373315e-05, + "loss": 0.16799700260162354, + "step": 16480 + }, + { + "epoch": 2.340667139815472, + "grad_norm": 5.2020955085754395, + "learning_rate": 9.766046841731725e-05, + "loss": 0.0790201485157013, + "step": 16490 + }, + { + "epoch": 2.34208658623137, + "grad_norm": 1.893151879310608, + "learning_rate": 9.765904897090136e-05, + "loss": 0.07257702350616455, + "step": 16500 + }, + { + "epoch": 2.34208658623137, + "eval_accuracy": 0.952883576015769, + "eval_loss": 0.14282935857772827, + "eval_runtime": 32.8134, + "eval_samples_per_second": 479.287, + "eval_steps_per_second": 14.994, + "step": 16500 + }, + { + "epoch": 2.3435060326472676, + 
"grad_norm": 2.2250635623931885, + "learning_rate": 9.765762952448545e-05, + "loss": 0.11261917352676391, + "step": 16510 + }, + { + "epoch": 2.3449254790631655, + "grad_norm": 1.5180538892745972, + "learning_rate": 9.765621007806955e-05, + "loss": 0.08184219598770141, + "step": 16520 + }, + { + "epoch": 2.3463449254790634, + "grad_norm": 3.8298745155334473, + "learning_rate": 9.765479063165366e-05, + "loss": 0.08645458817481995, + "step": 16530 + }, + { + "epoch": 2.347764371894961, + "grad_norm": 3.3084588050842285, + "learning_rate": 9.765337118523776e-05, + "loss": 0.06830872893333435, + "step": 16540 + }, + { + "epoch": 2.3491838183108587, + "grad_norm": 0.7720867395401001, + "learning_rate": 9.765195173882187e-05, + "loss": 0.1192325472831726, + "step": 16550 + }, + { + "epoch": 2.3506032647267565, + "grad_norm": 7.036698341369629, + "learning_rate": 9.765053229240597e-05, + "loss": 0.09893574118614197, + "step": 16560 + }, + { + "epoch": 2.3520227111426544, + "grad_norm": 7.439764499664307, + "learning_rate": 9.764911284599007e-05, + "loss": 0.09484468102455139, + "step": 16570 + }, + { + "epoch": 2.3534421575585522, + "grad_norm": 4.2301435470581055, + "learning_rate": 9.764769339957416e-05, + "loss": 0.11805753707885742, + "step": 16580 + }, + { + "epoch": 2.35486160397445, + "grad_norm": 6.39113712310791, + "learning_rate": 9.764627395315827e-05, + "loss": 0.09725428223609925, + "step": 16590 + }, + { + "epoch": 2.356281050390348, + "grad_norm": 6.1582841873168945, + "learning_rate": 9.764485450674237e-05, + "loss": 0.10667927265167236, + "step": 16600 + }, + { + "epoch": 2.3577004968062454, + "grad_norm": 3.7757277488708496, + "learning_rate": 9.764343506032648e-05, + "loss": 0.12746351957321167, + "step": 16610 + }, + { + "epoch": 2.3591199432221432, + "grad_norm": 5.895532131195068, + "learning_rate": 9.764201561391058e-05, + "loss": 0.13624510765075684, + "step": 16620 + }, + { + "epoch": 2.360539389638041, + "grad_norm": 3.6180717945098877, + 
"learning_rate": 9.764059616749468e-05, + "loss": 0.12134850025177002, + "step": 16630 + }, + { + "epoch": 2.361958836053939, + "grad_norm": 4.084766864776611, + "learning_rate": 9.763917672107879e-05, + "loss": 0.09110198616981506, + "step": 16640 + }, + { + "epoch": 2.363378282469837, + "grad_norm": 7.207777500152588, + "learning_rate": 9.763775727466289e-05, + "loss": 0.0987035095691681, + "step": 16650 + }, + { + "epoch": 2.3647977288857347, + "grad_norm": 7.370236396789551, + "learning_rate": 9.7636337828247e-05, + "loss": 0.15047061443328857, + "step": 16660 + }, + { + "epoch": 2.3662171753016326, + "grad_norm": 7.778202056884766, + "learning_rate": 9.763491838183108e-05, + "loss": 0.16292293071746827, + "step": 16670 + }, + { + "epoch": 2.36763662171753, + "grad_norm": 3.764970541000366, + "learning_rate": 9.763349893541519e-05, + "loss": 0.12445385456085205, + "step": 16680 + }, + { + "epoch": 2.369056068133428, + "grad_norm": 2.9177567958831787, + "learning_rate": 9.763207948899929e-05, + "loss": 0.12629375457763672, + "step": 16690 + }, + { + "epoch": 2.3704755145493257, + "grad_norm": 4.7777099609375, + "learning_rate": 9.76306600425834e-05, + "loss": 0.10483566522598267, + "step": 16700 + }, + { + "epoch": 2.3718949609652236, + "grad_norm": 2.476802349090576, + "learning_rate": 9.76292405961675e-05, + "loss": 0.07830199003219604, + "step": 16710 + }, + { + "epoch": 2.3733144073811214, + "grad_norm": 6.576395034790039, + "learning_rate": 9.76278211497516e-05, + "loss": 0.12722206115722656, + "step": 16720 + }, + { + "epoch": 2.3747338537970193, + "grad_norm": 1.5219242572784424, + "learning_rate": 9.76264017033357e-05, + "loss": 0.08835641741752624, + "step": 16730 + }, + { + "epoch": 2.376153300212917, + "grad_norm": 2.6990671157836914, + "learning_rate": 9.76249822569198e-05, + "loss": 0.10250411033630372, + "step": 16740 + }, + { + "epoch": 2.3775727466288146, + "grad_norm": 4.596541404724121, + "learning_rate": 9.762356281050391e-05, + "loss": 
0.14535219669342042, + "step": 16750 + }, + { + "epoch": 2.3789921930447124, + "grad_norm": 2.865243434906006, + "learning_rate": 9.762214336408801e-05, + "loss": 0.061080020666122434, + "step": 16760 + }, + { + "epoch": 2.3804116394606103, + "grad_norm": 4.850032806396484, + "learning_rate": 9.762072391767211e-05, + "loss": 0.11783115863800049, + "step": 16770 + }, + { + "epoch": 2.381831085876508, + "grad_norm": 1.7372711896896362, + "learning_rate": 9.76193044712562e-05, + "loss": 0.09774195551872253, + "step": 16780 + }, + { + "epoch": 2.383250532292406, + "grad_norm": 7.511697769165039, + "learning_rate": 9.761788502484032e-05, + "loss": 0.1309769868850708, + "step": 16790 + }, + { + "epoch": 2.384669978708304, + "grad_norm": 8.27840805053711, + "learning_rate": 9.761646557842441e-05, + "loss": 0.17970755100250244, + "step": 16800 + }, + { + "epoch": 2.3860894251242017, + "grad_norm": 0.9087435603141785, + "learning_rate": 9.761504613200853e-05, + "loss": 0.07040458917617798, + "step": 16810 + }, + { + "epoch": 2.387508871540099, + "grad_norm": 3.8493130207061768, + "learning_rate": 9.761362668559262e-05, + "loss": 0.11651371717453003, + "step": 16820 + }, + { + "epoch": 2.388928317955997, + "grad_norm": 1.5010507106781006, + "learning_rate": 9.761220723917672e-05, + "loss": 0.08106373548507691, + "step": 16830 + }, + { + "epoch": 2.390347764371895, + "grad_norm": 6.315835475921631, + "learning_rate": 9.761078779276083e-05, + "loss": 0.11155383586883545, + "step": 16840 + }, + { + "epoch": 2.3917672107877928, + "grad_norm": 2.8264517784118652, + "learning_rate": 9.760936834634493e-05, + "loss": 0.12171386480331421, + "step": 16850 + }, + { + "epoch": 2.3931866572036906, + "grad_norm": 2.294635057449341, + "learning_rate": 9.760794889992904e-05, + "loss": 0.1550905466079712, + "step": 16860 + }, + { + "epoch": 2.3946061036195885, + "grad_norm": 4.574626445770264, + "learning_rate": 9.760652945351312e-05, + "loss": 0.11418824195861817, + "step": 16870 + }, + { + 
"epoch": 2.3960255500354863, + "grad_norm": 2.7776918411254883, + "learning_rate": 9.760511000709723e-05, + "loss": 0.12959576845169068, + "step": 16880 + }, + { + "epoch": 2.3974449964513838, + "grad_norm": 3.4543848037719727, + "learning_rate": 9.760369056068133e-05, + "loss": 0.11354950666427613, + "step": 16890 + }, + { + "epoch": 2.3988644428672816, + "grad_norm": 5.274985313415527, + "learning_rate": 9.760227111426544e-05, + "loss": 0.06138370633125305, + "step": 16900 + }, + { + "epoch": 2.4002838892831795, + "grad_norm": 6.934667110443115, + "learning_rate": 9.760085166784955e-05, + "loss": 0.1329074501991272, + "step": 16910 + }, + { + "epoch": 2.4017033356990773, + "grad_norm": 6.645686626434326, + "learning_rate": 9.759943222143365e-05, + "loss": 0.17836753129959107, + "step": 16920 + }, + { + "epoch": 2.403122782114975, + "grad_norm": 6.251645088195801, + "learning_rate": 9.759801277501775e-05, + "loss": 0.0962505280971527, + "step": 16930 + }, + { + "epoch": 2.404542228530873, + "grad_norm": 5.136745452880859, + "learning_rate": 9.759659332860185e-05, + "loss": 0.08273377418518066, + "step": 16940 + }, + { + "epoch": 2.405961674946771, + "grad_norm": 7.956725120544434, + "learning_rate": 9.759517388218596e-05, + "loss": 0.11856834888458252, + "step": 16950 + }, + { + "epoch": 2.4073811213626684, + "grad_norm": 2.631044387817383, + "learning_rate": 9.759375443577005e-05, + "loss": 0.11917568445205688, + "step": 16960 + }, + { + "epoch": 2.408800567778566, + "grad_norm": 5.937511444091797, + "learning_rate": 9.759233498935416e-05, + "loss": 0.07629096508026123, + "step": 16970 + }, + { + "epoch": 2.410220014194464, + "grad_norm": 5.794412612915039, + "learning_rate": 9.759091554293825e-05, + "loss": 0.1741081953048706, + "step": 16980 + }, + { + "epoch": 2.411639460610362, + "grad_norm": 6.313220977783203, + "learning_rate": 9.758949609652236e-05, + "loss": 0.07898733615875245, + "step": 16990 + }, + { + "epoch": 2.41305890702626, + "grad_norm": 
7.137319087982178, + "learning_rate": 9.758807665010647e-05, + "loss": 0.11363914012908935, + "step": 17000 + }, + { + "epoch": 2.41305890702626, + "eval_accuracy": 0.9416926305080435, + "eval_loss": 0.1742754876613617, + "eval_runtime": 31.9943, + "eval_samples_per_second": 491.556, + "eval_steps_per_second": 15.378, + "step": 17000 + }, + { + "epoch": 2.4144783534421577, + "grad_norm": 5.010659217834473, + "learning_rate": 9.758665720369057e-05, + "loss": 0.15786590576171874, + "step": 17010 + }, + { + "epoch": 2.4158977998580555, + "grad_norm": 6.37407112121582, + "learning_rate": 9.758523775727468e-05, + "loss": 0.1406489849090576, + "step": 17020 + }, + { + "epoch": 2.417317246273953, + "grad_norm": 4.527013301849365, + "learning_rate": 9.758381831085876e-05, + "loss": 0.10702930688858033, + "step": 17030 + }, + { + "epoch": 2.418736692689851, + "grad_norm": 2.203209161758423, + "learning_rate": 9.758239886444287e-05, + "loss": 0.21100082397460937, + "step": 17040 + }, + { + "epoch": 2.4201561391057487, + "grad_norm": 2.5778391361236572, + "learning_rate": 9.758097941802697e-05, + "loss": 0.05981506705284119, + "step": 17050 + }, + { + "epoch": 2.4215755855216465, + "grad_norm": 6.347795486450195, + "learning_rate": 9.757955997161108e-05, + "loss": 0.12853623628616334, + "step": 17060 + }, + { + "epoch": 2.4229950319375444, + "grad_norm": 9.994209289550781, + "learning_rate": 9.757814052519518e-05, + "loss": 0.10259546041488647, + "step": 17070 + }, + { + "epoch": 2.4244144783534423, + "grad_norm": 3.367839813232422, + "learning_rate": 9.757672107877928e-05, + "loss": 0.06157753467559814, + "step": 17080 + }, + { + "epoch": 2.42583392476934, + "grad_norm": 3.509408473968506, + "learning_rate": 9.757530163236339e-05, + "loss": 0.08180438876152038, + "step": 17090 + }, + { + "epoch": 2.4272533711852375, + "grad_norm": 4.197175025939941, + "learning_rate": 9.757388218594748e-05, + "loss": 0.14403607845306396, + "step": 17100 + }, + { + "epoch": 
2.4286728176011354, + "grad_norm": 4.370192527770996, + "learning_rate": 9.75724627395316e-05, + "loss": 0.16384668350219728, + "step": 17110 + }, + { + "epoch": 2.4300922640170333, + "grad_norm": 3.144803047180176, + "learning_rate": 9.757104329311569e-05, + "loss": 0.08878316283226013, + "step": 17120 + }, + { + "epoch": 2.431511710432931, + "grad_norm": 4.3488593101501465, + "learning_rate": 9.756962384669979e-05, + "loss": 0.17752463817596437, + "step": 17130 + }, + { + "epoch": 2.432931156848829, + "grad_norm": 9.861291885375977, + "learning_rate": 9.756820440028389e-05, + "loss": 0.10461457967758178, + "step": 17140 + }, + { + "epoch": 2.434350603264727, + "grad_norm": 2.252723217010498, + "learning_rate": 9.7566784953868e-05, + "loss": 0.09538206458091736, + "step": 17150 + }, + { + "epoch": 2.4357700496806247, + "grad_norm": 3.788640022277832, + "learning_rate": 9.75653655074521e-05, + "loss": 0.10890170335769653, + "step": 17160 + }, + { + "epoch": 2.437189496096522, + "grad_norm": 8.450477600097656, + "learning_rate": 9.756394606103621e-05, + "loss": 0.1873611330986023, + "step": 17170 + }, + { + "epoch": 2.43860894251242, + "grad_norm": 6.922235012054443, + "learning_rate": 9.75625266146203e-05, + "loss": 0.13029056787490845, + "step": 17180 + }, + { + "epoch": 2.440028388928318, + "grad_norm": 6.11525821685791, + "learning_rate": 9.75611071682044e-05, + "loss": 0.11692187786102295, + "step": 17190 + }, + { + "epoch": 2.4414478353442157, + "grad_norm": 7.727966785430908, + "learning_rate": 9.755968772178851e-05, + "loss": 0.18141931295394897, + "step": 17200 + }, + { + "epoch": 2.4428672817601136, + "grad_norm": 1.1188493967056274, + "learning_rate": 9.755826827537261e-05, + "loss": 0.14119462966918944, + "step": 17210 + }, + { + "epoch": 2.4442867281760114, + "grad_norm": 8.788047790527344, + "learning_rate": 9.755684882895672e-05, + "loss": 0.11063623428344727, + "step": 17220 + }, + { + "epoch": 2.4457061745919093, + "grad_norm": 4.968696117401123, + 
"learning_rate": 9.75554293825408e-05, + "loss": 0.11871033906936646, + "step": 17230 + }, + { + "epoch": 2.4471256210078067, + "grad_norm": 4.146373271942139, + "learning_rate": 9.755400993612492e-05, + "loss": 0.1038577675819397, + "step": 17240 + }, + { + "epoch": 2.4485450674237046, + "grad_norm": 4.578568458557129, + "learning_rate": 9.755259048970901e-05, + "loss": 0.1644783616065979, + "step": 17250 + }, + { + "epoch": 2.4499645138396025, + "grad_norm": 5.26609992980957, + "learning_rate": 9.755117104329312e-05, + "loss": 0.1413109540939331, + "step": 17260 + }, + { + "epoch": 2.4513839602555003, + "grad_norm": 5.410380840301514, + "learning_rate": 9.754975159687722e-05, + "loss": 0.10622183084487916, + "step": 17270 + }, + { + "epoch": 2.452803406671398, + "grad_norm": 8.643942832946777, + "learning_rate": 9.754833215046133e-05, + "loss": 0.12519901990890503, + "step": 17280 + }, + { + "epoch": 2.454222853087296, + "grad_norm": 5.121556282043457, + "learning_rate": 9.754691270404543e-05, + "loss": 0.1216310977935791, + "step": 17290 + }, + { + "epoch": 2.455642299503194, + "grad_norm": 4.879176139831543, + "learning_rate": 9.754549325762953e-05, + "loss": 0.07838413119316101, + "step": 17300 + }, + { + "epoch": 2.4570617459190913, + "grad_norm": 5.997292518615723, + "learning_rate": 9.754407381121364e-05, + "loss": 0.11862040758132934, + "step": 17310 + }, + { + "epoch": 2.458481192334989, + "grad_norm": 7.370124340057373, + "learning_rate": 9.754265436479774e-05, + "loss": 0.13782591819763185, + "step": 17320 + }, + { + "epoch": 2.459900638750887, + "grad_norm": 3.0784833431243896, + "learning_rate": 9.754123491838185e-05, + "loss": 0.12893285751342773, + "step": 17330 + }, + { + "epoch": 2.461320085166785, + "grad_norm": 4.132889747619629, + "learning_rate": 9.753981547196593e-05, + "loss": 0.1482453465461731, + "step": 17340 + }, + { + "epoch": 2.4627395315826828, + "grad_norm": 4.386025905609131, + "learning_rate": 9.753839602555004e-05, + "loss": 
0.08701491355895996, + "step": 17350 + }, + { + "epoch": 2.4641589779985806, + "grad_norm": 7.536581516265869, + "learning_rate": 9.753697657913414e-05, + "loss": 0.1785440683364868, + "step": 17360 + }, + { + "epoch": 2.4655784244144785, + "grad_norm": 4.566206455230713, + "learning_rate": 9.753555713271825e-05, + "loss": 0.07483741641044617, + "step": 17370 + }, + { + "epoch": 2.466997870830376, + "grad_norm": 4.969336032867432, + "learning_rate": 9.753413768630235e-05, + "loss": 0.09664581418037414, + "step": 17380 + }, + { + "epoch": 2.468417317246274, + "grad_norm": 3.3608598709106445, + "learning_rate": 9.753271823988644e-05, + "loss": 0.08268053531646728, + "step": 17390 + }, + { + "epoch": 2.4698367636621716, + "grad_norm": 7.48677396774292, + "learning_rate": 9.753129879347055e-05, + "loss": 0.08111786842346191, + "step": 17400 + }, + { + "epoch": 2.4712562100780695, + "grad_norm": 2.8628151416778564, + "learning_rate": 9.752987934705465e-05, + "loss": 0.09410454630851746, + "step": 17410 + }, + { + "epoch": 2.4726756564939674, + "grad_norm": 5.564269065856934, + "learning_rate": 9.752845990063876e-05, + "loss": 0.09594557881355285, + "step": 17420 + }, + { + "epoch": 2.4740951029098652, + "grad_norm": 0.6636775135993958, + "learning_rate": 9.752704045422286e-05, + "loss": 0.09588454365730285, + "step": 17430 + }, + { + "epoch": 2.475514549325763, + "grad_norm": 6.354304313659668, + "learning_rate": 9.752562100780696e-05, + "loss": 0.10989620685577392, + "step": 17440 + }, + { + "epoch": 2.4769339957416605, + "grad_norm": 3.9579975605010986, + "learning_rate": 9.752434350603266e-05, + "loss": 0.10450366735458375, + "step": 17450 + }, + { + "epoch": 2.4783534421575584, + "grad_norm": 2.8820838928222656, + "learning_rate": 9.752292405961675e-05, + "loss": 0.09479145407676696, + "step": 17460 + }, + { + "epoch": 2.4797728885734562, + "grad_norm": 1.7476080656051636, + "learning_rate": 9.752150461320085e-05, + "loss": 0.12545448541641235, + "step": 17470 + }, 
+ { + "epoch": 2.481192334989354, + "grad_norm": 7.19633150100708, + "learning_rate": 9.752008516678496e-05, + "loss": 0.0939016044139862, + "step": 17480 + }, + { + "epoch": 2.482611781405252, + "grad_norm": 11.924422264099121, + "learning_rate": 9.751866572036906e-05, + "loss": 0.16066315174102783, + "step": 17490 + }, + { + "epoch": 2.48403122782115, + "grad_norm": 2.1974613666534424, + "learning_rate": 9.751724627395317e-05, + "loss": 0.1423601746559143, + "step": 17500 + }, + { + "epoch": 2.48403122782115, + "eval_accuracy": 0.9445539518026325, + "eval_loss": 0.1727043092250824, + "eval_runtime": 32.757, + "eval_samples_per_second": 480.111, + "eval_steps_per_second": 15.02, + "step": 17500 + }, + { + "epoch": 2.4854506742370477, + "grad_norm": 6.336993217468262, + "learning_rate": 9.751582682753725e-05, + "loss": 0.13694591522216798, + "step": 17510 + }, + { + "epoch": 2.486870120652945, + "grad_norm": 4.340056896209717, + "learning_rate": 9.751440738112137e-05, + "loss": 0.1784249186515808, + "step": 17520 + }, + { + "epoch": 2.488289567068843, + "grad_norm": 4.247930526733398, + "learning_rate": 9.751298793470546e-05, + "loss": 0.10544888973236084, + "step": 17530 + }, + { + "epoch": 2.489709013484741, + "grad_norm": 2.6260440349578857, + "learning_rate": 9.751156848828957e-05, + "loss": 0.0672307550907135, + "step": 17540 + }, + { + "epoch": 2.4911284599006387, + "grad_norm": 1.9838597774505615, + "learning_rate": 9.751014904187367e-05, + "loss": 0.14270519018173217, + "step": 17550 + }, + { + "epoch": 2.4925479063165366, + "grad_norm": 1.2045660018920898, + "learning_rate": 9.750872959545777e-05, + "loss": 0.15043948888778685, + "step": 17560 + }, + { + "epoch": 2.4939673527324344, + "grad_norm": 7.862235069274902, + "learning_rate": 9.750731014904188e-05, + "loss": 0.07321544885635375, + "step": 17570 + }, + { + "epoch": 2.4953867991483323, + "grad_norm": 6.350536823272705, + "learning_rate": 9.750589070262598e-05, + "loss": 0.11304857730865478, + 
"step": 17580 + }, + { + "epoch": 2.49680624556423, + "grad_norm": 0.9608795046806335, + "learning_rate": 9.750447125621009e-05, + "loss": 0.08769638538360595, + "step": 17590 + }, + { + "epoch": 2.4982256919801276, + "grad_norm": 4.455130100250244, + "learning_rate": 9.750305180979418e-05, + "loss": 0.06901848912239075, + "step": 17600 + }, + { + "epoch": 2.4996451383960254, + "grad_norm": 3.236755132675171, + "learning_rate": 9.75016323633783e-05, + "loss": 0.10142921209335327, + "step": 17610 + }, + { + "epoch": 2.5010645848119233, + "grad_norm": 0.9103105068206787, + "learning_rate": 9.750021291696238e-05, + "loss": 0.12128010988235474, + "step": 17620 + }, + { + "epoch": 2.502484031227821, + "grad_norm": 3.3010218143463135, + "learning_rate": 9.749879347054649e-05, + "loss": 0.09445170164108277, + "step": 17630 + }, + { + "epoch": 2.503903477643719, + "grad_norm": 5.537515163421631, + "learning_rate": 9.749737402413059e-05, + "loss": 0.051540815830230714, + "step": 17640 + }, + { + "epoch": 2.505322924059617, + "grad_norm": 6.594273090362549, + "learning_rate": 9.74959545777147e-05, + "loss": 0.11053000688552857, + "step": 17650 + }, + { + "epoch": 2.5067423704755143, + "grad_norm": 6.973751068115234, + "learning_rate": 9.749453513129881e-05, + "loss": 0.17602165937423705, + "step": 17660 + }, + { + "epoch": 2.5081618168914126, + "grad_norm": 1.8898471593856812, + "learning_rate": 9.74931156848829e-05, + "loss": 0.09699593782424927, + "step": 17670 + }, + { + "epoch": 2.50958126330731, + "grad_norm": 8.757147789001465, + "learning_rate": 9.7491696238467e-05, + "loss": 0.09828613996505738, + "step": 17680 + }, + { + "epoch": 2.511000709723208, + "grad_norm": 5.698178291320801, + "learning_rate": 9.74902767920511e-05, + "loss": 0.09792088270187378, + "step": 17690 + }, + { + "epoch": 2.5124201561391057, + "grad_norm": 2.3245534896850586, + "learning_rate": 9.748885734563521e-05, + "loss": 0.08730112314224243, + "step": 17700 + }, + { + "epoch": 
2.5138396025550036, + "grad_norm": 3.97782301902771, + "learning_rate": 9.748743789921931e-05, + "loss": 0.09204915165901184, + "step": 17710 + }, + { + "epoch": 2.5152590489709015, + "grad_norm": 2.635392904281616, + "learning_rate": 9.748601845280341e-05, + "loss": 0.08571889400482177, + "step": 17720 + }, + { + "epoch": 2.516678495386799, + "grad_norm": 4.555758476257324, + "learning_rate": 9.74845990063875e-05, + "loss": 0.10614382028579712, + "step": 17730 + }, + { + "epoch": 2.518097941802697, + "grad_norm": 6.458566665649414, + "learning_rate": 9.748317955997162e-05, + "loss": 0.1116061806678772, + "step": 17740 + }, + { + "epoch": 2.5195173882185946, + "grad_norm": 7.498642921447754, + "learning_rate": 9.748176011355573e-05, + "loss": 0.08102936148643494, + "step": 17750 + }, + { + "epoch": 2.5209368346344925, + "grad_norm": 8.974710464477539, + "learning_rate": 9.748034066713982e-05, + "loss": 0.15357725620269774, + "step": 17760 + }, + { + "epoch": 2.5223562810503903, + "grad_norm": 6.158868789672852, + "learning_rate": 9.747892122072392e-05, + "loss": 0.1006664514541626, + "step": 17770 + }, + { + "epoch": 2.523775727466288, + "grad_norm": 0.8831135630607605, + "learning_rate": 9.747750177430802e-05, + "loss": 0.07348037958145141, + "step": 17780 + }, + { + "epoch": 2.525195173882186, + "grad_norm": 8.365797996520996, + "learning_rate": 9.747608232789213e-05, + "loss": 0.09979128241539001, + "step": 17790 + }, + { + "epoch": 2.5266146202980835, + "grad_norm": 13.500819206237793, + "learning_rate": 9.747466288147623e-05, + "loss": 0.13896944522857665, + "step": 17800 + }, + { + "epoch": 2.528034066713982, + "grad_norm": 4.766392230987549, + "learning_rate": 9.747324343506034e-05, + "loss": 0.07932850122451782, + "step": 17810 + }, + { + "epoch": 2.529453513129879, + "grad_norm": 7.3413310050964355, + "learning_rate": 9.747182398864442e-05, + "loss": 0.07950088977813721, + "step": 17820 + }, + { + "epoch": 2.530872959545777, + "grad_norm": 
3.8923566341400146, + "learning_rate": 9.747040454222853e-05, + "loss": 0.09398716688156128, + "step": 17830 + }, + { + "epoch": 2.532292405961675, + "grad_norm": 5.209949970245361, + "learning_rate": 9.746898509581264e-05, + "loss": 0.11348887681961059, + "step": 17840 + }, + { + "epoch": 2.533711852377573, + "grad_norm": 8.087526321411133, + "learning_rate": 9.746756564939674e-05, + "loss": 0.13804304599761963, + "step": 17850 + }, + { + "epoch": 2.5351312987934707, + "grad_norm": 4.874515056610107, + "learning_rate": 9.746614620298085e-05, + "loss": 0.12363841533660888, + "step": 17860 + }, + { + "epoch": 2.536550745209368, + "grad_norm": 9.139041900634766, + "learning_rate": 9.746472675656494e-05, + "loss": 0.09068549871444702, + "step": 17870 + }, + { + "epoch": 2.5379701916252664, + "grad_norm": 6.489454746246338, + "learning_rate": 9.746330731014905e-05, + "loss": 0.1587399125099182, + "step": 17880 + }, + { + "epoch": 2.539389638041164, + "grad_norm": 9.474618911743164, + "learning_rate": 9.746188786373314e-05, + "loss": 0.13566343784332274, + "step": 17890 + }, + { + "epoch": 2.5408090844570617, + "grad_norm": 3.8730716705322266, + "learning_rate": 9.746046841731726e-05, + "loss": 0.08422473669052125, + "step": 17900 + }, + { + "epoch": 2.5422285308729595, + "grad_norm": 2.2097864151000977, + "learning_rate": 9.745904897090135e-05, + "loss": 0.13542672395706176, + "step": 17910 + }, + { + "epoch": 2.5436479772888574, + "grad_norm": 15.095120429992676, + "learning_rate": 9.745762952448545e-05, + "loss": 0.14511890411376954, + "step": 17920 + }, + { + "epoch": 2.5450674237047552, + "grad_norm": 12.847689628601074, + "learning_rate": 9.745621007806956e-05, + "loss": 0.0919945478439331, + "step": 17930 + }, + { + "epoch": 2.5464868701206527, + "grad_norm": 2.031590223312378, + "learning_rate": 9.745479063165366e-05, + "loss": 0.13927642107009888, + "step": 17940 + }, + { + "epoch": 2.547906316536551, + "grad_norm": 4.216944694519043, + "learning_rate": 
9.745337118523777e-05, + "loss": 0.10198723077774048, + "step": 17950 + }, + { + "epoch": 2.5493257629524484, + "grad_norm": 7.031200408935547, + "learning_rate": 9.745195173882187e-05, + "loss": 0.11566638946533203, + "step": 17960 + }, + { + "epoch": 2.5507452093683463, + "grad_norm": 5.59580135345459, + "learning_rate": 9.745053229240598e-05, + "loss": 0.0891038417816162, + "step": 17970 + }, + { + "epoch": 2.552164655784244, + "grad_norm": 8.706607818603516, + "learning_rate": 9.744911284599006e-05, + "loss": 0.09640666842460632, + "step": 17980 + }, + { + "epoch": 2.553584102200142, + "grad_norm": 3.204340934753418, + "learning_rate": 9.744769339957417e-05, + "loss": 0.10391557216644287, + "step": 17990 + }, + { + "epoch": 2.55500354861604, + "grad_norm": 6.2729573249816895, + "learning_rate": 9.744627395315827e-05, + "loss": 0.11966743469238281, + "step": 18000 + }, + { + "epoch": 2.55500354861604, + "eval_accuracy": 0.9338081007185096, + "eval_loss": 0.1904294788837433, + "eval_runtime": 32.5049, + "eval_samples_per_second": 483.835, + "eval_steps_per_second": 15.136, + "step": 18000 + }, + { + "epoch": 2.5564229950319377, + "grad_norm": 4.016758918762207, + "learning_rate": 9.744485450674238e-05, + "loss": 0.16458499431610107, + "step": 18010 + }, + { + "epoch": 2.5578424414478356, + "grad_norm": 9.767767906188965, + "learning_rate": 9.744343506032648e-05, + "loss": 0.11013137102127075, + "step": 18020 + }, + { + "epoch": 2.559261887863733, + "grad_norm": 10.628437042236328, + "learning_rate": 9.744201561391058e-05, + "loss": 0.1186720848083496, + "step": 18030 + }, + { + "epoch": 2.560681334279631, + "grad_norm": 4.2828545570373535, + "learning_rate": 9.744059616749469e-05, + "loss": 0.11388142108917236, + "step": 18040 + }, + { + "epoch": 2.5621007806955287, + "grad_norm": 5.870272636413574, + "learning_rate": 9.743917672107878e-05, + "loss": 0.09274822473526001, + "step": 18050 + }, + { + "epoch": 2.5635202271114266, + "grad_norm": 1.7781943082809448, + 
"learning_rate": 9.74377572746629e-05, + "loss": 0.10968050956726075, + "step": 18060 + }, + { + "epoch": 2.5649396735273244, + "grad_norm": 10.247567176818848, + "learning_rate": 9.743633782824699e-05, + "loss": 0.12503312826156615, + "step": 18070 + }, + { + "epoch": 2.5663591199432223, + "grad_norm": 5.602545261383057, + "learning_rate": 9.743491838183109e-05, + "loss": 0.09583965539932252, + "step": 18080 + }, + { + "epoch": 2.56777856635912, + "grad_norm": 1.3222918510437012, + "learning_rate": 9.743349893541519e-05, + "loss": 0.11057568788528442, + "step": 18090 + }, + { + "epoch": 2.5691980127750176, + "grad_norm": 2.3814685344696045, + "learning_rate": 9.74320794889993e-05, + "loss": 0.11936540603637695, + "step": 18100 + }, + { + "epoch": 2.5706174591909154, + "grad_norm": 2.4344863891601562, + "learning_rate": 9.74306600425834e-05, + "loss": 0.0944204032421112, + "step": 18110 + }, + { + "epoch": 2.5720369056068133, + "grad_norm": 8.206236839294434, + "learning_rate": 9.74292405961675e-05, + "loss": 0.08790295124053955, + "step": 18120 + }, + { + "epoch": 2.573456352022711, + "grad_norm": 6.2798566818237305, + "learning_rate": 9.74278211497516e-05, + "loss": 0.13661357164382934, + "step": 18130 + }, + { + "epoch": 2.574875798438609, + "grad_norm": 9.54171085357666, + "learning_rate": 9.74264017033357e-05, + "loss": 0.11890660524368286, + "step": 18140 + }, + { + "epoch": 2.576295244854507, + "grad_norm": 2.0758354663848877, + "learning_rate": 9.742498225691981e-05, + "loss": 0.14780707359313966, + "step": 18150 + }, + { + "epoch": 2.5777146912704048, + "grad_norm": 9.819342613220215, + "learning_rate": 9.742356281050391e-05, + "loss": 0.17009602785110473, + "step": 18160 + }, + { + "epoch": 2.579134137686302, + "grad_norm": 0.4771549105644226, + "learning_rate": 9.742214336408802e-05, + "loss": 0.09668282270431519, + "step": 18170 + }, + { + "epoch": 2.5805535841022, + "grad_norm": 3.620116710662842, + "learning_rate": 9.74207239176721e-05, + "loss": 
0.09066780805587768, + "step": 18180 + }, + { + "epoch": 2.581973030518098, + "grad_norm": 2.4723594188690186, + "learning_rate": 9.741930447125621e-05, + "loss": 0.09381983876228332, + "step": 18190 + }, + { + "epoch": 2.5833924769339958, + "grad_norm": 8.35051441192627, + "learning_rate": 9.741788502484031e-05, + "loss": 0.15823612213134766, + "step": 18200 + }, + { + "epoch": 2.5848119233498936, + "grad_norm": 5.235237121582031, + "learning_rate": 9.741646557842442e-05, + "loss": 0.1483514666557312, + "step": 18210 + }, + { + "epoch": 2.5862313697657915, + "grad_norm": 4.181369781494141, + "learning_rate": 9.741504613200852e-05, + "loss": 0.07576992511749267, + "step": 18220 + }, + { + "epoch": 2.5876508161816894, + "grad_norm": 7.384850025177002, + "learning_rate": 9.741362668559262e-05, + "loss": 0.07849894762039185, + "step": 18230 + }, + { + "epoch": 2.5890702625975868, + "grad_norm": 2.344217300415039, + "learning_rate": 9.741220723917673e-05, + "loss": 0.10990880727767945, + "step": 18240 + }, + { + "epoch": 2.5904897090134846, + "grad_norm": 5.363242149353027, + "learning_rate": 9.741078779276083e-05, + "loss": 0.1412426710128784, + "step": 18250 + }, + { + "epoch": 2.5919091554293825, + "grad_norm": 3.7980527877807617, + "learning_rate": 9.740936834634494e-05, + "loss": 0.10421816110610962, + "step": 18260 + }, + { + "epoch": 2.5933286018452804, + "grad_norm": 9.759673118591309, + "learning_rate": 9.740794889992903e-05, + "loss": 0.11693978309631348, + "step": 18270 + }, + { + "epoch": 2.594748048261178, + "grad_norm": 2.0219240188598633, + "learning_rate": 9.740652945351315e-05, + "loss": 0.12884674072265626, + "step": 18280 + }, + { + "epoch": 2.596167494677076, + "grad_norm": 9.535964012145996, + "learning_rate": 9.740511000709723e-05, + "loss": 0.12031383514404297, + "step": 18290 + }, + { + "epoch": 2.597586941092974, + "grad_norm": 5.354515552520752, + "learning_rate": 9.740369056068134e-05, + "loss": 0.0845773994922638, + "step": 18300 + }, + { + 
"epoch": 2.5990063875088714, + "grad_norm": 1.1112140417099, + "learning_rate": 9.740227111426544e-05, + "loss": 0.1002803087234497, + "step": 18310 + }, + { + "epoch": 2.6004258339247692, + "grad_norm": 2.0215070247650146, + "learning_rate": 9.740085166784955e-05, + "loss": 0.10047941207885742, + "step": 18320 + }, + { + "epoch": 2.601845280340667, + "grad_norm": 6.67712926864624, + "learning_rate": 9.739943222143365e-05, + "loss": 0.13017858266830445, + "step": 18330 + }, + { + "epoch": 2.603264726756565, + "grad_norm": 10.4568452835083, + "learning_rate": 9.739801277501774e-05, + "loss": 0.19226794242858886, + "step": 18340 + }, + { + "epoch": 2.604684173172463, + "grad_norm": 6.936629772186279, + "learning_rate": 9.739659332860185e-05, + "loss": 0.1478518009185791, + "step": 18350 + }, + { + "epoch": 2.6061036195883607, + "grad_norm": 0.7439237236976624, + "learning_rate": 9.739517388218595e-05, + "loss": 0.11475565433502197, + "step": 18360 + }, + { + "epoch": 2.6075230660042585, + "grad_norm": 6.165897369384766, + "learning_rate": 9.739375443577006e-05, + "loss": 0.13509042263031007, + "step": 18370 + }, + { + "epoch": 2.608942512420156, + "grad_norm": 5.026000022888184, + "learning_rate": 9.739233498935416e-05, + "loss": 0.11895132064819336, + "step": 18380 + }, + { + "epoch": 2.610361958836054, + "grad_norm": 4.722821235656738, + "learning_rate": 9.739091554293826e-05, + "loss": 0.15483348369598388, + "step": 18390 + }, + { + "epoch": 2.6117814052519517, + "grad_norm": 4.340688705444336, + "learning_rate": 9.738949609652235e-05, + "loss": 0.09090102910995483, + "step": 18400 + }, + { + "epoch": 2.6132008516678495, + "grad_norm": 1.8677579164505005, + "learning_rate": 9.738807665010647e-05, + "loss": 0.12864718437194825, + "step": 18410 + }, + { + "epoch": 2.6146202980837474, + "grad_norm": 4.120899200439453, + "learning_rate": 9.738665720369056e-05, + "loss": 0.10905364751815796, + "step": 18420 + }, + { + "epoch": 2.6160397444996453, + "grad_norm": 
2.1230714321136475, + "learning_rate": 9.738523775727467e-05, + "loss": 0.11330556869506836, + "step": 18430 + }, + { + "epoch": 2.617459190915543, + "grad_norm": 7.033359527587891, + "learning_rate": 9.738381831085877e-05, + "loss": 0.08752457499504089, + "step": 18440 + }, + { + "epoch": 2.6188786373314406, + "grad_norm": 5.958856105804443, + "learning_rate": 9.738239886444287e-05, + "loss": 0.07405679225921631, + "step": 18450 + }, + { + "epoch": 2.6202980837473384, + "grad_norm": 3.3164892196655273, + "learning_rate": 9.738097941802698e-05, + "loss": 0.049712374806404114, + "step": 18460 + }, + { + "epoch": 2.6217175301632363, + "grad_norm": 5.792750358581543, + "learning_rate": 9.737955997161108e-05, + "loss": 0.11241586208343506, + "step": 18470 + }, + { + "epoch": 2.623136976579134, + "grad_norm": 5.713932514190674, + "learning_rate": 9.737814052519519e-05, + "loss": 0.0947425127029419, + "step": 18480 + }, + { + "epoch": 2.624556422995032, + "grad_norm": 5.652758598327637, + "learning_rate": 9.737672107877927e-05, + "loss": 0.09331372976303101, + "step": 18490 + }, + { + "epoch": 2.62597586941093, + "grad_norm": 4.281705856323242, + "learning_rate": 9.737530163236338e-05, + "loss": 0.09365745782852172, + "step": 18500 + }, + { + "epoch": 2.62597586941093, + "eval_accuracy": 0.950721688815413, + "eval_loss": 0.1462646871805191, + "eval_runtime": 34.5176, + "eval_samples_per_second": 455.623, + "eval_steps_per_second": 14.254, + "step": 18500 + }, + { + "epoch": 2.6273953158268277, + "grad_norm": 4.376514911651611, + "learning_rate": 9.737388218594748e-05, + "loss": 0.06313493251800537, + "step": 18510 + }, + { + "epoch": 2.628814762242725, + "grad_norm": 7.006924629211426, + "learning_rate": 9.737246273953159e-05, + "loss": 0.1129868745803833, + "step": 18520 + }, + { + "epoch": 2.630234208658623, + "grad_norm": 6.207458972930908, + "learning_rate": 9.737104329311569e-05, + "loss": 0.15238604545593262, + "step": 18530 + }, + { + "epoch": 2.631653655074521, + 
"grad_norm": 0.35649651288986206, + "learning_rate": 9.736962384669979e-05, + "loss": 0.1252423644065857, + "step": 18540 + }, + { + "epoch": 2.6330731014904187, + "grad_norm": 4.224631309509277, + "learning_rate": 9.73682044002839e-05, + "loss": 0.11180676221847534, + "step": 18550 + }, + { + "epoch": 2.6344925479063166, + "grad_norm": 6.666781425476074, + "learning_rate": 9.7366784953868e-05, + "loss": 0.09207946062088013, + "step": 18560 + }, + { + "epoch": 2.6359119943222145, + "grad_norm": 5.663329124450684, + "learning_rate": 9.73653655074521e-05, + "loss": 0.09166657328605651, + "step": 18570 + }, + { + "epoch": 2.6373314407381123, + "grad_norm": 4.614907741546631, + "learning_rate": 9.73639460610362e-05, + "loss": 0.08460969924926758, + "step": 18580 + }, + { + "epoch": 2.6387508871540097, + "grad_norm": 4.568515300750732, + "learning_rate": 9.73625266146203e-05, + "loss": 0.0926063060760498, + "step": 18590 + }, + { + "epoch": 2.6401703335699076, + "grad_norm": 4.265593528747559, + "learning_rate": 9.73611071682044e-05, + "loss": 0.14236600399017335, + "step": 18600 + }, + { + "epoch": 2.6415897799858055, + "grad_norm": 3.393044948577881, + "learning_rate": 9.735968772178851e-05, + "loss": 0.06547205448150635, + "step": 18610 + }, + { + "epoch": 2.6430092264017033, + "grad_norm": 2.976576328277588, + "learning_rate": 9.73582682753726e-05, + "loss": 0.07752239108085632, + "step": 18620 + }, + { + "epoch": 2.644428672817601, + "grad_norm": 5.691226959228516, + "learning_rate": 9.735684882895672e-05, + "loss": 0.10452626943588257, + "step": 18630 + }, + { + "epoch": 2.645848119233499, + "grad_norm": 6.348296642303467, + "learning_rate": 9.735542938254081e-05, + "loss": 0.11550105810165405, + "step": 18640 + }, + { + "epoch": 2.647267565649397, + "grad_norm": 9.737822532653809, + "learning_rate": 9.735400993612491e-05, + "loss": 0.12678935527801513, + "step": 18650 + }, + { + "epoch": 2.6486870120652943, + "grad_norm": 1.7993618249893188, + "learning_rate": 
9.735259048970902e-05, + "loss": 0.09803841710090637, + "step": 18660 + }, + { + "epoch": 2.650106458481192, + "grad_norm": 5.785006523132324, + "learning_rate": 9.735117104329312e-05, + "loss": 0.12399122714996338, + "step": 18670 + }, + { + "epoch": 2.65152590489709, + "grad_norm": 5.436007976531982, + "learning_rate": 9.734975159687723e-05, + "loss": 0.11214399337768555, + "step": 18680 + }, + { + "epoch": 2.652945351312988, + "grad_norm": 6.046454429626465, + "learning_rate": 9.734833215046133e-05, + "loss": 0.08356254100799561, + "step": 18690 + }, + { + "epoch": 2.654364797728886, + "grad_norm": 7.5290021896362305, + "learning_rate": 9.734691270404542e-05, + "loss": 0.101429283618927, + "step": 18700 + }, + { + "epoch": 2.6557842441447836, + "grad_norm": 3.0168631076812744, + "learning_rate": 9.734549325762952e-05, + "loss": 0.09058440327644349, + "step": 18710 + }, + { + "epoch": 2.6572036905606815, + "grad_norm": 8.676300048828125, + "learning_rate": 9.734407381121363e-05, + "loss": 0.13883825540542602, + "step": 18720 + }, + { + "epoch": 2.658623136976579, + "grad_norm": 10.840899467468262, + "learning_rate": 9.734265436479773e-05, + "loss": 0.11511178016662597, + "step": 18730 + }, + { + "epoch": 2.660042583392477, + "grad_norm": 2.287022113800049, + "learning_rate": 9.734123491838184e-05, + "loss": 0.10089895725250245, + "step": 18740 + }, + { + "epoch": 2.6614620298083747, + "grad_norm": 5.894728183746338, + "learning_rate": 9.733981547196594e-05, + "loss": 0.10052759647369384, + "step": 18750 + }, + { + "epoch": 2.6628814762242725, + "grad_norm": 3.954016923904419, + "learning_rate": 9.733839602555004e-05, + "loss": 0.1216499924659729, + "step": 18760 + }, + { + "epoch": 2.6643009226401704, + "grad_norm": 9.103641510009766, + "learning_rate": 9.733697657913415e-05, + "loss": 0.10710169076919555, + "step": 18770 + }, + { + "epoch": 2.6657203690560682, + "grad_norm": 0.37438610196113586, + "learning_rate": 9.733555713271824e-05, + "loss": 
0.08723070025444031, + "step": 18780 + }, + { + "epoch": 2.667139815471966, + "grad_norm": 9.926944732666016, + "learning_rate": 9.733413768630236e-05, + "loss": 0.12807276248931884, + "step": 18790 + }, + { + "epoch": 2.6685592618878635, + "grad_norm": 9.92432689666748, + "learning_rate": 9.733271823988644e-05, + "loss": 0.18386597633361818, + "step": 18800 + }, + { + "epoch": 2.6699787083037614, + "grad_norm": 4.39555549621582, + "learning_rate": 9.733129879347055e-05, + "loss": 0.10847448110580445, + "step": 18810 + }, + { + "epoch": 2.6713981547196592, + "grad_norm": 4.371532440185547, + "learning_rate": 9.732987934705465e-05, + "loss": 0.11950172185897827, + "step": 18820 + }, + { + "epoch": 2.672817601135557, + "grad_norm": 3.7563788890838623, + "learning_rate": 9.732845990063876e-05, + "loss": 0.11064698696136474, + "step": 18830 + }, + { + "epoch": 2.674237047551455, + "grad_norm": 8.16103458404541, + "learning_rate": 9.732704045422286e-05, + "loss": 0.1522403836250305, + "step": 18840 + }, + { + "epoch": 2.675656493967353, + "grad_norm": 2.7513720989227295, + "learning_rate": 9.732562100780695e-05, + "loss": 0.14767955541610717, + "step": 18850 + }, + { + "epoch": 2.6770759403832507, + "grad_norm": 4.588718891143799, + "learning_rate": 9.732420156139106e-05, + "loss": 0.11084201335906982, + "step": 18860 + }, + { + "epoch": 2.678495386799148, + "grad_norm": 3.071213722229004, + "learning_rate": 9.732278211497516e-05, + "loss": 0.15097259283065795, + "step": 18870 + }, + { + "epoch": 2.679914833215046, + "grad_norm": 6.630822658538818, + "learning_rate": 9.732136266855927e-05, + "loss": 0.09166755676269531, + "step": 18880 + }, + { + "epoch": 2.681334279630944, + "grad_norm": 7.124295711517334, + "learning_rate": 9.731994322214337e-05, + "loss": 0.14961253404617308, + "step": 18890 + }, + { + "epoch": 2.6827537260468417, + "grad_norm": 8.885273933410645, + "learning_rate": 9.731852377572747e-05, + "loss": 0.1840854525566101, + "step": 18900 + }, + { + 
"epoch": 2.6841731724627396, + "grad_norm": 14.617013931274414, + "learning_rate": 9.731710432931156e-05, + "loss": 0.15676331520080566, + "step": 18910 + }, + { + "epoch": 2.6855926188786374, + "grad_norm": 5.9459452629089355, + "learning_rate": 9.731568488289568e-05, + "loss": 0.13418021202087402, + "step": 18920 + }, + { + "epoch": 2.6870120652945353, + "grad_norm": 1.0808570384979248, + "learning_rate": 9.731426543647977e-05, + "loss": 0.15757611989974976, + "step": 18930 + }, + { + "epoch": 2.6884315117104327, + "grad_norm": 1.862561583518982, + "learning_rate": 9.731284599006388e-05, + "loss": 0.09019602537155151, + "step": 18940 + }, + { + "epoch": 2.6898509581263306, + "grad_norm": 2.4577274322509766, + "learning_rate": 9.731142654364798e-05, + "loss": 0.06294019222259521, + "step": 18950 + }, + { + "epoch": 2.6912704045422284, + "grad_norm": 3.2663893699645996, + "learning_rate": 9.731000709723208e-05, + "loss": 0.06696848869323731, + "step": 18960 + }, + { + "epoch": 2.6926898509581263, + "grad_norm": 1.4709694385528564, + "learning_rate": 9.730858765081619e-05, + "loss": 0.061003082990646364, + "step": 18970 + }, + { + "epoch": 2.694109297374024, + "grad_norm": 2.4802117347717285, + "learning_rate": 9.730716820440029e-05, + "loss": 0.10601764917373657, + "step": 18980 + }, + { + "epoch": 2.695528743789922, + "grad_norm": 5.821985244750977, + "learning_rate": 9.73057487579844e-05, + "loss": 0.12596286535263063, + "step": 18990 + }, + { + "epoch": 2.69694819020582, + "grad_norm": 4.4037981033325195, + "learning_rate": 9.73043293115685e-05, + "loss": 0.08721169829368591, + "step": 19000 + }, + { + "epoch": 2.69694819020582, + "eval_accuracy": 0.9462071596617282, + "eval_loss": 0.1497952938079834, + "eval_runtime": 35.1407, + "eval_samples_per_second": 447.544, + "eval_steps_per_second": 14.001, + "step": 19000 + }, + { + "epoch": 2.6983676366217173, + "grad_norm": 5.155467987060547, + "learning_rate": 9.730290986515259e-05, + "loss": 0.16012940406799317, + 
"step": 19010 + }, + { + "epoch": 2.699787083037615, + "grad_norm": 6.539963245391846, + "learning_rate": 9.730149041873669e-05, + "loss": 0.13179491758346557, + "step": 19020 + }, + { + "epoch": 2.701206529453513, + "grad_norm": 5.117822647094727, + "learning_rate": 9.73000709723208e-05, + "loss": 0.11193997859954834, + "step": 19030 + }, + { + "epoch": 2.702625975869411, + "grad_norm": 13.319026947021484, + "learning_rate": 9.72986515259049e-05, + "loss": 0.06884243488311767, + "step": 19040 + }, + { + "epoch": 2.7040454222853088, + "grad_norm": 12.856066703796387, + "learning_rate": 9.729723207948901e-05, + "loss": 0.11155580282211304, + "step": 19050 + }, + { + "epoch": 2.7054648687012066, + "grad_norm": 3.3367395401000977, + "learning_rate": 9.72958126330731e-05, + "loss": 0.11018801927566528, + "step": 19060 + }, + { + "epoch": 2.7068843151171045, + "grad_norm": 2.5702414512634277, + "learning_rate": 9.72943931866572e-05, + "loss": 0.14847090244293212, + "step": 19070 + }, + { + "epoch": 2.708303761533002, + "grad_norm": 3.5079307556152344, + "learning_rate": 9.729297374024131e-05, + "loss": 0.12648016214370728, + "step": 19080 + }, + { + "epoch": 2.7097232079489, + "grad_norm": 7.1927642822265625, + "learning_rate": 9.729155429382541e-05, + "loss": 0.08001441359519959, + "step": 19090 + }, + { + "epoch": 2.7111426543647976, + "grad_norm": 2.3428845405578613, + "learning_rate": 9.729013484740952e-05, + "loss": 0.07565593719482422, + "step": 19100 + }, + { + "epoch": 2.7125621007806955, + "grad_norm": 5.344996929168701, + "learning_rate": 9.728871540099361e-05, + "loss": 0.06011520624160767, + "step": 19110 + }, + { + "epoch": 2.7139815471965933, + "grad_norm": 3.558228015899658, + "learning_rate": 9.728729595457772e-05, + "loss": 0.13906779289245605, + "step": 19120 + }, + { + "epoch": 2.715400993612491, + "grad_norm": 2.2271339893341064, + "learning_rate": 9.728587650816182e-05, + "loss": 0.06516092419624328, + "step": 19130 + }, + { + "epoch": 
2.716820440028389, + "grad_norm": 6.620656490325928, + "learning_rate": 9.728445706174593e-05, + "loss": 0.08588937520980836, + "step": 19140 + }, + { + "epoch": 2.7182398864442865, + "grad_norm": 0.9995052218437195, + "learning_rate": 9.728303761533004e-05, + "loss": 0.07684165835380555, + "step": 19150 + }, + { + "epoch": 2.719659332860185, + "grad_norm": 2.3631653785705566, + "learning_rate": 9.728161816891412e-05, + "loss": 0.08287461400032044, + "step": 19160 + }, + { + "epoch": 2.721078779276082, + "grad_norm": 6.304315567016602, + "learning_rate": 9.728019872249823e-05, + "loss": 0.14411957263946534, + "step": 19170 + }, + { + "epoch": 2.72249822569198, + "grad_norm": 2.651029109954834, + "learning_rate": 9.727877927608233e-05, + "loss": 0.16562498807907106, + "step": 19180 + }, + { + "epoch": 2.723917672107878, + "grad_norm": 1.1602712869644165, + "learning_rate": 9.727735982966644e-05, + "loss": 0.0994363009929657, + "step": 19190 + }, + { + "epoch": 2.725337118523776, + "grad_norm": 2.081709384918213, + "learning_rate": 9.727594038325054e-05, + "loss": 0.1161266803741455, + "step": 19200 + }, + { + "epoch": 2.7267565649396737, + "grad_norm": 5.32574462890625, + "learning_rate": 9.727452093683463e-05, + "loss": 0.11266434192657471, + "step": 19210 + }, + { + "epoch": 2.728176011355571, + "grad_norm": 4.33624267578125, + "learning_rate": 9.727310149041873e-05, + "loss": 0.07457006573677064, + "step": 19220 + }, + { + "epoch": 2.7295954577714694, + "grad_norm": 9.516417503356934, + "learning_rate": 9.727168204400284e-05, + "loss": 0.06251566410064698, + "step": 19230 + }, + { + "epoch": 2.731014904187367, + "grad_norm": 7.441606044769287, + "learning_rate": 9.727026259758695e-05, + "loss": 0.11953941583633423, + "step": 19240 + }, + { + "epoch": 2.7324343506032647, + "grad_norm": 0.9915375113487244, + "learning_rate": 9.726884315117105e-05, + "loss": 0.10013129711151122, + "step": 19250 + }, + { + "epoch": 2.7338537970191625, + "grad_norm": 
6.937955379486084, + "learning_rate": 9.726742370475515e-05, + "loss": 0.13717392683029175, + "step": 19260 + }, + { + "epoch": 2.7352732434350604, + "grad_norm": 6.149573802947998, + "learning_rate": 9.726600425833925e-05, + "loss": 0.11093438863754272, + "step": 19270 + }, + { + "epoch": 2.7366926898509583, + "grad_norm": 4.646894454956055, + "learning_rate": 9.726458481192336e-05, + "loss": 0.15733885765075684, + "step": 19280 + }, + { + "epoch": 2.7381121362668557, + "grad_norm": 5.516530513763428, + "learning_rate": 9.726316536550745e-05, + "loss": 0.06147825121879578, + "step": 19290 + }, + { + "epoch": 2.739531582682754, + "grad_norm": 3.121425151824951, + "learning_rate": 9.726174591909157e-05, + "loss": 0.06866928935050964, + "step": 19300 + }, + { + "epoch": 2.7409510290986514, + "grad_norm": 7.502362251281738, + "learning_rate": 9.726032647267565e-05, + "loss": 0.08418467044830322, + "step": 19310 + }, + { + "epoch": 2.7423704755145493, + "grad_norm": 2.791508436203003, + "learning_rate": 9.725890702625976e-05, + "loss": 0.11801939010620117, + "step": 19320 + }, + { + "epoch": 2.743789921930447, + "grad_norm": 7.064516544342041, + "learning_rate": 9.725748757984387e-05, + "loss": 0.13972241878509523, + "step": 19330 + }, + { + "epoch": 2.745209368346345, + "grad_norm": 5.9328932762146, + "learning_rate": 9.725606813342797e-05, + "loss": 0.12251147031784057, + "step": 19340 + }, + { + "epoch": 2.746628814762243, + "grad_norm": 6.175622940063477, + "learning_rate": 9.725464868701208e-05, + "loss": 0.06602987051010131, + "step": 19350 + }, + { + "epoch": 2.7480482611781403, + "grad_norm": 4.53786563873291, + "learning_rate": 9.725322924059618e-05, + "loss": 0.1297551393508911, + "step": 19360 + }, + { + "epoch": 2.7494677075940386, + "grad_norm": 3.098621368408203, + "learning_rate": 9.725180979418027e-05, + "loss": 0.1370749831199646, + "step": 19370 + }, + { + "epoch": 2.750887154009936, + "grad_norm": 3.015416383743286, + "learning_rate": 
9.725039034776437e-05, + "loss": 0.12202317714691162, + "step": 19380 + }, + { + "epoch": 2.752306600425834, + "grad_norm": 2.518812656402588, + "learning_rate": 9.724897090134848e-05, + "loss": 0.08936739563941956, + "step": 19390 + }, + { + "epoch": 2.7537260468417317, + "grad_norm": 6.073837757110596, + "learning_rate": 9.724755145493258e-05, + "loss": 0.1370900511741638, + "step": 19400 + }, + { + "epoch": 2.7551454932576296, + "grad_norm": 5.372803211212158, + "learning_rate": 9.724613200851669e-05, + "loss": 0.16160420179367066, + "step": 19410 + }, + { + "epoch": 2.7565649396735274, + "grad_norm": 3.8927814960479736, + "learning_rate": 9.724471256210079e-05, + "loss": 0.18655315637588502, + "step": 19420 + }, + { + "epoch": 2.757984386089425, + "grad_norm": 6.601566314697266, + "learning_rate": 9.724329311568489e-05, + "loss": 0.06503421068191528, + "step": 19430 + }, + { + "epoch": 2.759403832505323, + "grad_norm": 8.965290069580078, + "learning_rate": 9.7241873669269e-05, + "loss": 0.15749263763427734, + "step": 19440 + }, + { + "epoch": 2.7608232789212206, + "grad_norm": 6.057149410247803, + "learning_rate": 9.72404542228531e-05, + "loss": 0.09035987257957459, + "step": 19450 + }, + { + "epoch": 2.7622427253371185, + "grad_norm": 3.8677871227264404, + "learning_rate": 9.72390347764372e-05, + "loss": 0.09661787152290344, + "step": 19460 + }, + { + "epoch": 2.7636621717530163, + "grad_norm": 1.7954285144805908, + "learning_rate": 9.723761533002129e-05, + "loss": 0.11428978443145751, + "step": 19470 + }, + { + "epoch": 2.765081618168914, + "grad_norm": 8.921133041381836, + "learning_rate": 9.72361958836054e-05, + "loss": 0.14268529415130615, + "step": 19480 + }, + { + "epoch": 2.766501064584812, + "grad_norm": 0.6554881930351257, + "learning_rate": 9.72347764371895e-05, + "loss": 0.05844693183898926, + "step": 19490 + }, + { + "epoch": 2.7679205110007095, + "grad_norm": 1.2021902799606323, + "learning_rate": 9.723335699077361e-05, + "loss": 
0.048795363306999205, + "step": 19500 + }, + { + "epoch": 2.7679205110007095, + "eval_accuracy": 0.9642016913588097, + "eval_loss": 0.10457975417375565, + "eval_runtime": 32.1695, + "eval_samples_per_second": 488.879, + "eval_steps_per_second": 15.294, + "step": 19500 + }, + { + "epoch": 2.7693399574166078, + "grad_norm": 3.3482987880706787, + "learning_rate": 9.72319375443577e-05, + "loss": 0.08013315200805664, + "step": 19510 + }, + { + "epoch": 2.770759403832505, + "grad_norm": 7.4644036293029785, + "learning_rate": 9.72305180979418e-05, + "loss": 0.12772181034088134, + "step": 19520 + }, + { + "epoch": 2.772178850248403, + "grad_norm": 4.970337390899658, + "learning_rate": 9.722909865152591e-05, + "loss": 0.08325361609458923, + "step": 19530 + }, + { + "epoch": 2.773598296664301, + "grad_norm": 5.109130382537842, + "learning_rate": 9.722767920511001e-05, + "loss": 0.12823007106781006, + "step": 19540 + }, + { + "epoch": 2.7750177430801988, + "grad_norm": Infinity, + "learning_rate": 9.722625975869412e-05, + "loss": 0.07545018792152405, + "step": 19550 + }, + { + "epoch": 2.7764371894960966, + "grad_norm": 2.3274765014648438, + "learning_rate": 9.72249822569198e-05, + "loss": 0.09213562607765198, + "step": 19560 + }, + { + "epoch": 2.777856635911994, + "grad_norm": 1.3119785785675049, + "learning_rate": 9.72235628105039e-05, + "loss": 0.09134193658828735, + "step": 19570 + }, + { + "epoch": 2.7792760823278924, + "grad_norm": 1.7308454513549805, + "learning_rate": 9.722214336408801e-05, + "loss": 0.07336680889129639, + "step": 19580 + }, + { + "epoch": 2.78069552874379, + "grad_norm": 5.1270623207092285, + "learning_rate": 9.722072391767211e-05, + "loss": 0.10246673822402955, + "step": 19590 + }, + { + "epoch": 2.7821149751596876, + "grad_norm": 8.638457298278809, + "learning_rate": 9.721930447125621e-05, + "loss": 0.15175464153289794, + "step": 19600 + }, + { + "epoch": 2.7835344215755855, + "grad_norm": 2.7487826347351074, + "learning_rate": 
9.721788502484032e-05, + "loss": 0.09026304483413697, + "step": 19610 + }, + { + "epoch": 2.7849538679914834, + "grad_norm": 1.0804003477096558, + "learning_rate": 9.721646557842442e-05, + "loss": 0.1334142804145813, + "step": 19620 + }, + { + "epoch": 2.7863733144073812, + "grad_norm": 4.871701717376709, + "learning_rate": 9.721504613200853e-05, + "loss": 0.0774698793888092, + "step": 19630 + }, + { + "epoch": 2.7877927608232786, + "grad_norm": 5.122735500335693, + "learning_rate": 9.721362668559261e-05, + "loss": 0.0750051498413086, + "step": 19640 + }, + { + "epoch": 2.789212207239177, + "grad_norm": 4.928715705871582, + "learning_rate": 9.721220723917672e-05, + "loss": 0.10383319854736328, + "step": 19650 + }, + { + "epoch": 2.7906316536550744, + "grad_norm": 4.654665470123291, + "learning_rate": 9.721078779276082e-05, + "loss": 0.07332990169525147, + "step": 19660 + }, + { + "epoch": 2.7920511000709722, + "grad_norm": 9.121614456176758, + "learning_rate": 9.720936834634493e-05, + "loss": 0.17799346446990966, + "step": 19670 + }, + { + "epoch": 2.79347054648687, + "grad_norm": 0.8097667694091797, + "learning_rate": 9.720794889992903e-05, + "loss": 0.13993927240371704, + "step": 19680 + }, + { + "epoch": 2.794889992902768, + "grad_norm": 6.301029682159424, + "learning_rate": 9.720652945351314e-05, + "loss": 0.049062016606330874, + "step": 19690 + }, + { + "epoch": 2.796309439318666, + "grad_norm": 7.916932582855225, + "learning_rate": 9.720511000709724e-05, + "loss": 0.13611079454421998, + "step": 19700 + }, + { + "epoch": 2.7977288857345637, + "grad_norm": 6.278209209442139, + "learning_rate": 9.720369056068134e-05, + "loss": 0.12774984836578368, + "step": 19710 + }, + { + "epoch": 2.7991483321504615, + "grad_norm": 8.645759582519531, + "learning_rate": 9.720227111426545e-05, + "loss": 0.09328774213790894, + "step": 19720 + }, + { + "epoch": 2.800567778566359, + "grad_norm": 3.0282325744628906, + "learning_rate": 9.720085166784954e-05, + "loss": 
0.0923624575138092, + "step": 19730 + }, + { + "epoch": 2.801987224982257, + "grad_norm": 4.2578444480896, + "learning_rate": 9.719943222143365e-05, + "loss": 0.09177879095077515, + "step": 19740 + }, + { + "epoch": 2.8034066713981547, + "grad_norm": 7.6798996925354, + "learning_rate": 9.719801277501774e-05, + "loss": 0.12493581771850586, + "step": 19750 + }, + { + "epoch": 2.8048261178140526, + "grad_norm": 4.347507953643799, + "learning_rate": 9.719659332860185e-05, + "loss": 0.09963855147361755, + "step": 19760 + }, + { + "epoch": 2.8062455642299504, + "grad_norm": 4.931194305419922, + "learning_rate": 9.719517388218595e-05, + "loss": 0.07842986583709717, + "step": 19770 + }, + { + "epoch": 2.8076650106458483, + "grad_norm": 4.186477184295654, + "learning_rate": 9.719375443577006e-05, + "loss": 0.12233660221099854, + "step": 19780 + }, + { + "epoch": 2.809084457061746, + "grad_norm": 7.659719944000244, + "learning_rate": 9.719233498935415e-05, + "loss": 0.09655895829200745, + "step": 19790 + }, + { + "epoch": 2.8105039034776436, + "grad_norm": 0.47399571537971497, + "learning_rate": 9.719091554293825e-05, + "loss": 0.07599647045135498, + "step": 19800 + }, + { + "epoch": 2.8119233498935414, + "grad_norm": 4.59540319442749, + "learning_rate": 9.718949609652236e-05, + "loss": 0.07412179708480834, + "step": 19810 + }, + { + "epoch": 2.8133427963094393, + "grad_norm": 8.436945915222168, + "learning_rate": 9.718807665010646e-05, + "loss": 0.10687708854675293, + "step": 19820 + }, + { + "epoch": 2.814762242725337, + "grad_norm": 4.068880081176758, + "learning_rate": 9.718665720369057e-05, + "loss": 0.04072721004486084, + "step": 19830 + }, + { + "epoch": 2.816181689141235, + "grad_norm": 8.406689643859863, + "learning_rate": 9.718523775727467e-05, + "loss": 0.09728883504867554, + "step": 19840 + }, + { + "epoch": 2.817601135557133, + "grad_norm": 2.9611806869506836, + "learning_rate": 9.718381831085877e-05, + "loss": 0.0824375331401825, + "step": 19850 + }, + { + 
"epoch": 2.8190205819730307, + "grad_norm": 8.75788402557373, + "learning_rate": 9.718239886444286e-05, + "loss": 0.13575732707977295, + "step": 19860 + }, + { + "epoch": 2.820440028388928, + "grad_norm": 2.153355598449707, + "learning_rate": 9.718097941802697e-05, + "loss": 0.0826115369796753, + "step": 19870 + }, + { + "epoch": 2.821859474804826, + "grad_norm": 5.776090145111084, + "learning_rate": 9.717955997161107e-05, + "loss": 0.07727134227752686, + "step": 19880 + }, + { + "epoch": 2.823278921220724, + "grad_norm": 10.297713279724121, + "learning_rate": 9.717814052519518e-05, + "loss": 0.08978387117385864, + "step": 19890 + }, + { + "epoch": 2.8246983676366217, + "grad_norm": 4.710965156555176, + "learning_rate": 9.717672107877928e-05, + "loss": 0.14321819543838502, + "step": 19900 + }, + { + "epoch": 2.8261178140525196, + "grad_norm": 4.13072395324707, + "learning_rate": 9.717530163236338e-05, + "loss": 0.15760390758514403, + "step": 19910 + }, + { + "epoch": 2.8275372604684175, + "grad_norm": 0.497278094291687, + "learning_rate": 9.717388218594749e-05, + "loss": 0.08274838328361511, + "step": 19920 + }, + { + "epoch": 2.8289567068843153, + "grad_norm": 7.707274913787842, + "learning_rate": 9.717246273953159e-05, + "loss": 0.09570494294166565, + "step": 19930 + }, + { + "epoch": 2.8303761533002127, + "grad_norm": 5.368363857269287, + "learning_rate": 9.71710432931157e-05, + "loss": 0.07190582752227784, + "step": 19940 + }, + { + "epoch": 2.8317955997161106, + "grad_norm": 7.027709484100342, + "learning_rate": 9.716962384669978e-05, + "loss": 0.11582446098327637, + "step": 19950 + }, + { + "epoch": 2.8332150461320085, + "grad_norm": 12.213539123535156, + "learning_rate": 9.716820440028389e-05, + "loss": 0.10933125019073486, + "step": 19960 + }, + { + "epoch": 2.8346344925479063, + "grad_norm": 6.922082901000977, + "learning_rate": 9.716678495386799e-05, + "loss": 0.16551480293273926, + "step": 19970 + }, + { + "epoch": 2.836053938963804, + "grad_norm": 
3.005093812942505, + "learning_rate": 9.71653655074521e-05, + "loss": 0.09381322860717774, + "step": 19980 + }, + { + "epoch": 2.837473385379702, + "grad_norm": 5.592711925506592, + "learning_rate": 9.716394606103621e-05, + "loss": 0.11934515237808227, + "step": 19990 + }, + { + "epoch": 2.8388928317956, + "grad_norm": 2.7002058029174805, + "learning_rate": 9.71625266146203e-05, + "loss": 0.06390081644058228, + "step": 20000 + }, + { + "epoch": 2.8388928317956, + "eval_accuracy": 0.9484326317797418, + "eval_loss": 0.14968876540660858, + "eval_runtime": 33.8107, + "eval_samples_per_second": 465.148, + "eval_steps_per_second": 14.552, + "step": 20000 + }, + { + "epoch": 2.8403122782114973, + "grad_norm": 4.695428371429443, + "learning_rate": 9.71611071682044e-05, + "loss": 0.11333894729614258, + "step": 20010 + }, + { + "epoch": 2.841731724627395, + "grad_norm": 0.6784132719039917, + "learning_rate": 9.71596877217885e-05, + "loss": 0.09425503015518188, + "step": 20020 + }, + { + "epoch": 2.843151171043293, + "grad_norm": 7.540246963500977, + "learning_rate": 9.715826827537261e-05, + "loss": 0.15037193298339843, + "step": 20030 + }, + { + "epoch": 2.844570617459191, + "grad_norm": 1.3910176753997803, + "learning_rate": 9.715684882895671e-05, + "loss": 0.10529568195343017, + "step": 20040 + }, + { + "epoch": 2.845990063875089, + "grad_norm": 10.363840103149414, + "learning_rate": 9.715542938254082e-05, + "loss": 0.13602850437164307, + "step": 20050 + }, + { + "epoch": 2.8474095102909867, + "grad_norm": 9.801745414733887, + "learning_rate": 9.71540099361249e-05, + "loss": 0.09394903779029846, + "step": 20060 + }, + { + "epoch": 2.8488289567068845, + "grad_norm": 4.273351192474365, + "learning_rate": 9.715259048970902e-05, + "loss": 0.12311586141586303, + "step": 20070 + }, + { + "epoch": 2.850248403122782, + "grad_norm": 11.77322006225586, + "learning_rate": 9.715117104329313e-05, + "loss": 0.12338924407958984, + "step": 20080 + }, + { + "epoch": 2.85166784953868, + 
"grad_norm": 2.7312419414520264, + "learning_rate": 9.714975159687723e-05, + "loss": 0.06953715085983277, + "step": 20090 + }, + { + "epoch": 2.8530872959545777, + "grad_norm": 5.562644958496094, + "learning_rate": 9.714833215046134e-05, + "loss": 0.06668174266815186, + "step": 20100 + }, + { + "epoch": 2.8545067423704755, + "grad_norm": 6.543910980224609, + "learning_rate": 9.714691270404542e-05, + "loss": 0.11938363313674927, + "step": 20110 + }, + { + "epoch": 2.8559261887863734, + "grad_norm": 1.5311610698699951, + "learning_rate": 9.714549325762953e-05, + "loss": 0.0953073263168335, + "step": 20120 + }, + { + "epoch": 2.8573456352022713, + "grad_norm": 10.13642406463623, + "learning_rate": 9.714407381121363e-05, + "loss": 0.10842293500900269, + "step": 20130 + }, + { + "epoch": 2.858765081618169, + "grad_norm": 6.405614376068115, + "learning_rate": 9.714265436479774e-05, + "loss": 0.18160440921783447, + "step": 20140 + }, + { + "epoch": 2.8601845280340665, + "grad_norm": 8.15994644165039, + "learning_rate": 9.714123491838184e-05, + "loss": 0.15880486965179444, + "step": 20150 + }, + { + "epoch": 2.8616039744499644, + "grad_norm": 9.660137176513672, + "learning_rate": 9.713981547196593e-05, + "loss": 0.1277371048927307, + "step": 20160 + }, + { + "epoch": 2.8630234208658623, + "grad_norm": 13.830092430114746, + "learning_rate": 9.713839602555004e-05, + "loss": 0.12971055507659912, + "step": 20170 + }, + { + "epoch": 2.86444286728176, + "grad_norm": 3.822737455368042, + "learning_rate": 9.713697657913414e-05, + "loss": 0.16139203310012817, + "step": 20180 + }, + { + "epoch": 2.865862313697658, + "grad_norm": 2.0092313289642334, + "learning_rate": 9.713555713271825e-05, + "loss": 0.06620528101921082, + "step": 20190 + }, + { + "epoch": 2.867281760113556, + "grad_norm": 3.479095458984375, + "learning_rate": 9.713413768630235e-05, + "loss": 0.10068619251251221, + "step": 20200 + }, + { + "epoch": 2.8687012065294537, + "grad_norm": 1.8399436473846436, + 
"learning_rate": 9.713271823988645e-05, + "loss": 0.07809083461761475, + "step": 20210 + }, + { + "epoch": 2.870120652945351, + "grad_norm": 1.2535580396652222, + "learning_rate": 9.713129879347055e-05, + "loss": 0.10528775453567504, + "step": 20220 + }, + { + "epoch": 2.871540099361249, + "grad_norm": 5.34690523147583, + "learning_rate": 9.712987934705466e-05, + "loss": 0.09714440107345582, + "step": 20230 + }, + { + "epoch": 2.872959545777147, + "grad_norm": 3.72548770904541, + "learning_rate": 9.712845990063875e-05, + "loss": 0.05409139394760132, + "step": 20240 + }, + { + "epoch": 2.8743789921930447, + "grad_norm": 4.422288417816162, + "learning_rate": 9.712704045422286e-05, + "loss": 0.0929717779159546, + "step": 20250 + }, + { + "epoch": 2.8757984386089426, + "grad_norm": 1.4169726371765137, + "learning_rate": 9.712562100780696e-05, + "loss": 0.04481082260608673, + "step": 20260 + }, + { + "epoch": 2.8772178850248404, + "grad_norm": 3.0234224796295166, + "learning_rate": 9.712420156139106e-05, + "loss": 0.14665982723236085, + "step": 20270 + }, + { + "epoch": 2.8786373314407383, + "grad_norm": 0.8741635680198669, + "learning_rate": 9.712278211497517e-05, + "loss": 0.057705503702163694, + "step": 20280 + }, + { + "epoch": 2.8800567778566357, + "grad_norm": 1.1250085830688477, + "learning_rate": 9.712136266855927e-05, + "loss": 0.1067537546157837, + "step": 20290 + }, + { + "epoch": 2.8814762242725336, + "grad_norm": 10.388190269470215, + "learning_rate": 9.711994322214338e-05, + "loss": 0.10462450981140137, + "step": 20300 + }, + { + "epoch": 2.8828956706884314, + "grad_norm": 3.0416109561920166, + "learning_rate": 9.711852377572746e-05, + "loss": 0.10544465780258179, + "step": 20310 + }, + { + "epoch": 2.8843151171043293, + "grad_norm": 5.297311782836914, + "learning_rate": 9.711710432931157e-05, + "loss": 0.06729884147644043, + "step": 20320 + }, + { + "epoch": 2.885734563520227, + "grad_norm": 2.5105323791503906, + "learning_rate": 9.711568488289567e-05, + 
"loss": 0.08199673295021057, + "step": 20330 + }, + { + "epoch": 2.887154009936125, + "grad_norm": 2.514965057373047, + "learning_rate": 9.711426543647978e-05, + "loss": 0.07696297764778137, + "step": 20340 + }, + { + "epoch": 2.888573456352023, + "grad_norm": 2.9623782634735107, + "learning_rate": 9.711284599006388e-05, + "loss": 0.06418653130531311, + "step": 20350 + }, + { + "epoch": 2.8899929027679203, + "grad_norm": 7.9242777824401855, + "learning_rate": 9.711142654364798e-05, + "loss": 0.10036368370056152, + "step": 20360 + }, + { + "epoch": 2.891412349183818, + "grad_norm": 0.3050519526004791, + "learning_rate": 9.711000709723209e-05, + "loss": 0.09710363149642945, + "step": 20370 + }, + { + "epoch": 2.892831795599716, + "grad_norm": 4.167988300323486, + "learning_rate": 9.710858765081618e-05, + "loss": 0.09933966994285584, + "step": 20380 + }, + { + "epoch": 2.894251242015614, + "grad_norm": 4.994990348815918, + "learning_rate": 9.71071682044003e-05, + "loss": 0.14826220273971558, + "step": 20390 + }, + { + "epoch": 2.8956706884315118, + "grad_norm": 5.276573657989502, + "learning_rate": 9.710574875798439e-05, + "loss": 0.06008061766624451, + "step": 20400 + }, + { + "epoch": 2.8970901348474096, + "grad_norm": 1.4481778144836426, + "learning_rate": 9.71043293115685e-05, + "loss": 0.07454321980476379, + "step": 20410 + }, + { + "epoch": 2.8985095812633075, + "grad_norm": 3.215022087097168, + "learning_rate": 9.710290986515259e-05, + "loss": 0.11371394395828247, + "step": 20420 + }, + { + "epoch": 2.899929027679205, + "grad_norm": 7.932292461395264, + "learning_rate": 9.71014904187367e-05, + "loss": 0.1307593822479248, + "step": 20430 + }, + { + "epoch": 2.9013484740951028, + "grad_norm": 3.419353723526001, + "learning_rate": 9.71000709723208e-05, + "loss": 0.11492658853530884, + "step": 20440 + }, + { + "epoch": 2.9027679205110006, + "grad_norm": 1.6420551538467407, + "learning_rate": 9.70986515259049e-05, + "loss": 0.09474117159843445, + "step": 20450 + }, 
+ { + "epoch": 2.9041873669268985, + "grad_norm": 1.5180848836898804, + "learning_rate": 9.7097232079489e-05, + "loss": 0.16010476350784303, + "step": 20460 + }, + { + "epoch": 2.9056068133427964, + "grad_norm": 7.387273788452148, + "learning_rate": 9.70958126330731e-05, + "loss": 0.12979986667633056, + "step": 20470 + }, + { + "epoch": 2.907026259758694, + "grad_norm": 2.0460073947906494, + "learning_rate": 9.709439318665721e-05, + "loss": 0.09822458028793335, + "step": 20480 + }, + { + "epoch": 2.908445706174592, + "grad_norm": 8.7783784866333, + "learning_rate": 9.709297374024131e-05, + "loss": 0.10728850364685058, + "step": 20490 + }, + { + "epoch": 2.9098651525904895, + "grad_norm": 10.74223804473877, + "learning_rate": 9.709155429382542e-05, + "loss": 0.14357963800430298, + "step": 20500 + }, + { + "epoch": 2.9098651525904895, + "eval_accuracy": 0.9639473516881796, + "eval_loss": 0.10387223958969116, + "eval_runtime": 32.6025, + "eval_samples_per_second": 482.386, + "eval_steps_per_second": 15.091, + "step": 20500 + }, + { + "epoch": 2.9112845990063874, + "grad_norm": 6.561285495758057, + "learning_rate": 9.709013484740952e-05, + "loss": 0.09862427711486817, + "step": 20510 + }, + { + "epoch": 2.9127040454222852, + "grad_norm": 9.80976390838623, + "learning_rate": 9.708871540099362e-05, + "loss": 0.10329036712646485, + "step": 20520 + }, + { + "epoch": 2.914123491838183, + "grad_norm": 3.2249553203582764, + "learning_rate": 9.708729595457771e-05, + "loss": 0.09398200511932372, + "step": 20530 + }, + { + "epoch": 2.915542938254081, + "grad_norm": 3.0429465770721436, + "learning_rate": 9.708587650816182e-05, + "loss": 0.08952078223228455, + "step": 20540 + }, + { + "epoch": 2.916962384669979, + "grad_norm": 2.384573459625244, + "learning_rate": 9.708445706174592e-05, + "loss": 0.06835871934890747, + "step": 20550 + }, + { + "epoch": 2.9183818310858767, + "grad_norm": 2.1423826217651367, + "learning_rate": 9.708303761533003e-05, + "loss": 0.07619114518165589, + 
"step": 20560 + }, + { + "epoch": 2.919801277501774, + "grad_norm": 3.932051181793213, + "learning_rate": 9.708161816891413e-05, + "loss": 0.11714667081832886, + "step": 20570 + }, + { + "epoch": 2.921220723917672, + "grad_norm": 5.277032852172852, + "learning_rate": 9.708019872249823e-05, + "loss": 0.07001240253448486, + "step": 20580 + }, + { + "epoch": 2.92264017033357, + "grad_norm": 5.1413798332214355, + "learning_rate": 9.707877927608234e-05, + "loss": 0.09098179340362549, + "step": 20590 + }, + { + "epoch": 2.9240596167494677, + "grad_norm": 5.64100456237793, + "learning_rate": 9.707735982966644e-05, + "loss": 0.10666381120681763, + "step": 20600 + }, + { + "epoch": 2.9254790631653655, + "grad_norm": 9.501540184020996, + "learning_rate": 9.707594038325055e-05, + "loss": 0.05949283242225647, + "step": 20610 + }, + { + "epoch": 2.9268985095812634, + "grad_norm": 6.489498138427734, + "learning_rate": 9.707452093683463e-05, + "loss": 0.083852881193161, + "step": 20620 + }, + { + "epoch": 2.9283179559971613, + "grad_norm": 1.9999171495437622, + "learning_rate": 9.707310149041874e-05, + "loss": 0.08824072480201721, + "step": 20630 + }, + { + "epoch": 2.9297374024130587, + "grad_norm": 10.467041015625, + "learning_rate": 9.707168204400284e-05, + "loss": 0.22370665073394774, + "step": 20640 + }, + { + "epoch": 2.9311568488289566, + "grad_norm": 3.191193103790283, + "learning_rate": 9.707026259758695e-05, + "loss": 0.08672508597373962, + "step": 20650 + }, + { + "epoch": 2.9325762952448544, + "grad_norm": 8.910825729370117, + "learning_rate": 9.706884315117105e-05, + "loss": 0.09984519481658935, + "step": 20660 + }, + { + "epoch": 2.9339957416607523, + "grad_norm": 5.282776832580566, + "learning_rate": 9.706742370475514e-05, + "loss": 0.12132351398468018, + "step": 20670 + }, + { + "epoch": 2.93541518807665, + "grad_norm": 6.024061679840088, + "learning_rate": 9.706600425833925e-05, + "loss": 0.0702341616153717, + "step": 20680 + }, + { + "epoch": 2.936834634492548, 
+ "grad_norm": 3.016757011413574, + "learning_rate": 9.706458481192335e-05, + "loss": 0.18172093629837036, + "step": 20690 + }, + { + "epoch": 2.938254080908446, + "grad_norm": 6.451714515686035, + "learning_rate": 9.706316536550746e-05, + "loss": 0.16414980888366698, + "step": 20700 + }, + { + "epoch": 2.9396735273243433, + "grad_norm": 3.6543655395507812, + "learning_rate": 9.706174591909156e-05, + "loss": 0.06588509678840637, + "step": 20710 + }, + { + "epoch": 2.941092973740241, + "grad_norm": 2.2044341564178467, + "learning_rate": 9.706032647267566e-05, + "loss": 0.10182955265045165, + "step": 20720 + }, + { + "epoch": 2.942512420156139, + "grad_norm": 4.035127639770508, + "learning_rate": 9.705890702625976e-05, + "loss": 0.08563597202301025, + "step": 20730 + }, + { + "epoch": 2.943931866572037, + "grad_norm": 0.5155683159828186, + "learning_rate": 9.705748757984387e-05, + "loss": 0.061300069093704224, + "step": 20740 + }, + { + "epoch": 2.9453513129879347, + "grad_norm": 5.438033103942871, + "learning_rate": 9.705606813342796e-05, + "loss": 0.10039635896682739, + "step": 20750 + }, + { + "epoch": 2.9467707594038326, + "grad_norm": 4.031142711639404, + "learning_rate": 9.705464868701207e-05, + "loss": 0.13106780052185057, + "step": 20760 + }, + { + "epoch": 2.9481902058197305, + "grad_norm": 1.6434075832366943, + "learning_rate": 9.705322924059617e-05, + "loss": 0.06390889883041381, + "step": 20770 + }, + { + "epoch": 2.949609652235628, + "grad_norm": 0.5606821775436401, + "learning_rate": 9.705180979418027e-05, + "loss": 0.1445988893508911, + "step": 20780 + }, + { + "epoch": 2.9510290986515257, + "grad_norm": 8.509517669677734, + "learning_rate": 9.705039034776438e-05, + "loss": 0.11696761846542358, + "step": 20790 + }, + { + "epoch": 2.9524485450674236, + "grad_norm": 1.219256043434143, + "learning_rate": 9.704897090134848e-05, + "loss": 0.1131407618522644, + "step": 20800 + }, + { + "epoch": 2.9538679914833215, + "grad_norm": 4.903664588928223, + 
"learning_rate": 9.704755145493259e-05, + "loss": 0.12911027669906616, + "step": 20810 + }, + { + "epoch": 2.9552874378992193, + "grad_norm": 2.6238746643066406, + "learning_rate": 9.704613200851669e-05, + "loss": 0.04189004898071289, + "step": 20820 + }, + { + "epoch": 2.956706884315117, + "grad_norm": 4.9772443771362305, + "learning_rate": 9.704471256210078e-05, + "loss": 0.1641558289527893, + "step": 20830 + }, + { + "epoch": 2.958126330731015, + "grad_norm": 3.766991376876831, + "learning_rate": 9.704329311568488e-05, + "loss": 0.15985740423202516, + "step": 20840 + }, + { + "epoch": 2.9595457771469125, + "grad_norm": 0.883904218673706, + "learning_rate": 9.704187366926899e-05, + "loss": 0.04713291525840759, + "step": 20850 + }, + { + "epoch": 2.9609652235628108, + "grad_norm": 2.7351174354553223, + "learning_rate": 9.704045422285309e-05, + "loss": 0.08329285383224487, + "step": 20860 + }, + { + "epoch": 2.962384669978708, + "grad_norm": 7.424506187438965, + "learning_rate": 9.70390347764372e-05, + "loss": 0.1191827893257141, + "step": 20870 + }, + { + "epoch": 2.963804116394606, + "grad_norm": 2.405928134918213, + "learning_rate": 9.70376153300213e-05, + "loss": 0.03490549027919769, + "step": 20880 + }, + { + "epoch": 2.965223562810504, + "grad_norm": 2.498183488845825, + "learning_rate": 9.70361958836054e-05, + "loss": 0.043746381998062134, + "step": 20890 + }, + { + "epoch": 2.966643009226402, + "grad_norm": 5.296067237854004, + "learning_rate": 9.70347764371895e-05, + "loss": 0.12182191610336304, + "step": 20900 + }, + { + "epoch": 2.9680624556422996, + "grad_norm": 5.240711688995361, + "learning_rate": 9.70333569907736e-05, + "loss": 0.05737144351005554, + "step": 20910 + }, + { + "epoch": 2.969481902058197, + "grad_norm": 9.032751083374023, + "learning_rate": 9.703193754435771e-05, + "loss": 0.10139278173446656, + "step": 20920 + }, + { + "epoch": 2.9709013484740954, + "grad_norm": 8.68384838104248, + "learning_rate": 9.70305180979418e-05, + "loss": 
0.10488021373748779, + "step": 20930 + }, + { + "epoch": 2.972320794889993, + "grad_norm": 11.946162223815918, + "learning_rate": 9.702909865152591e-05, + "loss": 0.08643736839294433, + "step": 20940 + }, + { + "epoch": 2.9737402413058907, + "grad_norm": 7.999373435974121, + "learning_rate": 9.702767920511e-05, + "loss": 0.10784640312194824, + "step": 20950 + }, + { + "epoch": 2.9751596877217885, + "grad_norm": 10.503974914550781, + "learning_rate": 9.702625975869412e-05, + "loss": 0.14789512157440185, + "step": 20960 + }, + { + "epoch": 2.9765791341376864, + "grad_norm": 2.7038733959198, + "learning_rate": 9.702484031227821e-05, + "loss": 0.11931388378143311, + "step": 20970 + }, + { + "epoch": 2.9779985805535842, + "grad_norm": 4.435423374176025, + "learning_rate": 9.702342086586231e-05, + "loss": 0.1014961838722229, + "step": 20980 + }, + { + "epoch": 2.9794180269694817, + "grad_norm": 9.037029266357422, + "learning_rate": 9.702200141944642e-05, + "loss": 0.10802547931671143, + "step": 20990 + }, + { + "epoch": 2.98083747338538, + "grad_norm": 1.3593106269836426, + "learning_rate": 9.702058197303052e-05, + "loss": 0.06956174969673157, + "step": 21000 + }, + { + "epoch": 2.98083747338538, + "eval_accuracy": 0.9595599923698099, + "eval_loss": 0.12062688916921616, + "eval_runtime": 33.1203, + "eval_samples_per_second": 474.845, + "eval_steps_per_second": 14.855, + "step": 21000 + }, + { + "epoch": 2.9822569198012774, + "grad_norm": 0.6549391150474548, + "learning_rate": 9.701916252661463e-05, + "loss": 0.13815345764160156, + "step": 21010 + }, + { + "epoch": 2.9836763662171752, + "grad_norm": 6.318053722381592, + "learning_rate": 9.701774308019873e-05, + "loss": 0.126990008354187, + "step": 21020 + }, + { + "epoch": 2.985095812633073, + "grad_norm": 0.9818340539932251, + "learning_rate": 9.701632363378283e-05, + "loss": 0.1425946831703186, + "step": 21030 + }, + { + "epoch": 2.986515259048971, + "grad_norm": 7.161218643188477, + "learning_rate": 
9.701490418736692e-05, + "loss": 0.10167466402053833, + "step": 21040 + }, + { + "epoch": 2.987934705464869, + "grad_norm": 2.8544816970825195, + "learning_rate": 9.701348474095103e-05, + "loss": 0.052589023113250734, + "step": 21050 + }, + { + "epoch": 2.9893541518807663, + "grad_norm": 3.788613796234131, + "learning_rate": 9.701206529453513e-05, + "loss": 0.0730807602405548, + "step": 21060 + }, + { + "epoch": 2.9907735982966646, + "grad_norm": 3.1659812927246094, + "learning_rate": 9.701064584811924e-05, + "loss": 0.0667772889137268, + "step": 21070 + }, + { + "epoch": 2.992193044712562, + "grad_norm": 3.7923996448516846, + "learning_rate": 9.700922640170334e-05, + "loss": 0.09958038330078126, + "step": 21080 + }, + { + "epoch": 2.99361249112846, + "grad_norm": 0.3780229985713959, + "learning_rate": 9.700780695528744e-05, + "loss": 0.04535020887851715, + "step": 21090 + }, + { + "epoch": 2.9950319375443577, + "grad_norm": 6.924422264099121, + "learning_rate": 9.700638750887155e-05, + "loss": 0.07231849431991577, + "step": 21100 + }, + { + "epoch": 2.9964513839602556, + "grad_norm": 4.052742958068848, + "learning_rate": 9.700496806245565e-05, + "loss": 0.06652356386184692, + "step": 21110 + }, + { + "epoch": 2.9978708303761534, + "grad_norm": 2.4228880405426025, + "learning_rate": 9.700354861603976e-05, + "loss": 0.13166139125823975, + "step": 21120 + }, + { + "epoch": 2.999290276792051, + "grad_norm": 1.3871126174926758, + "learning_rate": 9.700212916962385e-05, + "loss": 0.0858015775680542, + "step": 21130 + }, + { + "epoch": 3.0007097232079487, + "grad_norm": 4.722600936889648, + "learning_rate": 9.700070972320795e-05, + "loss": 0.12765930891036986, + "step": 21140 + }, + { + "epoch": 3.0021291696238466, + "grad_norm": 1.4345152378082275, + "learning_rate": 9.699929027679205e-05, + "loss": 0.11781737804412842, + "step": 21150 + }, + { + "epoch": 3.0035486160397444, + "grad_norm": 4.4884352684021, + "learning_rate": 9.699787083037616e-05, + "loss": 
0.05820587873458862, + "step": 21160 + }, + { + "epoch": 3.0049680624556423, + "grad_norm": 2.4350528717041016, + "learning_rate": 9.699645138396026e-05, + "loss": 0.08642982244491577, + "step": 21170 + }, + { + "epoch": 3.00638750887154, + "grad_norm": 2.5722460746765137, + "learning_rate": 9.699503193754437e-05, + "loss": 0.13995343446731567, + "step": 21180 + }, + { + "epoch": 3.007806955287438, + "grad_norm": 8.12808895111084, + "learning_rate": 9.699361249112846e-05, + "loss": 0.10619027614593506, + "step": 21190 + }, + { + "epoch": 3.009226401703336, + "grad_norm": 9.416518211364746, + "learning_rate": 9.699219304471256e-05, + "loss": 0.10611592531204224, + "step": 21200 + }, + { + "epoch": 3.0106458481192333, + "grad_norm": 2.595517873764038, + "learning_rate": 9.699077359829667e-05, + "loss": 0.03944927752017975, + "step": 21210 + }, + { + "epoch": 3.012065294535131, + "grad_norm": 6.59434175491333, + "learning_rate": 9.698935415188077e-05, + "loss": 0.06297655701637268, + "step": 21220 + }, + { + "epoch": 3.013484740951029, + "grad_norm": 7.814486026763916, + "learning_rate": 9.698793470546488e-05, + "loss": 0.12234771251678467, + "step": 21230 + }, + { + "epoch": 3.014904187366927, + "grad_norm": 3.0475339889526367, + "learning_rate": 9.698651525904897e-05, + "loss": 0.10413910150527954, + "step": 21240 + }, + { + "epoch": 3.0163236337828248, + "grad_norm": 3.0739729404449463, + "learning_rate": 9.698509581263308e-05, + "loss": 0.048439356684684756, + "step": 21250 + }, + { + "epoch": 3.0177430801987226, + "grad_norm": 2.5247795581817627, + "learning_rate": 9.698367636621717e-05, + "loss": 0.08907513618469239, + "step": 21260 + }, + { + "epoch": 3.0191625266146205, + "grad_norm": 1.5360527038574219, + "learning_rate": 9.698225691980128e-05, + "loss": 0.08706284761428833, + "step": 21270 + }, + { + "epoch": 3.020581973030518, + "grad_norm": 2.9414641857147217, + "learning_rate": 9.698083747338538e-05, + "loss": 0.06573014259338379, + "step": 21280 + }, + { 
+ "epoch": 3.0220014194464158, + "grad_norm": 4.994847297668457, + "learning_rate": 9.697941802696948e-05, + "loss": 0.13635185956954957, + "step": 21290 + }, + { + "epoch": 3.0234208658623136, + "grad_norm": 1.8111882209777832, + "learning_rate": 9.697799858055359e-05, + "loss": 0.09840369820594788, + "step": 21300 + }, + { + "epoch": 3.0248403122782115, + "grad_norm": 1.4137115478515625, + "learning_rate": 9.697657913413769e-05, + "loss": 0.136954402923584, + "step": 21310 + }, + { + "epoch": 3.0262597586941093, + "grad_norm": 2.749936819076538, + "learning_rate": 9.697530163236339e-05, + "loss": 0.10054677724838257, + "step": 21320 + }, + { + "epoch": 3.027679205110007, + "grad_norm": 4.701079368591309, + "learning_rate": 9.697388218594748e-05, + "loss": 0.07355481386184692, + "step": 21330 + }, + { + "epoch": 3.029098651525905, + "grad_norm": 4.2811408042907715, + "learning_rate": 9.69724627395316e-05, + "loss": 0.07188469767570496, + "step": 21340 + }, + { + "epoch": 3.0305180979418025, + "grad_norm": 7.573612213134766, + "learning_rate": 9.697104329311569e-05, + "loss": 0.05111314058303833, + "step": 21350 + }, + { + "epoch": 3.0319375443577004, + "grad_norm": 3.0801517963409424, + "learning_rate": 9.696962384669979e-05, + "loss": 0.0739107072353363, + "step": 21360 + }, + { + "epoch": 3.033356990773598, + "grad_norm": 14.997776985168457, + "learning_rate": 9.696820440028389e-05, + "loss": 0.18201708793640137, + "step": 21370 + }, + { + "epoch": 3.034776437189496, + "grad_norm": 8.705801963806152, + "learning_rate": 9.6966784953868e-05, + "loss": 0.09414076805114746, + "step": 21380 + }, + { + "epoch": 3.036195883605394, + "grad_norm": 2.687983751296997, + "learning_rate": 9.69653655074521e-05, + "loss": 0.10116174221038818, + "step": 21390 + }, + { + "epoch": 3.037615330021292, + "grad_norm": 3.300055503845215, + "learning_rate": 9.69639460610362e-05, + "loss": 0.05839415788650513, + "step": 21400 + }, + { + "epoch": 3.0390347764371897, + "grad_norm": 
4.883892059326172, + "learning_rate": 9.69625266146203e-05, + "loss": 0.0997147798538208, + "step": 21410 + }, + { + "epoch": 3.040454222853087, + "grad_norm": 4.513243675231934, + "learning_rate": 9.69611071682044e-05, + "loss": 0.053650110960006714, + "step": 21420 + }, + { + "epoch": 3.041873669268985, + "grad_norm": 1.9839102029800415, + "learning_rate": 9.695968772178851e-05, + "loss": 0.1009899377822876, + "step": 21430 + }, + { + "epoch": 3.043293115684883, + "grad_norm": 3.678035259246826, + "learning_rate": 9.695826827537261e-05, + "loss": 0.09355159401893616, + "step": 21440 + }, + { + "epoch": 3.0447125621007807, + "grad_norm": 12.25532054901123, + "learning_rate": 9.695684882895672e-05, + "loss": 0.09784587025642395, + "step": 21450 + }, + { + "epoch": 3.0461320085166785, + "grad_norm": 10.08337688446045, + "learning_rate": 9.695542938254082e-05, + "loss": 0.22380528450012208, + "step": 21460 + }, + { + "epoch": 3.0475514549325764, + "grad_norm": 1.0703997611999512, + "learning_rate": 9.695400993612491e-05, + "loss": 0.03969487845897675, + "step": 21470 + }, + { + "epoch": 3.0489709013484743, + "grad_norm": 2.9388980865478516, + "learning_rate": 9.695259048970901e-05, + "loss": 0.07186501622200012, + "step": 21480 + }, + { + "epoch": 3.0503903477643717, + "grad_norm": 3.5290896892547607, + "learning_rate": 9.695117104329312e-05, + "loss": 0.07260159850120544, + "step": 21490 + }, + { + "epoch": 3.0518097941802695, + "grad_norm": 2.9938881397247314, + "learning_rate": 9.694975159687722e-05, + "loss": 0.09509387612342834, + "step": 21500 + }, + { + "epoch": 3.0518097941802695, + "eval_accuracy": 0.9589877281108921, + "eval_loss": 0.12867264449596405, + "eval_runtime": 31.3789, + "eval_samples_per_second": 501.196, + "eval_steps_per_second": 15.679, + "step": 21500 + }, + { + "epoch": 3.0532292405961674, + "grad_norm": 10.751752853393555, + "learning_rate": 9.694833215046133e-05, + "loss": 0.08080363273620605, + "step": 21510 + }, + { + "epoch": 
3.0546486870120653, + "grad_norm": 2.299959659576416, + "learning_rate": 9.694691270404543e-05, + "loss": 0.14854525327682494, + "step": 21520 + }, + { + "epoch": 3.056068133427963, + "grad_norm": 4.220566272735596, + "learning_rate": 9.694549325762953e-05, + "loss": 0.09466566443443299, + "step": 21530 + }, + { + "epoch": 3.057487579843861, + "grad_norm": 6.087703704833984, + "learning_rate": 9.694407381121364e-05, + "loss": 0.09965238571166993, + "step": 21540 + }, + { + "epoch": 3.058907026259759, + "grad_norm": 8.385695457458496, + "learning_rate": 9.694265436479773e-05, + "loss": 0.10562925338745117, + "step": 21550 + }, + { + "epoch": 3.0603264726756567, + "grad_norm": 0.5750550031661987, + "learning_rate": 9.694123491838185e-05, + "loss": 0.07159033417701721, + "step": 21560 + }, + { + "epoch": 3.061745919091554, + "grad_norm": 5.470452308654785, + "learning_rate": 9.693981547196593e-05, + "loss": 0.1067430019378662, + "step": 21570 + }, + { + "epoch": 3.063165365507452, + "grad_norm": 1.6126492023468018, + "learning_rate": 9.693839602555004e-05, + "loss": 0.07778850793838502, + "step": 21580 + }, + { + "epoch": 3.06458481192335, + "grad_norm": 8.54702377319336, + "learning_rate": 9.693697657913414e-05, + "loss": 0.051190412044525145, + "step": 21590 + }, + { + "epoch": 3.0660042583392477, + "grad_norm": 5.9458818435668945, + "learning_rate": 9.693555713271825e-05, + "loss": 0.05976734161376953, + "step": 21600 + }, + { + "epoch": 3.0674237047551456, + "grad_norm": 11.962884902954102, + "learning_rate": 9.693413768630235e-05, + "loss": 0.08366570472717286, + "step": 21610 + }, + { + "epoch": 3.0688431511710434, + "grad_norm": 5.248124122619629, + "learning_rate": 9.693271823988644e-05, + "loss": 0.06071932911872864, + "step": 21620 + }, + { + "epoch": 3.0702625975869413, + "grad_norm": 3.1197493076324463, + "learning_rate": 9.693129879347055e-05, + "loss": 0.08671906590461731, + "step": 21630 + }, + { + "epoch": 3.0716820440028387, + "grad_norm": 
6.69197940826416, + "learning_rate": 9.692987934705465e-05, + "loss": 0.0895846426486969, + "step": 21640 + }, + { + "epoch": 3.0731014904187366, + "grad_norm": 1.1883106231689453, + "learning_rate": 9.692845990063876e-05, + "loss": 0.09830948114395141, + "step": 21650 + }, + { + "epoch": 3.0745209368346345, + "grad_norm": 1.1830201148986816, + "learning_rate": 9.692704045422286e-05, + "loss": 0.09011884927749633, + "step": 21660 + }, + { + "epoch": 3.0759403832505323, + "grad_norm": 1.0241851806640625, + "learning_rate": 9.692562100780696e-05, + "loss": 0.11997926235198975, + "step": 21670 + }, + { + "epoch": 3.07735982966643, + "grad_norm": 5.068016052246094, + "learning_rate": 9.692420156139105e-05, + "loss": 0.11507253646850586, + "step": 21680 + }, + { + "epoch": 3.078779276082328, + "grad_norm": 3.562347173690796, + "learning_rate": 9.692278211497517e-05, + "loss": 0.07022674679756165, + "step": 21690 + }, + { + "epoch": 3.080198722498226, + "grad_norm": 7.2673163414001465, + "learning_rate": 9.692136266855926e-05, + "loss": 0.09197630882263183, + "step": 21700 + }, + { + "epoch": 3.0816181689141233, + "grad_norm": 2.2533631324768066, + "learning_rate": 9.691994322214337e-05, + "loss": 0.05809432864189148, + "step": 21710 + }, + { + "epoch": 3.083037615330021, + "grad_norm": 5.0073561668396, + "learning_rate": 9.691852377572747e-05, + "loss": 0.10983726978302003, + "step": 21720 + }, + { + "epoch": 3.084457061745919, + "grad_norm": 8.21857738494873, + "learning_rate": 9.691710432931157e-05, + "loss": 0.06723290681838989, + "step": 21730 + }, + { + "epoch": 3.085876508161817, + "grad_norm": 9.05629825592041, + "learning_rate": 9.691568488289568e-05, + "loss": 0.05822429656982422, + "step": 21740 + }, + { + "epoch": 3.0872959545777148, + "grad_norm": 2.9089202880859375, + "learning_rate": 9.691426543647978e-05, + "loss": 0.062278813123703, + "step": 21750 + }, + { + "epoch": 3.0887154009936126, + "grad_norm": 5.445140838623047, + "learning_rate": 
9.691284599006389e-05, + "loss": 0.07242774367332458, + "step": 21760 + }, + { + "epoch": 3.0901348474095105, + "grad_norm": 5.643183708190918, + "learning_rate": 9.691142654364798e-05, + "loss": 0.17729694843292237, + "step": 21770 + }, + { + "epoch": 3.091554293825408, + "grad_norm": 1.2977749109268188, + "learning_rate": 9.691000709723208e-05, + "loss": 0.06676494479179382, + "step": 21780 + }, + { + "epoch": 3.092973740241306, + "grad_norm": 3.805422067642212, + "learning_rate": 9.690858765081618e-05, + "loss": 0.09304124712944031, + "step": 21790 + }, + { + "epoch": 3.0943931866572036, + "grad_norm": 6.814877510070801, + "learning_rate": 9.690716820440029e-05, + "loss": 0.08317658305168152, + "step": 21800 + }, + { + "epoch": 3.0958126330731015, + "grad_norm": 6.4380388259887695, + "learning_rate": 9.690574875798439e-05, + "loss": 0.11440763473510743, + "step": 21810 + }, + { + "epoch": 3.0972320794889994, + "grad_norm": 2.2712135314941406, + "learning_rate": 9.69043293115685e-05, + "loss": 0.05781182050704956, + "step": 21820 + }, + { + "epoch": 3.0986515259048972, + "grad_norm": 2.6996850967407227, + "learning_rate": 9.69029098651526e-05, + "loss": 0.09182395935058593, + "step": 21830 + }, + { + "epoch": 3.100070972320795, + "grad_norm": 3.8571221828460693, + "learning_rate": 9.69014904187367e-05, + "loss": 0.05620205998420715, + "step": 21840 + }, + { + "epoch": 3.1014904187366925, + "grad_norm": 2.1438169479370117, + "learning_rate": 9.69000709723208e-05, + "loss": 0.11742359399795532, + "step": 21850 + }, + { + "epoch": 3.1029098651525904, + "grad_norm": 0.5870881676673889, + "learning_rate": 9.68986515259049e-05, + "loss": 0.10411131381988525, + "step": 21860 + }, + { + "epoch": 3.1043293115684882, + "grad_norm": 3.8963239192962646, + "learning_rate": 9.689723207948901e-05, + "loss": 0.073959881067276, + "step": 21870 + }, + { + "epoch": 3.105748757984386, + "grad_norm": 1.869137167930603, + "learning_rate": 9.68958126330731e-05, + "loss": 
0.09284948706626892, + "step": 21880 + }, + { + "epoch": 3.107168204400284, + "grad_norm": 7.974472522735596, + "learning_rate": 9.689439318665721e-05, + "loss": 0.08199034929275513, + "step": 21890 + }, + { + "epoch": 3.108587650816182, + "grad_norm": 5.112462520599365, + "learning_rate": 9.68929737402413e-05, + "loss": 0.04500599205493927, + "step": 21900 + }, + { + "epoch": 3.1100070972320797, + "grad_norm": 4.712485313415527, + "learning_rate": 9.689155429382542e-05, + "loss": 0.08608510494232177, + "step": 21910 + }, + { + "epoch": 3.111426543647977, + "grad_norm": 4.643701553344727, + "learning_rate": 9.689013484740951e-05, + "loss": 0.06371254920959472, + "step": 21920 + }, + { + "epoch": 3.112845990063875, + "grad_norm": 0.6126397252082825, + "learning_rate": 9.688871540099361e-05, + "loss": 0.06569015383720397, + "step": 21930 + }, + { + "epoch": 3.114265436479773, + "grad_norm": 0.9692607522010803, + "learning_rate": 9.688729595457772e-05, + "loss": 0.04018869698047638, + "step": 21940 + }, + { + "epoch": 3.1156848828956707, + "grad_norm": 1.4925132989883423, + "learning_rate": 9.688587650816182e-05, + "loss": 0.12035884857177734, + "step": 21950 + }, + { + "epoch": 3.1171043293115686, + "grad_norm": 8.849794387817383, + "learning_rate": 9.688445706174593e-05, + "loss": 0.10423930883407592, + "step": 21960 + }, + { + "epoch": 3.1185237757274664, + "grad_norm": 0.555972158908844, + "learning_rate": 9.688303761533003e-05, + "loss": 0.036292347311973575, + "step": 21970 + }, + { + "epoch": 3.1199432221433643, + "grad_norm": 1.3053301572799683, + "learning_rate": 9.688161816891412e-05, + "loss": 0.055543911457061765, + "step": 21980 + }, + { + "epoch": 3.1213626685592617, + "grad_norm": 5.318549633026123, + "learning_rate": 9.688019872249822e-05, + "loss": 0.06087319850921631, + "step": 21990 + }, + { + "epoch": 3.1227821149751596, + "grad_norm": 1.2716312408447266, + "learning_rate": 9.687877927608233e-05, + "loss": 0.05343518257141113, + "step": 22000 + }, 
+ { + "epoch": 3.1227821149751596, + "eval_accuracy": 0.9604501812170153, + "eval_loss": 0.12624655663967133, + "eval_runtime": 31.6816, + "eval_samples_per_second": 496.408, + "eval_steps_per_second": 15.53, + "step": 22000 + }, + { + "epoch": 3.1242015613910574, + "grad_norm": 2.791890859603882, + "learning_rate": 9.687735982966643e-05, + "loss": 0.16994814872741698, + "step": 22010 + }, + { + "epoch": 3.1256210078069553, + "grad_norm": 0.727378249168396, + "learning_rate": 9.687594038325054e-05, + "loss": 0.09330202341079712, + "step": 22020 + }, + { + "epoch": 3.127040454222853, + "grad_norm": 2.6088101863861084, + "learning_rate": 9.687452093683464e-05, + "loss": 0.05271919369697571, + "step": 22030 + }, + { + "epoch": 3.128459900638751, + "grad_norm": 5.061529159545898, + "learning_rate": 9.687310149041874e-05, + "loss": 0.1032175898551941, + "step": 22040 + }, + { + "epoch": 3.129879347054649, + "grad_norm": 3.324045419692993, + "learning_rate": 9.687168204400285e-05, + "loss": 0.13030195236206055, + "step": 22050 + }, + { + "epoch": 3.1312987934705463, + "grad_norm": 2.8977231979370117, + "learning_rate": 9.687026259758694e-05, + "loss": 0.04515729248523712, + "step": 22060 + }, + { + "epoch": 3.132718239886444, + "grad_norm": 13.42546272277832, + "learning_rate": 9.686884315117106e-05, + "loss": 0.16047141551971436, + "step": 22070 + }, + { + "epoch": 3.134137686302342, + "grad_norm": 8.009624481201172, + "learning_rate": 9.686742370475514e-05, + "loss": 0.07332398891448974, + "step": 22080 + }, + { + "epoch": 3.13555713271824, + "grad_norm": 1.6250791549682617, + "learning_rate": 9.686600425833925e-05, + "loss": 0.08664785027503967, + "step": 22090 + }, + { + "epoch": 3.1369765791341377, + "grad_norm": 3.961372137069702, + "learning_rate": 9.686458481192335e-05, + "loss": 0.05184776782989502, + "step": 22100 + }, + { + "epoch": 3.1383960255500356, + "grad_norm": 3.3162078857421875, + "learning_rate": 9.686316536550746e-05, + "loss": 0.14172728061676027, + 
"step": 22110 + }, + { + "epoch": 3.1398154719659335, + "grad_norm": 2.8545219898223877, + "learning_rate": 9.686174591909156e-05, + "loss": 0.12487195730209351, + "step": 22120 + }, + { + "epoch": 3.141234918381831, + "grad_norm": 5.991825580596924, + "learning_rate": 9.686032647267567e-05, + "loss": 0.09468575716018676, + "step": 22130 + }, + { + "epoch": 3.1426543647977287, + "grad_norm": 3.7277402877807617, + "learning_rate": 9.685890702625976e-05, + "loss": 0.15779935121536254, + "step": 22140 + }, + { + "epoch": 3.1440738112136266, + "grad_norm": 5.867143630981445, + "learning_rate": 9.685748757984386e-05, + "loss": 0.06446941494941712, + "step": 22150 + }, + { + "epoch": 3.1454932576295245, + "grad_norm": 0.9702675342559814, + "learning_rate": 9.685606813342797e-05, + "loss": 0.10171631574630738, + "step": 22160 + }, + { + "epoch": 3.1469127040454223, + "grad_norm": 12.031753540039062, + "learning_rate": 9.685464868701207e-05, + "loss": 0.1400713086128235, + "step": 22170 + }, + { + "epoch": 3.14833215046132, + "grad_norm": 3.781707525253296, + "learning_rate": 9.685322924059618e-05, + "loss": 0.05259775519371033, + "step": 22180 + }, + { + "epoch": 3.149751596877218, + "grad_norm": 4.4153642654418945, + "learning_rate": 9.685180979418026e-05, + "loss": 0.10050948858261108, + "step": 22190 + }, + { + "epoch": 3.1511710432931155, + "grad_norm": 2.492379665374756, + "learning_rate": 9.685039034776438e-05, + "loss": 0.13373640775680543, + "step": 22200 + }, + { + "epoch": 3.1525904897090133, + "grad_norm": 8.212589263916016, + "learning_rate": 9.684897090134847e-05, + "loss": 0.0804680585861206, + "step": 22210 + }, + { + "epoch": 3.154009936124911, + "grad_norm": 7.918879508972168, + "learning_rate": 9.684755145493258e-05, + "loss": 0.04239166975021362, + "step": 22220 + }, + { + "epoch": 3.155429382540809, + "grad_norm": 0.38615530729293823, + "learning_rate": 9.68461320085167e-05, + "loss": 0.07814024686813355, + "step": 22230 + }, + { + "epoch": 
3.156848828956707, + "grad_norm": 6.945682048797607, + "learning_rate": 9.684471256210078e-05, + "loss": 0.11140685081481934, + "step": 22240 + }, + { + "epoch": 3.158268275372605, + "grad_norm": 5.574148654937744, + "learning_rate": 9.684329311568489e-05, + "loss": 0.12524588108062745, + "step": 22250 + }, + { + "epoch": 3.1596877217885027, + "grad_norm": 2.4712400436401367, + "learning_rate": 9.684187366926899e-05, + "loss": 0.06859158277511597, + "step": 22260 + }, + { + "epoch": 3.1611071682044, + "grad_norm": 11.472119331359863, + "learning_rate": 9.68404542228531e-05, + "loss": 0.07999058961868286, + "step": 22270 + }, + { + "epoch": 3.162526614620298, + "grad_norm": 0.743500828742981, + "learning_rate": 9.68390347764372e-05, + "loss": 0.05272719860076904, + "step": 22280 + }, + { + "epoch": 3.163946061036196, + "grad_norm": 3.228672742843628, + "learning_rate": 9.683761533002129e-05, + "loss": 0.09461968541145324, + "step": 22290 + }, + { + "epoch": 3.1653655074520937, + "grad_norm": 9.705907821655273, + "learning_rate": 9.683619588360539e-05, + "loss": 0.08296184539794922, + "step": 22300 + }, + { + "epoch": 3.1667849538679915, + "grad_norm": 5.514443397521973, + "learning_rate": 9.68347764371895e-05, + "loss": 0.08486506342887878, + "step": 22310 + }, + { + "epoch": 3.1682044002838894, + "grad_norm": 10.679105758666992, + "learning_rate": 9.683335699077361e-05, + "loss": 0.1270732879638672, + "step": 22320 + }, + { + "epoch": 3.1696238466997873, + "grad_norm": 6.348006725311279, + "learning_rate": 9.683193754435771e-05, + "loss": 0.09326770305633544, + "step": 22330 + }, + { + "epoch": 3.1710432931156847, + "grad_norm": 0.7028082609176636, + "learning_rate": 9.68305180979418e-05, + "loss": 0.057895565032958986, + "step": 22340 + }, + { + "epoch": 3.1724627395315825, + "grad_norm": 2.103309392929077, + "learning_rate": 9.68290986515259e-05, + "loss": 0.08313475251197815, + "step": 22350 + }, + { + "epoch": 3.1738821859474804, + "grad_norm": 
1.7693034410476685, + "learning_rate": 9.682767920511001e-05, + "loss": 0.07178552150726318, + "step": 22360 + }, + { + "epoch": 3.1753016323633783, + "grad_norm": 1.420407772064209, + "learning_rate": 9.682625975869411e-05, + "loss": 0.1434171199798584, + "step": 22370 + }, + { + "epoch": 3.176721078779276, + "grad_norm": 2.847599744796753, + "learning_rate": 9.682484031227822e-05, + "loss": 0.06267567276954651, + "step": 22380 + }, + { + "epoch": 3.178140525195174, + "grad_norm": 2.813729763031006, + "learning_rate": 9.68234208658623e-05, + "loss": 0.07424157261848449, + "step": 22390 + }, + { + "epoch": 3.179559971611072, + "grad_norm": 7.473203182220459, + "learning_rate": 9.682200141944642e-05, + "loss": 0.11200079917907715, + "step": 22400 + }, + { + "epoch": 3.1809794180269693, + "grad_norm": 6.4801177978515625, + "learning_rate": 9.682058197303053e-05, + "loss": 0.13543713092803955, + "step": 22410 + }, + { + "epoch": 3.182398864442867, + "grad_norm": 3.577303409576416, + "learning_rate": 9.681916252661463e-05, + "loss": 0.11488020420074463, + "step": 22420 + }, + { + "epoch": 3.183818310858765, + "grad_norm": 4.844555377960205, + "learning_rate": 9.681774308019874e-05, + "loss": 0.03927421867847443, + "step": 22430 + }, + { + "epoch": 3.185237757274663, + "grad_norm": 1.6158503293991089, + "learning_rate": 9.681632363378282e-05, + "loss": 0.09847801327705383, + "step": 22440 + }, + { + "epoch": 3.1866572036905607, + "grad_norm": 12.733912467956543, + "learning_rate": 9.681490418736693e-05, + "loss": 0.08998562097549438, + "step": 22450 + }, + { + "epoch": 3.1880766501064586, + "grad_norm": 0.760240912437439, + "learning_rate": 9.681348474095103e-05, + "loss": 0.07409574389457703, + "step": 22460 + }, + { + "epoch": 3.1894960965223564, + "grad_norm": 2.920081377029419, + "learning_rate": 9.681206529453514e-05, + "loss": 0.11183276176452636, + "step": 22470 + }, + { + "epoch": 3.190915542938254, + "grad_norm": 4.768205165863037, + "learning_rate": 
9.681064584811924e-05, + "loss": 0.07697643041610717, + "step": 22480 + }, + { + "epoch": 3.1923349893541517, + "grad_norm": 3.8446145057678223, + "learning_rate": 9.680922640170335e-05, + "loss": 0.068821781873703, + "step": 22490 + }, + { + "epoch": 3.1937544357700496, + "grad_norm": 8.481558799743652, + "learning_rate": 9.680780695528745e-05, + "loss": 0.09039323329925537, + "step": 22500 + }, + { + "epoch": 3.1937544357700496, + "eval_accuracy": 0.954791123545495, + "eval_loss": 0.1401221603155136, + "eval_runtime": 31.488, + "eval_samples_per_second": 499.46, + "eval_steps_per_second": 15.625, + "step": 22500 + }, + { + "epoch": 3.1951738821859474, + "grad_norm": 5.633203983306885, + "learning_rate": 9.680638750887154e-05, + "loss": 0.07210381031036377, + "step": 22510 + }, + { + "epoch": 3.1965933286018453, + "grad_norm": 1.863991379737854, + "learning_rate": 9.680496806245565e-05, + "loss": 0.0704656958580017, + "step": 22520 + }, + { + "epoch": 3.198012775017743, + "grad_norm": 0.9419695734977722, + "learning_rate": 9.680354861603975e-05, + "loss": 0.08578440546989441, + "step": 22530 + }, + { + "epoch": 3.199432221433641, + "grad_norm": 9.0354642868042, + "learning_rate": 9.680212916962386e-05, + "loss": 0.10872071981430054, + "step": 22540 + }, + { + "epoch": 3.2008516678495385, + "grad_norm": 3.955871820449829, + "learning_rate": 9.680070972320795e-05, + "loss": 0.05301453471183777, + "step": 22550 + }, + { + "epoch": 3.2022711142654363, + "grad_norm": 9.719240188598633, + "learning_rate": 9.679929027679206e-05, + "loss": 0.1132009506225586, + "step": 22560 + }, + { + "epoch": 3.203690560681334, + "grad_norm": 8.175822257995605, + "learning_rate": 9.679787083037615e-05, + "loss": 0.03667646646499634, + "step": 22570 + }, + { + "epoch": 3.205110007097232, + "grad_norm": 1.2011351585388184, + "learning_rate": 9.679645138396027e-05, + "loss": 0.06343533992767333, + "step": 22580 + }, + { + "epoch": 3.20652945351313, + "grad_norm": 2.916196823120117, + 
"learning_rate": 9.679503193754436e-05, + "loss": 0.054550164937973024, + "step": 22590 + }, + { + "epoch": 3.2079488999290278, + "grad_norm": 11.839608192443848, + "learning_rate": 9.679361249112846e-05, + "loss": 0.11105455160140991, + "step": 22600 + }, + { + "epoch": 3.2093683463449256, + "grad_norm": 5.120648384094238, + "learning_rate": 9.679219304471257e-05, + "loss": 0.04551963210105896, + "step": 22610 + }, + { + "epoch": 3.210787792760823, + "grad_norm": 2.64894437789917, + "learning_rate": 9.679077359829667e-05, + "loss": 0.07367442846298218, + "step": 22620 + }, + { + "epoch": 3.212207239176721, + "grad_norm": 7.870187759399414, + "learning_rate": 9.678935415188078e-05, + "loss": 0.12482872009277343, + "step": 22630 + }, + { + "epoch": 3.2136266855926188, + "grad_norm": 1.49652898311615, + "learning_rate": 9.678793470546488e-05, + "loss": 0.11122183799743653, + "step": 22640 + }, + { + "epoch": 3.2150461320085166, + "grad_norm": 2.1385059356689453, + "learning_rate": 9.678651525904897e-05, + "loss": 0.08030745387077332, + "step": 22650 + }, + { + "epoch": 3.2164655784244145, + "grad_norm": 5.634016036987305, + "learning_rate": 9.678509581263307e-05, + "loss": 0.135706627368927, + "step": 22660 + }, + { + "epoch": 3.2178850248403124, + "grad_norm": 7.30700159072876, + "learning_rate": 9.678367636621718e-05, + "loss": 0.09824522137641907, + "step": 22670 + }, + { + "epoch": 3.21930447125621, + "grad_norm": 3.9598324298858643, + "learning_rate": 9.678225691980128e-05, + "loss": 0.0592613160610199, + "step": 22680 + }, + { + "epoch": 3.220723917672108, + "grad_norm": 3.0672085285186768, + "learning_rate": 9.678083747338539e-05, + "loss": 0.10512404441833496, + "step": 22690 + }, + { + "epoch": 3.2221433640880055, + "grad_norm": 3.729863405227661, + "learning_rate": 9.677941802696949e-05, + "loss": 0.08016419410705566, + "step": 22700 + }, + { + "epoch": 3.2235628105039034, + "grad_norm": 2.7525126934051514, + "learning_rate": 9.677799858055359e-05, + 
"loss": 0.062538743019104, + "step": 22710 + }, + { + "epoch": 3.2249822569198012, + "grad_norm": 2.103010892868042, + "learning_rate": 9.67765791341377e-05, + "loss": 0.07154433131217956, + "step": 22720 + }, + { + "epoch": 3.226401703335699, + "grad_norm": 1.3044795989990234, + "learning_rate": 9.67751596877218e-05, + "loss": 0.04868173897266388, + "step": 22730 + }, + { + "epoch": 3.227821149751597, + "grad_norm": 0.34033793210983276, + "learning_rate": 9.67737402413059e-05, + "loss": 0.06057687401771546, + "step": 22740 + }, + { + "epoch": 3.229240596167495, + "grad_norm": 14.895809173583984, + "learning_rate": 9.677232079488999e-05, + "loss": 0.14414306879043579, + "step": 22750 + }, + { + "epoch": 3.2306600425833927, + "grad_norm": 2.03631329536438, + "learning_rate": 9.67709013484741e-05, + "loss": 0.03532655239105224, + "step": 22760 + }, + { + "epoch": 3.23207948899929, + "grad_norm": 1.9289063215255737, + "learning_rate": 9.67694819020582e-05, + "loss": 0.04410083889961243, + "step": 22770 + }, + { + "epoch": 3.233498935415188, + "grad_norm": 8.339526176452637, + "learning_rate": 9.676806245564231e-05, + "loss": 0.07176212072372437, + "step": 22780 + }, + { + "epoch": 3.234918381831086, + "grad_norm": 6.541379928588867, + "learning_rate": 9.67666430092264e-05, + "loss": 0.08053820133209229, + "step": 22790 + }, + { + "epoch": 3.2363378282469837, + "grad_norm": 3.6586859226226807, + "learning_rate": 9.67652235628105e-05, + "loss": 0.04074668884277344, + "step": 22800 + }, + { + "epoch": 3.2377572746628815, + "grad_norm": 0.39181602001190186, + "learning_rate": 9.676380411639461e-05, + "loss": 0.06584768891334533, + "step": 22810 + }, + { + "epoch": 3.2391767210787794, + "grad_norm": 4.53519868850708, + "learning_rate": 9.676238466997871e-05, + "loss": 0.10924329757690429, + "step": 22820 + }, + { + "epoch": 3.2405961674946773, + "grad_norm": 5.562971591949463, + "learning_rate": 9.676096522356282e-05, + "loss": 0.12216780185699463, + "step": 22830 + }, + { 
+ "epoch": 3.2420156139105747, + "grad_norm": 9.106098175048828, + "learning_rate": 9.675954577714692e-05, + "loss": 0.09589399695396424, + "step": 22840 + }, + { + "epoch": 3.2434350603264726, + "grad_norm": 8.574522972106934, + "learning_rate": 9.675812633073103e-05, + "loss": 0.0823745608329773, + "step": 22850 + }, + { + "epoch": 3.2448545067423704, + "grad_norm": 8.706705093383789, + "learning_rate": 9.675670688431511e-05, + "loss": 0.14104554653167725, + "step": 22860 + }, + { + "epoch": 3.2462739531582683, + "grad_norm": 8.810419082641602, + "learning_rate": 9.675528743789922e-05, + "loss": 0.05990390777587891, + "step": 22870 + }, + { + "epoch": 3.247693399574166, + "grad_norm": 4.165992736816406, + "learning_rate": 9.675386799148332e-05, + "loss": 0.0668636441230774, + "step": 22880 + }, + { + "epoch": 3.249112845990064, + "grad_norm": 9.099569320678711, + "learning_rate": 9.675244854506743e-05, + "loss": 0.06936246156692505, + "step": 22890 + }, + { + "epoch": 3.250532292405962, + "grad_norm": 4.4353132247924805, + "learning_rate": 9.675102909865153e-05, + "loss": 0.06273015737533569, + "step": 22900 + }, + { + "epoch": 3.2519517388218593, + "grad_norm": 1.2650339603424072, + "learning_rate": 9.674960965223563e-05, + "loss": 0.06168818473815918, + "step": 22910 + }, + { + "epoch": 3.253371185237757, + "grad_norm": 4.567782402038574, + "learning_rate": 9.674819020581974e-05, + "loss": 0.10136575698852539, + "step": 22920 + }, + { + "epoch": 3.254790631653655, + "grad_norm": 6.448585510253906, + "learning_rate": 9.674677075940384e-05, + "loss": 0.07393231987953186, + "step": 22930 + }, + { + "epoch": 3.256210078069553, + "grad_norm": 10.017446517944336, + "learning_rate": 9.674535131298795e-05, + "loss": 0.10242644548416138, + "step": 22940 + }, + { + "epoch": 3.2576295244854507, + "grad_norm": 3.191063404083252, + "learning_rate": 9.674393186657204e-05, + "loss": 0.047987133264541626, + "step": 22950 + }, + { + "epoch": 3.2590489709013486, + "grad_norm": 
3.556180477142334, + "learning_rate": 9.674251242015614e-05, + "loss": 0.047191986441612245, + "step": 22960 + }, + { + "epoch": 3.2604684173172465, + "grad_norm": 1.7208983898162842, + "learning_rate": 9.674109297374024e-05, + "loss": 0.08717820644378663, + "step": 22970 + }, + { + "epoch": 3.2618878637331443, + "grad_norm": 5.613543510437012, + "learning_rate": 9.673967352732435e-05, + "loss": 0.11286189556121826, + "step": 22980 + }, + { + "epoch": 3.2633073101490417, + "grad_norm": 5.163478374481201, + "learning_rate": 9.673825408090845e-05, + "loss": 0.11744798421859741, + "step": 22990 + }, + { + "epoch": 3.2647267565649396, + "grad_norm": 3.8311023712158203, + "learning_rate": 9.673683463449256e-05, + "loss": 0.0839583694934845, + "step": 23000 + }, + { + "epoch": 3.2647267565649396, + "eval_accuracy": 0.9688433903478095, + "eval_loss": 0.09206999838352203, + "eval_runtime": 32.6805, + "eval_samples_per_second": 481.235, + "eval_steps_per_second": 15.055, + "step": 23000 + }, + { + "epoch": 3.2661462029808375, + "grad_norm": 6.961423873901367, + "learning_rate": 9.673541518807666e-05, + "loss": 0.060645246505737306, + "step": 23010 + }, + { + "epoch": 3.2675656493967353, + "grad_norm": 4.491827011108398, + "learning_rate": 9.673399574166075e-05, + "loss": 0.060946452617645266, + "step": 23020 + }, + { + "epoch": 3.268985095812633, + "grad_norm": 8.529021263122559, + "learning_rate": 9.673257629524486e-05, + "loss": 0.0623835563659668, + "step": 23030 + }, + { + "epoch": 3.270404542228531, + "grad_norm": 7.560174942016602, + "learning_rate": 9.673115684882896e-05, + "loss": 0.05246782898902893, + "step": 23040 + }, + { + "epoch": 3.271823988644429, + "grad_norm": 5.852350234985352, + "learning_rate": 9.672973740241307e-05, + "loss": 0.1177408218383789, + "step": 23050 + }, + { + "epoch": 3.2732434350603263, + "grad_norm": 2.9898064136505127, + "learning_rate": 9.672831795599716e-05, + "loss": 0.10251556634902954, + "step": 23060 + }, + { + "epoch": 
3.274662881476224, + "grad_norm": 0.7350359559059143, + "learning_rate": 9.672689850958127e-05, + "loss": 0.08793265223503113, + "step": 23070 + }, + { + "epoch": 3.276082327892122, + "grad_norm": 7.976613998413086, + "learning_rate": 9.672547906316536e-05, + "loss": 0.11746323108673096, + "step": 23080 + }, + { + "epoch": 3.27750177430802, + "grad_norm": 5.30941915512085, + "learning_rate": 9.672405961674948e-05, + "loss": 0.1818032145500183, + "step": 23090 + }, + { + "epoch": 3.278921220723918, + "grad_norm": 4.999229907989502, + "learning_rate": 9.672264017033357e-05, + "loss": 0.05894123911857605, + "step": 23100 + }, + { + "epoch": 3.2803406671398156, + "grad_norm": 5.794082164764404, + "learning_rate": 9.672122072391767e-05, + "loss": 0.081751549243927, + "step": 23110 + }, + { + "epoch": 3.2817601135557135, + "grad_norm": 2.565143346786499, + "learning_rate": 9.671980127750178e-05, + "loss": 0.07524069547653198, + "step": 23120 + }, + { + "epoch": 3.283179559971611, + "grad_norm": 4.894937038421631, + "learning_rate": 9.671838183108588e-05, + "loss": 0.12828075885772705, + "step": 23130 + }, + { + "epoch": 3.284599006387509, + "grad_norm": 6.212746620178223, + "learning_rate": 9.671696238466999e-05, + "loss": 0.1400521755218506, + "step": 23140 + }, + { + "epoch": 3.2860184528034067, + "grad_norm": 4.2761921882629395, + "learning_rate": 9.671554293825409e-05, + "loss": 0.09644685983657837, + "step": 23150 + }, + { + "epoch": 3.2874378992193045, + "grad_norm": 16.000354766845703, + "learning_rate": 9.671412349183818e-05, + "loss": 0.11152185201644897, + "step": 23160 + }, + { + "epoch": 3.2888573456352024, + "grad_norm": 6.135869026184082, + "learning_rate": 9.671270404542228e-05, + "loss": 0.0771405816078186, + "step": 23170 + }, + { + "epoch": 3.2902767920511002, + "grad_norm": 1.4707847833633423, + "learning_rate": 9.671128459900639e-05, + "loss": 0.09533407092094422, + "step": 23180 + }, + { + "epoch": 3.291696238466998, + "grad_norm": 
1.1678895950317383, + "learning_rate": 9.670986515259049e-05, + "loss": 0.09652703404426574, + "step": 23190 + }, + { + "epoch": 3.2931156848828955, + "grad_norm": 3.4155921936035156, + "learning_rate": 9.67084457061746e-05, + "loss": 0.04331456124782562, + "step": 23200 + }, + { + "epoch": 3.2945351312987934, + "grad_norm": 3.263784408569336, + "learning_rate": 9.67070262597587e-05, + "loss": 0.12196718454360962, + "step": 23210 + }, + { + "epoch": 3.2959545777146912, + "grad_norm": 0.8338903188705444, + "learning_rate": 9.67056068133428e-05, + "loss": 0.08930212855339051, + "step": 23220 + }, + { + "epoch": 3.297374024130589, + "grad_norm": 2.3964731693267822, + "learning_rate": 9.67041873669269e-05, + "loss": 0.061741960048675534, + "step": 23230 + }, + { + "epoch": 3.298793470546487, + "grad_norm": 9.600022315979004, + "learning_rate": 9.6702767920511e-05, + "loss": 0.1284554719924927, + "step": 23240 + }, + { + "epoch": 3.300212916962385, + "grad_norm": 1.0024387836456299, + "learning_rate": 9.670134847409511e-05, + "loss": 0.09108211994171142, + "step": 23250 + }, + { + "epoch": 3.3016323633782827, + "grad_norm": 4.58043098449707, + "learning_rate": 9.669992902767921e-05, + "loss": 0.10650498867034912, + "step": 23260 + }, + { + "epoch": 3.30305180979418, + "grad_norm": 3.778592824935913, + "learning_rate": 9.669850958126331e-05, + "loss": 0.0809212327003479, + "step": 23270 + }, + { + "epoch": 3.304471256210078, + "grad_norm": 2.984292984008789, + "learning_rate": 9.669709013484741e-05, + "loss": 0.0674120306968689, + "step": 23280 + }, + { + "epoch": 3.305890702625976, + "grad_norm": 2.295304298400879, + "learning_rate": 9.669567068843152e-05, + "loss": 0.04605483114719391, + "step": 23290 + }, + { + "epoch": 3.3073101490418737, + "grad_norm": 5.067991256713867, + "learning_rate": 9.669425124201562e-05, + "loss": 0.07464765906333923, + "step": 23300 + }, + { + "epoch": 3.3087295954577716, + "grad_norm": 0.5175068974494934, + "learning_rate": 
9.669283179559973e-05, + "loss": 0.10126523971557617, + "step": 23310 + }, + { + "epoch": 3.3101490418736694, + "grad_norm": 0.7718493938446045, + "learning_rate": 9.669141234918382e-05, + "loss": 0.1306004047393799, + "step": 23320 + }, + { + "epoch": 3.3115684882895673, + "grad_norm": 0.4733130931854248, + "learning_rate": 9.668999290276792e-05, + "loss": 0.07524165511131287, + "step": 23330 + }, + { + "epoch": 3.3129879347054647, + "grad_norm": 1.91227388381958, + "learning_rate": 9.668857345635203e-05, + "loss": 0.10234876871109008, + "step": 23340 + }, + { + "epoch": 3.3144073811213626, + "grad_norm": 3.8604981899261475, + "learning_rate": 9.668715400993613e-05, + "loss": 0.08232152462005615, + "step": 23350 + }, + { + "epoch": 3.3158268275372604, + "grad_norm": 4.264747619628906, + "learning_rate": 9.668573456352024e-05, + "loss": 0.08970657587051392, + "step": 23360 + }, + { + "epoch": 3.3172462739531583, + "grad_norm": 8.413162231445312, + "learning_rate": 9.668431511710432e-05, + "loss": 0.0798837423324585, + "step": 23370 + }, + { + "epoch": 3.318665720369056, + "grad_norm": 6.562158107757568, + "learning_rate": 9.668289567068843e-05, + "loss": 0.1796337842941284, + "step": 23380 + }, + { + "epoch": 3.320085166784954, + "grad_norm": 6.798343658447266, + "learning_rate": 9.668147622427253e-05, + "loss": 0.13204431533813477, + "step": 23390 + }, + { + "epoch": 3.321504613200852, + "grad_norm": 7.170462131500244, + "learning_rate": 9.668005677785664e-05, + "loss": 0.082490473985672, + "step": 23400 + }, + { + "epoch": 3.3229240596167493, + "grad_norm": 1.1640955209732056, + "learning_rate": 9.667863733144074e-05, + "loss": 0.11552011966705322, + "step": 23410 + }, + { + "epoch": 3.324343506032647, + "grad_norm": 3.5345652103424072, + "learning_rate": 9.667721788502484e-05, + "loss": 0.07584733963012695, + "step": 23420 + }, + { + "epoch": 3.325762952448545, + "grad_norm": 1.844787836074829, + "learning_rate": 9.667579843860895e-05, + "loss": 
0.09344690442085266, + "step": 23430 + }, + { + "epoch": 3.327182398864443, + "grad_norm": 2.403691053390503, + "learning_rate": 9.667437899219305e-05, + "loss": 0.057882833480834964, + "step": 23440 + }, + { + "epoch": 3.3286018452803408, + "grad_norm": 2.586052894592285, + "learning_rate": 9.667295954577716e-05, + "loss": 0.07656934261322021, + "step": 23450 + }, + { + "epoch": 3.3300212916962386, + "grad_norm": 0.33396223187446594, + "learning_rate": 9.667154009936125e-05, + "loss": 0.08143852353096008, + "step": 23460 + }, + { + "epoch": 3.3314407381121365, + "grad_norm": 0.9797456860542297, + "learning_rate": 9.667012065294535e-05, + "loss": 0.032908812165260315, + "step": 23470 + }, + { + "epoch": 3.332860184528034, + "grad_norm": 0.3462522625923157, + "learning_rate": 9.666870120652945e-05, + "loss": 0.05224289894104004, + "step": 23480 + }, + { + "epoch": 3.3342796309439318, + "grad_norm": 5.588517189025879, + "learning_rate": 9.666728176011356e-05, + "loss": 0.08177621364593506, + "step": 23490 + }, + { + "epoch": 3.3356990773598296, + "grad_norm": 6.037621021270752, + "learning_rate": 9.666586231369766e-05, + "loss": 0.06431897282600403, + "step": 23500 + }, + { + "epoch": 3.3356990773598296, + "eval_accuracy": 0.9582247090990017, + "eval_loss": 0.1211514100432396, + "eval_runtime": 32.6171, + "eval_samples_per_second": 482.17, + "eval_steps_per_second": 15.084, + "step": 23500 + }, + { + "epoch": 3.3371185237757275, + "grad_norm": 4.2738142013549805, + "learning_rate": 9.666444286728177e-05, + "loss": 0.07732362151145936, + "step": 23510 + }, + { + "epoch": 3.3385379701916253, + "grad_norm": 5.357970237731934, + "learning_rate": 9.666302342086587e-05, + "loss": 0.057775235176086424, + "step": 23520 + }, + { + "epoch": 3.339957416607523, + "grad_norm": 2.4043660163879395, + "learning_rate": 9.666160397444996e-05, + "loss": 0.10017684698104859, + "step": 23530 + }, + { + "epoch": 3.341376863023421, + "grad_norm": 7.4561381340026855, + "learning_rate": 
9.666018452803407e-05, + "loss": 0.14003334045410157, + "step": 23540 + }, + { + "epoch": 3.3427963094393185, + "grad_norm": 2.9771358966827393, + "learning_rate": 9.665876508161817e-05, + "loss": 0.11144789457321166, + "step": 23550 + }, + { + "epoch": 3.3442157558552164, + "grad_norm": 5.861306190490723, + "learning_rate": 9.665734563520228e-05, + "loss": 0.1083723783493042, + "step": 23560 + }, + { + "epoch": 3.345635202271114, + "grad_norm": 1.0332176685333252, + "learning_rate": 9.665592618878638e-05, + "loss": 0.08513032793998718, + "step": 23570 + }, + { + "epoch": 3.347054648687012, + "grad_norm": 6.1437177658081055, + "learning_rate": 9.665450674237048e-05, + "loss": 0.08397155404090881, + "step": 23580 + }, + { + "epoch": 3.34847409510291, + "grad_norm": 4.794635772705078, + "learning_rate": 9.665308729595457e-05, + "loss": 0.042923647165298465, + "step": 23590 + }, + { + "epoch": 3.349893541518808, + "grad_norm": 3.806190252304077, + "learning_rate": 9.665166784953869e-05, + "loss": 0.08098719120025635, + "step": 23600 + }, + { + "epoch": 3.3513129879347057, + "grad_norm": 0.2237672656774521, + "learning_rate": 9.665024840312278e-05, + "loss": 0.07011445760726928, + "step": 23610 + }, + { + "epoch": 3.352732434350603, + "grad_norm": 3.0982532501220703, + "learning_rate": 9.66488289567069e-05, + "loss": 0.061842381954193115, + "step": 23620 + }, + { + "epoch": 3.354151880766501, + "grad_norm": 2.976536512374878, + "learning_rate": 9.664740951029099e-05, + "loss": 0.10006380081176758, + "step": 23630 + }, + { + "epoch": 3.355571327182399, + "grad_norm": 4.319900035858154, + "learning_rate": 9.664599006387509e-05, + "loss": 0.13653013706207276, + "step": 23640 + }, + { + "epoch": 3.3569907735982967, + "grad_norm": 1.9102489948272705, + "learning_rate": 9.66445706174592e-05, + "loss": 0.038610780239105226, + "step": 23650 + }, + { + "epoch": 3.3584102200141945, + "grad_norm": 6.633970737457275, + "learning_rate": 9.66431511710433e-05, + "loss": 
0.06831348538398743, + "step": 23660 + }, + { + "epoch": 3.3598296664300924, + "grad_norm": 1.1184673309326172, + "learning_rate": 9.664173172462741e-05, + "loss": 0.06864879727363586, + "step": 23670 + }, + { + "epoch": 3.3612491128459903, + "grad_norm": 0.8485651612281799, + "learning_rate": 9.664031227821149e-05, + "loss": 0.08388459086418151, + "step": 23680 + }, + { + "epoch": 3.3626685592618877, + "grad_norm": 1.4212796688079834, + "learning_rate": 9.66388928317956e-05, + "loss": 0.20324900150299072, + "step": 23690 + }, + { + "epoch": 3.3640880056777855, + "grad_norm": 0.2244710922241211, + "learning_rate": 9.66374733853797e-05, + "loss": 0.07268852591514588, + "step": 23700 + }, + { + "epoch": 3.3655074520936834, + "grad_norm": 0.2561863660812378, + "learning_rate": 9.663605393896381e-05, + "loss": 0.036457425355911253, + "step": 23710 + }, + { + "epoch": 3.3669268985095813, + "grad_norm": 2.078640937805176, + "learning_rate": 9.663463449254792e-05, + "loss": 0.07209231853485107, + "step": 23720 + }, + { + "epoch": 3.368346344925479, + "grad_norm": 4.892085552215576, + "learning_rate": 9.6633215046132e-05, + "loss": 0.1211774468421936, + "step": 23730 + }, + { + "epoch": 3.369765791341377, + "grad_norm": 1.651289939880371, + "learning_rate": 9.663179559971612e-05, + "loss": 0.08962616324424744, + "step": 23740 + }, + { + "epoch": 3.371185237757275, + "grad_norm": 1.4341058731079102, + "learning_rate": 9.663037615330021e-05, + "loss": 0.06757261753082275, + "step": 23750 + }, + { + "epoch": 3.3726046841731723, + "grad_norm": 0.5684829354286194, + "learning_rate": 9.662895670688432e-05, + "loss": 0.04020809531211853, + "step": 23760 + }, + { + "epoch": 3.37402413058907, + "grad_norm": 2.886730194091797, + "learning_rate": 9.662753726046842e-05, + "loss": 0.07528796195983886, + "step": 23770 + }, + { + "epoch": 3.375443577004968, + "grad_norm": 7.543295383453369, + "learning_rate": 9.662611781405252e-05, + "loss": 0.2501710891723633, + "step": 23780 + }, + { + 
"epoch": 3.376863023420866, + "grad_norm": 6.99386739730835, + "learning_rate": 9.662469836763662e-05, + "loss": 0.07813713550567628, + "step": 23790 + }, + { + "epoch": 3.3782824698367637, + "grad_norm": 6.142605781555176, + "learning_rate": 9.662327892122073e-05, + "loss": 0.0971024513244629, + "step": 23800 + }, + { + "epoch": 3.3797019162526616, + "grad_norm": 5.846232891082764, + "learning_rate": 9.662185947480484e-05, + "loss": 0.11326665878295898, + "step": 23810 + }, + { + "epoch": 3.3811213626685594, + "grad_norm": 3.8466222286224365, + "learning_rate": 9.662044002838894e-05, + "loss": 0.09037129282951355, + "step": 23820 + }, + { + "epoch": 3.382540809084457, + "grad_norm": 1.8509072065353394, + "learning_rate": 9.661902058197303e-05, + "loss": 0.08938190340995789, + "step": 23830 + }, + { + "epoch": 3.3839602555003547, + "grad_norm": 8.372735023498535, + "learning_rate": 9.661760113555713e-05, + "loss": 0.10955497026443481, + "step": 23840 + }, + { + "epoch": 3.3853797019162526, + "grad_norm": 10.327803611755371, + "learning_rate": 9.661618168914124e-05, + "loss": 0.10603039264678955, + "step": 23850 + }, + { + "epoch": 3.3867991483321505, + "grad_norm": 2.4464328289031982, + "learning_rate": 9.661476224272534e-05, + "loss": 0.07022827863693237, + "step": 23860 + }, + { + "epoch": 3.3882185947480483, + "grad_norm": 4.964604377746582, + "learning_rate": 9.661334279630945e-05, + "loss": 0.10754181146621704, + "step": 23870 + }, + { + "epoch": 3.389638041163946, + "grad_norm": 2.0936126708984375, + "learning_rate": 9.661192334989355e-05, + "loss": 0.07387771010398865, + "step": 23880 + }, + { + "epoch": 3.391057487579844, + "grad_norm": 1.5606902837753296, + "learning_rate": 9.661050390347764e-05, + "loss": 0.06499841809272766, + "step": 23890 + }, + { + "epoch": 3.3924769339957415, + "grad_norm": 0.09581028670072556, + "learning_rate": 9.660908445706176e-05, + "loss": 0.09357624053955078, + "step": 23900 + }, + { + "epoch": 3.3938963804116393, + 
"grad_norm": 2.011545181274414, + "learning_rate": 9.660766501064585e-05, + "loss": 0.04169844388961792, + "step": 23910 + }, + { + "epoch": 3.395315826827537, + "grad_norm": 0.6940661668777466, + "learning_rate": 9.660624556422996e-05, + "loss": 0.05995774269104004, + "step": 23920 + }, + { + "epoch": 3.396735273243435, + "grad_norm": 2.8684120178222656, + "learning_rate": 9.660482611781406e-05, + "loss": 0.05829171538352966, + "step": 23930 + }, + { + "epoch": 3.398154719659333, + "grad_norm": 5.727314472198486, + "learning_rate": 9.660340667139816e-05, + "loss": 0.0676846444606781, + "step": 23940 + }, + { + "epoch": 3.3995741660752308, + "grad_norm": 3.3505942821502686, + "learning_rate": 9.660198722498226e-05, + "loss": 0.12202495336532593, + "step": 23950 + }, + { + "epoch": 3.4009936124911286, + "grad_norm": 1.6798441410064697, + "learning_rate": 9.660056777856637e-05, + "loss": 0.10003808736801148, + "step": 23960 + }, + { + "epoch": 3.402413058907026, + "grad_norm": 2.8134841918945312, + "learning_rate": 9.659914833215046e-05, + "loss": 0.053173118829727174, + "step": 23970 + }, + { + "epoch": 3.403832505322924, + "grad_norm": 9.647566795349121, + "learning_rate": 9.659772888573458e-05, + "loss": 0.09169653654098511, + "step": 23980 + }, + { + "epoch": 3.405251951738822, + "grad_norm": 2.525071620941162, + "learning_rate": 9.659630943931867e-05, + "loss": 0.05470997095108032, + "step": 23990 + }, + { + "epoch": 3.4066713981547196, + "grad_norm": 7.918493270874023, + "learning_rate": 9.659488999290277e-05, + "loss": 0.12718768119812013, + "step": 24000 + }, + { + "epoch": 3.4066713981547196, + "eval_accuracy": 0.9593056526991798, + "eval_loss": 0.11784256994724274, + "eval_runtime": 34.1419, + "eval_samples_per_second": 460.636, + "eval_steps_per_second": 14.41, + "step": 24000 + }, + { + "epoch": 3.4080908445706175, + "grad_norm": 8.479427337646484, + "learning_rate": 9.659347054648688e-05, + "loss": 0.12565889358520507, + "step": 24010 + }, + { + "epoch": 
3.4095102909865154, + "grad_norm": 1.4310401678085327, + "learning_rate": 9.659205110007098e-05, + "loss": 0.07409765720367431, + "step": 24020 + }, + { + "epoch": 3.4109297374024132, + "grad_norm": 1.3293160200119019, + "learning_rate": 9.659063165365509e-05, + "loss": 0.06405404210090637, + "step": 24030 + }, + { + "epoch": 3.4123491838183106, + "grad_norm": 2.3439300060272217, + "learning_rate": 9.658921220723917e-05, + "loss": 0.07644574642181397, + "step": 24040 + }, + { + "epoch": 3.4137686302342085, + "grad_norm": 5.991164207458496, + "learning_rate": 9.658779276082328e-05, + "loss": 0.1311761498451233, + "step": 24050 + }, + { + "epoch": 3.4151880766501064, + "grad_norm": 4.515506267547607, + "learning_rate": 9.658637331440738e-05, + "loss": 0.080819970369339, + "step": 24060 + }, + { + "epoch": 3.4166075230660042, + "grad_norm": 3.080458402633667, + "learning_rate": 9.658495386799149e-05, + "loss": 0.07668147087097169, + "step": 24070 + }, + { + "epoch": 3.418026969481902, + "grad_norm": 6.942470550537109, + "learning_rate": 9.658353442157559e-05, + "loss": 0.07289301753044128, + "step": 24080 + }, + { + "epoch": 3.4194464158978, + "grad_norm": 9.14225959777832, + "learning_rate": 9.658211497515969e-05, + "loss": 0.07435898780822754, + "step": 24090 + }, + { + "epoch": 3.420865862313698, + "grad_norm": 7.3029704093933105, + "learning_rate": 9.65806955287438e-05, + "loss": 0.12275665998458862, + "step": 24100 + }, + { + "epoch": 3.4222853087295952, + "grad_norm": 1.066394567489624, + "learning_rate": 9.65792760823279e-05, + "loss": 0.12547402381896972, + "step": 24110 + }, + { + "epoch": 3.423704755145493, + "grad_norm": 2.095668315887451, + "learning_rate": 9.6577856635912e-05, + "loss": 0.08885858654975891, + "step": 24120 + }, + { + "epoch": 3.425124201561391, + "grad_norm": 10.10063648223877, + "learning_rate": 9.65764371894961e-05, + "loss": 0.08219894766807556, + "step": 24130 + }, + { + "epoch": 3.426543647977289, + "grad_norm": 0.24362793564796448, 
+ "learning_rate": 9.65750177430802e-05, + "loss": 0.07828856706619262, + "step": 24140 + }, + { + "epoch": 3.4279630943931867, + "grad_norm": 3.3321142196655273, + "learning_rate": 9.65735982966643e-05, + "loss": 0.052914398908615115, + "step": 24150 + }, + { + "epoch": 3.4293825408090846, + "grad_norm": 6.5169291496276855, + "learning_rate": 9.657217885024841e-05, + "loss": 0.0918683409690857, + "step": 24160 + }, + { + "epoch": 3.4308019872249824, + "grad_norm": 1.8033021688461304, + "learning_rate": 9.657075940383251e-05, + "loss": 0.07939133048057556, + "step": 24170 + }, + { + "epoch": 3.43222143364088, + "grad_norm": 0.5477492213249207, + "learning_rate": 9.656933995741662e-05, + "loss": 0.08349984288215637, + "step": 24180 + }, + { + "epoch": 3.4336408800567777, + "grad_norm": 5.996103763580322, + "learning_rate": 9.656792051100072e-05, + "loss": 0.05642620325088501, + "step": 24190 + }, + { + "epoch": 3.4350603264726756, + "grad_norm": 10.91261100769043, + "learning_rate": 9.656650106458481e-05, + "loss": 0.10933701992034912, + "step": 24200 + }, + { + "epoch": 3.4364797728885734, + "grad_norm": 2.225350856781006, + "learning_rate": 9.656508161816892e-05, + "loss": 0.09172443151474, + "step": 24210 + }, + { + "epoch": 3.4378992193044713, + "grad_norm": 12.634965896606445, + "learning_rate": 9.656366217175302e-05, + "loss": 0.11917402744293212, + "step": 24220 + }, + { + "epoch": 3.439318665720369, + "grad_norm": 1.6125768423080444, + "learning_rate": 9.656224272533713e-05, + "loss": 0.06305748820304871, + "step": 24230 + }, + { + "epoch": 3.440738112136267, + "grad_norm": 0.32264623045921326, + "learning_rate": 9.656082327892123e-05, + "loss": 0.053128784894943236, + "step": 24240 + }, + { + "epoch": 3.4421575585521644, + "grad_norm": 1.5485633611679077, + "learning_rate": 9.655940383250533e-05, + "loss": 0.09052397012710571, + "step": 24250 + }, + { + "epoch": 3.4435770049680623, + "grad_norm": 8.407336235046387, + "learning_rate": 9.655798438608942e-05, 
+ "loss": 0.0869211733341217, + "step": 24260 + }, + { + "epoch": 3.44499645138396, + "grad_norm": 4.730905532836914, + "learning_rate": 9.655656493967353e-05, + "loss": 0.07399642467498779, + "step": 24270 + }, + { + "epoch": 3.446415897799858, + "grad_norm": 3.4000537395477295, + "learning_rate": 9.655514549325763e-05, + "loss": 0.047950705885887145, + "step": 24280 + }, + { + "epoch": 3.447835344215756, + "grad_norm": 1.1020469665527344, + "learning_rate": 9.655372604684174e-05, + "loss": 0.06868406534194946, + "step": 24290 + }, + { + "epoch": 3.4492547906316537, + "grad_norm": 7.190598964691162, + "learning_rate": 9.655230660042584e-05, + "loss": 0.0772173523902893, + "step": 24300 + }, + { + "epoch": 3.4506742370475516, + "grad_norm": 0.16195560991764069, + "learning_rate": 9.655088715400994e-05, + "loss": 0.05037579536437988, + "step": 24310 + }, + { + "epoch": 3.452093683463449, + "grad_norm": 5.206357955932617, + "learning_rate": 9.654946770759405e-05, + "loss": 0.05795242190361023, + "step": 24320 + }, + { + "epoch": 3.453513129879347, + "grad_norm": 3.8032917976379395, + "learning_rate": 9.654804826117815e-05, + "loss": 0.06166144609451294, + "step": 24330 + }, + { + "epoch": 3.4549325762952448, + "grad_norm": 7.195924282073975, + "learning_rate": 9.654662881476226e-05, + "loss": 0.07787706851959228, + "step": 24340 + }, + { + "epoch": 3.4563520227111426, + "grad_norm": 16.916200637817383, + "learning_rate": 9.654520936834634e-05, + "loss": 0.12234679460525513, + "step": 24350 + }, + { + "epoch": 3.4577714691270405, + "grad_norm": 8.12978458404541, + "learning_rate": 9.654378992193045e-05, + "loss": 0.08913070559501649, + "step": 24360 + }, + { + "epoch": 3.4591909155429383, + "grad_norm": 5.649082660675049, + "learning_rate": 9.654237047551455e-05, + "loss": 0.06851221919059754, + "step": 24370 + }, + { + "epoch": 3.460610361958836, + "grad_norm": 9.085246086120605, + "learning_rate": 9.654095102909866e-05, + "loss": 0.07936888933181763, + "step": 24380 
+ }, + { + "epoch": 3.4620298083747336, + "grad_norm": 6.739210605621338, + "learning_rate": 9.653953158268276e-05, + "loss": 0.0680124282836914, + "step": 24390 + }, + { + "epoch": 3.4634492547906315, + "grad_norm": 4.914496421813965, + "learning_rate": 9.653811213626686e-05, + "loss": 0.07251676321029663, + "step": 24400 + }, + { + "epoch": 3.4648687012065293, + "grad_norm": 3.8612000942230225, + "learning_rate": 9.653669268985097e-05, + "loss": 0.08312456011772155, + "step": 24410 + }, + { + "epoch": 3.466288147622427, + "grad_norm": 5.080418109893799, + "learning_rate": 9.653527324343506e-05, + "loss": 0.08824545741081238, + "step": 24420 + }, + { + "epoch": 3.467707594038325, + "grad_norm": 2.7461204528808594, + "learning_rate": 9.653385379701917e-05, + "loss": 0.04693276584148407, + "step": 24430 + }, + { + "epoch": 3.469127040454223, + "grad_norm": 2.284554958343506, + "learning_rate": 9.653243435060327e-05, + "loss": 0.10196805000305176, + "step": 24440 + }, + { + "epoch": 3.470546486870121, + "grad_norm": 6.074938774108887, + "learning_rate": 9.653101490418737e-05, + "loss": 0.12747013568878174, + "step": 24450 + }, + { + "epoch": 3.471965933286018, + "grad_norm": 4.511362075805664, + "learning_rate": 9.652959545777147e-05, + "loss": 0.115402352809906, + "step": 24460 + }, + { + "epoch": 3.473385379701916, + "grad_norm": 2.1728434562683105, + "learning_rate": 9.652817601135558e-05, + "loss": 0.050969237089157106, + "step": 24470 + }, + { + "epoch": 3.474804826117814, + "grad_norm": 5.665693283081055, + "learning_rate": 9.652675656493967e-05, + "loss": 0.06567577123641968, + "step": 24480 + }, + { + "epoch": 3.476224272533712, + "grad_norm": 1.5518124103546143, + "learning_rate": 9.652533711852379e-05, + "loss": 0.05959618091583252, + "step": 24490 + }, + { + "epoch": 3.4776437189496097, + "grad_norm": 8.269552230834961, + "learning_rate": 9.652391767210788e-05, + "loss": 0.13263360261917115, + "step": 24500 + }, + { + "epoch": 3.4776437189496097, + 
"eval_accuracy": 0.9656641444649329, + "eval_loss": 0.10235972702503204, + "eval_runtime": 33.7613, + "eval_samples_per_second": 465.829, + "eval_steps_per_second": 14.573, + "step": 24500 + }, + { + "epoch": 3.4790631653655075, + "grad_norm": 4.476282119750977, + "learning_rate": 9.652249822569198e-05, + "loss": 0.06739105582237244, + "step": 24510 + }, + { + "epoch": 3.4804826117814054, + "grad_norm": 8.51496410369873, + "learning_rate": 9.652107877927609e-05, + "loss": 0.08702877759933472, + "step": 24520 + }, + { + "epoch": 3.481902058197303, + "grad_norm": 2.6295464038848877, + "learning_rate": 9.651965933286019e-05, + "loss": 0.10381957292556762, + "step": 24530 + }, + { + "epoch": 3.4833215046132007, + "grad_norm": 2.952054023742676, + "learning_rate": 9.65182398864443e-05, + "loss": 0.10116275548934936, + "step": 24540 + }, + { + "epoch": 3.4847409510290985, + "grad_norm": 7.40458869934082, + "learning_rate": 9.651682044002838e-05, + "loss": 0.1063815712928772, + "step": 24550 + }, + { + "epoch": 3.4861603974449964, + "grad_norm": 3.4271445274353027, + "learning_rate": 9.65154009936125e-05, + "loss": 0.12271822690963745, + "step": 24560 + }, + { + "epoch": 3.4875798438608943, + "grad_norm": 4.230976581573486, + "learning_rate": 9.651398154719659e-05, + "loss": 0.06680415868759156, + "step": 24570 + }, + { + "epoch": 3.488999290276792, + "grad_norm": 10.036641120910645, + "learning_rate": 9.65125621007807e-05, + "loss": 0.05150268673896789, + "step": 24580 + }, + { + "epoch": 3.49041873669269, + "grad_norm": 7.777481555938721, + "learning_rate": 9.65111426543648e-05, + "loss": 0.08956578969955445, + "step": 24590 + }, + { + "epoch": 3.4918381831085874, + "grad_norm": 5.8065266609191895, + "learning_rate": 9.650972320794891e-05, + "loss": 0.10252159833908081, + "step": 24600 + }, + { + "epoch": 3.4932576295244853, + "grad_norm": 0.654843270778656, + "learning_rate": 9.650830376153301e-05, + "loss": 0.0543290913105011, + "step": 24610 + }, + { + "epoch": 
3.494677075940383, + "grad_norm": 0.9799067974090576, + "learning_rate": 9.65068843151171e-05, + "loss": 0.0650719940662384, + "step": 24620 + }, + { + "epoch": 3.496096522356281, + "grad_norm": 1.798651933670044, + "learning_rate": 9.650546486870122e-05, + "loss": 0.06141197085380554, + "step": 24630 + }, + { + "epoch": 3.497515968772179, + "grad_norm": 3.3337302207946777, + "learning_rate": 9.650404542228531e-05, + "loss": 0.07960495352745056, + "step": 24640 + }, + { + "epoch": 3.4989354151880767, + "grad_norm": 0.37414559721946716, + "learning_rate": 9.650262597586942e-05, + "loss": 0.10498731136322022, + "step": 24650 + }, + { + "epoch": 3.5003548616039746, + "grad_norm": 11.681800842285156, + "learning_rate": 9.650120652945351e-05, + "loss": 0.09571239948272706, + "step": 24660 + }, + { + "epoch": 3.501774308019872, + "grad_norm": 1.942776083946228, + "learning_rate": 9.649978708303762e-05, + "loss": 0.11830227375030518, + "step": 24670 + }, + { + "epoch": 3.5031937544357703, + "grad_norm": 3.5960655212402344, + "learning_rate": 9.649836763662172e-05, + "loss": 0.056894832849502565, + "step": 24680 + }, + { + "epoch": 3.5046132008516677, + "grad_norm": 5.279286861419678, + "learning_rate": 9.649694819020583e-05, + "loss": 0.11261140108108521, + "step": 24690 + }, + { + "epoch": 3.5060326472675656, + "grad_norm": 3.89916729927063, + "learning_rate": 9.649552874378993e-05, + "loss": 0.09311820268630981, + "step": 24700 + }, + { + "epoch": 3.5074520936834634, + "grad_norm": 0.738353431224823, + "learning_rate": 9.649410929737402e-05, + "loss": 0.08309696912765503, + "step": 24710 + }, + { + "epoch": 3.5088715400993613, + "grad_norm": 6.307223320007324, + "learning_rate": 9.649268985095813e-05, + "loss": 0.07369316220283509, + "step": 24720 + }, + { + "epoch": 3.510290986515259, + "grad_norm": 8.444607734680176, + "learning_rate": 9.649127040454223e-05, + "loss": 0.07793084383010865, + "step": 24730 + }, + { + "epoch": 3.5117104329311566, + "grad_norm": 
1.6136986017227173, + "learning_rate": 9.648985095812634e-05, + "loss": 0.11156415939331055, + "step": 24740 + }, + { + "epoch": 3.513129879347055, + "grad_norm": 6.505612373352051, + "learning_rate": 9.648843151171044e-05, + "loss": 0.06763787865638733, + "step": 24750 + }, + { + "epoch": 3.5145493257629523, + "grad_norm": 8.121411323547363, + "learning_rate": 9.648701206529454e-05, + "loss": 0.09036332368850708, + "step": 24760 + }, + { + "epoch": 3.51596877217885, + "grad_norm": 4.047122001647949, + "learning_rate": 9.648559261887863e-05, + "loss": 0.10994062423706055, + "step": 24770 + }, + { + "epoch": 3.517388218594748, + "grad_norm": 0.8031113743782043, + "learning_rate": 9.648417317246275e-05, + "loss": 0.1324693441390991, + "step": 24780 + }, + { + "epoch": 3.518807665010646, + "grad_norm": 0.1589478850364685, + "learning_rate": 9.648275372604684e-05, + "loss": 0.07809120416641235, + "step": 24790 + }, + { + "epoch": 3.5202271114265438, + "grad_norm": 5.511590480804443, + "learning_rate": 9.648133427963095e-05, + "loss": 0.09465236663818359, + "step": 24800 + }, + { + "epoch": 3.521646557842441, + "grad_norm": 2.396857738494873, + "learning_rate": 9.647991483321505e-05, + "loss": 0.09788199663162231, + "step": 24810 + }, + { + "epoch": 3.5230660042583395, + "grad_norm": 3.002704381942749, + "learning_rate": 9.647849538679915e-05, + "loss": 0.05331340432167053, + "step": 24820 + }, + { + "epoch": 3.524485450674237, + "grad_norm": 0.42355236411094666, + "learning_rate": 9.647707594038326e-05, + "loss": 0.0318134218454361, + "step": 24830 + }, + { + "epoch": 3.5259048970901348, + "grad_norm": 2.182748794555664, + "learning_rate": 9.647565649396736e-05, + "loss": 0.07532593607902527, + "step": 24840 + }, + { + "epoch": 3.5273243435060326, + "grad_norm": 8.828009605407715, + "learning_rate": 9.647423704755147e-05, + "loss": 0.060152608156204226, + "step": 24850 + }, + { + "epoch": 3.5287437899219305, + "grad_norm": 4.714108943939209, + "learning_rate": 
9.647281760113555e-05, + "loss": 0.029934373497962952, + "step": 24860 + }, + { + "epoch": 3.5301632363378284, + "grad_norm": 0.8313024640083313, + "learning_rate": 9.647139815471966e-05, + "loss": 0.12967721223831177, + "step": 24870 + }, + { + "epoch": 3.5315826827537258, + "grad_norm": 2.8028974533081055, + "learning_rate": 9.646997870830376e-05, + "loss": 0.08742020130157471, + "step": 24880 + }, + { + "epoch": 3.533002129169624, + "grad_norm": 5.4242143630981445, + "learning_rate": 9.646855926188787e-05, + "loss": 0.11844632625579835, + "step": 24890 + }, + { + "epoch": 3.5344215755855215, + "grad_norm": 6.741092681884766, + "learning_rate": 9.646713981547197e-05, + "loss": 0.08173008561134339, + "step": 24900 + }, + { + "epoch": 3.5358410220014194, + "grad_norm": 7.06593132019043, + "learning_rate": 9.646572036905607e-05, + "loss": 0.10493273735046386, + "step": 24910 + }, + { + "epoch": 3.5372604684173172, + "grad_norm": 0.9364591836929321, + "learning_rate": 9.646430092264018e-05, + "loss": 0.09316438436508179, + "step": 24920 + }, + { + "epoch": 3.538679914833215, + "grad_norm": 6.34156608581543, + "learning_rate": 9.646288147622427e-05, + "loss": 0.04985399842262268, + "step": 24930 + }, + { + "epoch": 3.540099361249113, + "grad_norm": 13.556730270385742, + "learning_rate": 9.646146202980838e-05, + "loss": 0.0970345377922058, + "step": 24940 + }, + { + "epoch": 3.5415188076650104, + "grad_norm": 7.102383613586426, + "learning_rate": 9.646004258339248e-05, + "loss": 0.10454981327056885, + "step": 24950 + }, + { + "epoch": 3.5429382540809087, + "grad_norm": 5.229292869567871, + "learning_rate": 9.645862313697659e-05, + "loss": 0.08302426338195801, + "step": 24960 + }, + { + "epoch": 3.544357700496806, + "grad_norm": 5.76925802230835, + "learning_rate": 9.645720369056068e-05, + "loss": 0.10274431705474854, + "step": 24970 + }, + { + "epoch": 3.545777146912704, + "grad_norm": 4.70728063583374, + "learning_rate": 9.645578424414479e-05, + "loss": 
0.0719529628753662, + "step": 24980 + }, + { + "epoch": 3.547196593328602, + "grad_norm": 4.43380069732666, + "learning_rate": 9.645436479772888e-05, + "loss": 0.060102427005767824, + "step": 24990 + }, + { + "epoch": 3.5486160397444997, + "grad_norm": 4.603033542633057, + "learning_rate": 9.6452945351313e-05, + "loss": 0.10912116765975952, + "step": 25000 + }, + { + "epoch": 3.5486160397444997, + "eval_accuracy": 0.9611496153112482, + "eval_loss": 0.11311028897762299, + "eval_runtime": 33.6933, + "eval_samples_per_second": 466.769, + "eval_steps_per_second": 14.602, + "step": 25000 + }, + { + "epoch": 3.5500354861603975, + "grad_norm": 1.6893727779388428, + "learning_rate": 9.645152590489709e-05, + "loss": 0.08042104840278626, + "step": 25010 + }, + { + "epoch": 3.5514549325762954, + "grad_norm": 0.5648311376571655, + "learning_rate": 9.645010645848119e-05, + "loss": 0.059700363874435426, + "step": 25020 + }, + { + "epoch": 3.5528743789921933, + "grad_norm": 9.752403259277344, + "learning_rate": 9.64486870120653e-05, + "loss": 0.0868448257446289, + "step": 25030 + }, + { + "epoch": 3.5542938254080907, + "grad_norm": 2.9299299716949463, + "learning_rate": 9.64472675656494e-05, + "loss": 0.04335830807685852, + "step": 25040 + }, + { + "epoch": 3.5557132718239886, + "grad_norm": 10.618478775024414, + "learning_rate": 9.644584811923351e-05, + "loss": 0.1310647249221802, + "step": 25050 + }, + { + "epoch": 3.5571327182398864, + "grad_norm": 9.584770202636719, + "learning_rate": 9.644442867281761e-05, + "loss": 0.11195597648620606, + "step": 25060 + }, + { + "epoch": 3.5585521646557843, + "grad_norm": 3.4068570137023926, + "learning_rate": 9.64430092264017e-05, + "loss": 0.11045770645141602, + "step": 25070 + }, + { + "epoch": 3.559971611071682, + "grad_norm": 2.1086835861206055, + "learning_rate": 9.64415897799858e-05, + "loss": 0.04244246780872345, + "step": 25080 + }, + { + "epoch": 3.56139105748758, + "grad_norm": 0.7292802929878235, + "learning_rate": 
9.644017033356991e-05, + "loss": 0.05122672319412232, + "step": 25090 + }, + { + "epoch": 3.562810503903478, + "grad_norm": 0.779449999332428, + "learning_rate": 9.643875088715401e-05, + "loss": 0.05376675724983215, + "step": 25100 + }, + { + "epoch": 3.5642299503193753, + "grad_norm": 6.790277004241943, + "learning_rate": 9.643733144073812e-05, + "loss": 0.08969722390174865, + "step": 25110 + }, + { + "epoch": 3.565649396735273, + "grad_norm": 0.9874815344810486, + "learning_rate": 9.643591199432222e-05, + "loss": 0.07819917201995849, + "step": 25120 + }, + { + "epoch": 3.567068843151171, + "grad_norm": 3.9310240745544434, + "learning_rate": 9.643449254790632e-05, + "loss": 0.07071614861488343, + "step": 25130 + }, + { + "epoch": 3.568488289567069, + "grad_norm": 13.718709945678711, + "learning_rate": 9.643307310149043e-05, + "loss": 0.09383904933929443, + "step": 25140 + }, + { + "epoch": 3.5699077359829667, + "grad_norm": 6.163698673248291, + "learning_rate": 9.643165365507452e-05, + "loss": 0.10959553718566895, + "step": 25150 + }, + { + "epoch": 3.5713271823988646, + "grad_norm": 6.060120105743408, + "learning_rate": 9.643023420865864e-05, + "loss": 0.060175150632858276, + "step": 25160 + }, + { + "epoch": 3.5727466288147625, + "grad_norm": 1.9445127248764038, + "learning_rate": 9.642881476224272e-05, + "loss": 0.10089634656906128, + "step": 25170 + }, + { + "epoch": 3.57416607523066, + "grad_norm": 8.044722557067871, + "learning_rate": 9.642739531582683e-05, + "loss": 0.12044985294342041, + "step": 25180 + }, + { + "epoch": 3.5755855216465577, + "grad_norm": 6.326447010040283, + "learning_rate": 9.642597586941093e-05, + "loss": 0.09188529253005981, + "step": 25190 + }, + { + "epoch": 3.5770049680624556, + "grad_norm": 1.62061607837677, + "learning_rate": 9.642455642299504e-05, + "loss": 0.06626140475273132, + "step": 25200 + }, + { + "epoch": 3.5784244144783535, + "grad_norm": 6.9937591552734375, + "learning_rate": 9.642313697657915e-05, + "loss": 
0.12362555265426636, + "step": 25210 + }, + { + "epoch": 3.5798438608942513, + "grad_norm": 3.7871155738830566, + "learning_rate": 9.642171753016323e-05, + "loss": 0.08823931813240052, + "step": 25220 + }, + { + "epoch": 3.581263307310149, + "grad_norm": 1.0720821619033813, + "learning_rate": 9.642029808374734e-05, + "loss": 0.049576738476753236, + "step": 25230 + }, + { + "epoch": 3.582682753726047, + "grad_norm": 0.7439848780632019, + "learning_rate": 9.641887863733144e-05, + "loss": 0.07031044960021973, + "step": 25240 + }, + { + "epoch": 3.5841022001419445, + "grad_norm": 8.99116325378418, + "learning_rate": 9.641745919091555e-05, + "loss": 0.09977667331695557, + "step": 25250 + }, + { + "epoch": 3.5855216465578423, + "grad_norm": 0.37592869997024536, + "learning_rate": 9.641603974449965e-05, + "loss": 0.0772906482219696, + "step": 25260 + }, + { + "epoch": 3.58694109297374, + "grad_norm": 6.236084938049316, + "learning_rate": 9.641462029808375e-05, + "loss": 0.05990852117538452, + "step": 25270 + }, + { + "epoch": 3.588360539389638, + "grad_norm": 3.208134412765503, + "learning_rate": 9.641320085166784e-05, + "loss": 0.11211087703704833, + "step": 25280 + }, + { + "epoch": 3.589779985805536, + "grad_norm": 1.552689552307129, + "learning_rate": 9.641178140525196e-05, + "loss": 0.08635483980178833, + "step": 25290 + }, + { + "epoch": 3.591199432221434, + "grad_norm": 6.984618663787842, + "learning_rate": 9.641036195883607e-05, + "loss": 0.06257756948471069, + "step": 25300 + }, + { + "epoch": 3.5926188786373316, + "grad_norm": 0.3589995503425598, + "learning_rate": 9.640894251242016e-05, + "loss": 0.07547361850738525, + "step": 25310 + }, + { + "epoch": 3.594038325053229, + "grad_norm": 7.3257737159729, + "learning_rate": 9.640752306600427e-05, + "loss": 0.04197915494441986, + "step": 25320 + }, + { + "epoch": 3.595457771469127, + "grad_norm": 5.887513160705566, + "learning_rate": 9.640610361958836e-05, + "loss": 0.06401208639144898, + "step": 25330 + }, + { + 
"epoch": 3.596877217885025, + "grad_norm": 7.882718086242676, + "learning_rate": 9.640468417317247e-05, + "loss": 0.06862297058105468, + "step": 25340 + }, + { + "epoch": 3.5982966643009227, + "grad_norm": 1.1109976768493652, + "learning_rate": 9.640340667139815e-05, + "loss": 0.07234618067741394, + "step": 25350 + }, + { + "epoch": 3.5997161107168205, + "grad_norm": 11.460066795349121, + "learning_rate": 9.640198722498226e-05, + "loss": 0.07255152463912964, + "step": 25360 + }, + { + "epoch": 3.6011355571327184, + "grad_norm": 9.745214462280273, + "learning_rate": 9.640056777856636e-05, + "loss": 0.10036202669143676, + "step": 25370 + }, + { + "epoch": 3.6025550035486162, + "grad_norm": 2.1519269943237305, + "learning_rate": 9.639914833215047e-05, + "loss": 0.06883406043052673, + "step": 25380 + }, + { + "epoch": 3.6039744499645137, + "grad_norm": 2.5749173164367676, + "learning_rate": 9.639772888573457e-05, + "loss": 0.03991932868957519, + "step": 25390 + }, + { + "epoch": 3.6053938963804115, + "grad_norm": 5.585699558258057, + "learning_rate": 9.639630943931867e-05, + "loss": 0.07052261233329774, + "step": 25400 + }, + { + "epoch": 3.6068133427963094, + "grad_norm": 10.144248008728027, + "learning_rate": 9.639488999290277e-05, + "loss": 0.08440894484519959, + "step": 25410 + }, + { + "epoch": 3.6082327892122072, + "grad_norm": 7.000726222991943, + "learning_rate": 9.639347054648688e-05, + "loss": 0.11805384159088135, + "step": 25420 + }, + { + "epoch": 3.609652235628105, + "grad_norm": 4.314553737640381, + "learning_rate": 9.639205110007097e-05, + "loss": 0.04131576418876648, + "step": 25430 + }, + { + "epoch": 3.611071682044003, + "grad_norm": 6.750652313232422, + "learning_rate": 9.639063165365508e-05, + "loss": 0.16327909231185914, + "step": 25440 + }, + { + "epoch": 3.612491128459901, + "grad_norm": 4.056532859802246, + "learning_rate": 9.638921220723918e-05, + "loss": 0.08948258757591247, + "step": 25450 + }, + { + "epoch": 3.6139105748757983, + 
"grad_norm": 1.1540457010269165, + "learning_rate": 9.638779276082328e-05, + "loss": 0.059862494468688965, + "step": 25460 + }, + { + "epoch": 3.615330021291696, + "grad_norm": 0.861678421497345, + "learning_rate": 9.638637331440739e-05, + "loss": 0.056112641096115114, + "step": 25470 + }, + { + "epoch": 3.616749467707594, + "grad_norm": 5.665146827697754, + "learning_rate": 9.638495386799149e-05, + "loss": 0.07238735556602478, + "step": 25480 + }, + { + "epoch": 3.618168914123492, + "grad_norm": 1.3516796827316284, + "learning_rate": 9.63835344215756e-05, + "loss": 0.07645809054374694, + "step": 25490 + }, + { + "epoch": 3.6195883605393897, + "grad_norm": 4.757277011871338, + "learning_rate": 9.638211497515968e-05, + "loss": 0.0696679949760437, + "step": 25500 + }, + { + "epoch": 3.6195883605393897, + "eval_accuracy": 0.9649011254530425, + "eval_loss": 0.10610143095254898, + "eval_runtime": 33.7176, + "eval_samples_per_second": 466.433, + "eval_steps_per_second": 14.592, + "step": 25500 + }, + { + "epoch": 3.6210078069552876, + "grad_norm": 4.6941819190979, + "learning_rate": 9.63806955287438e-05, + "loss": 0.05806577205657959, + "step": 25510 + }, + { + "epoch": 3.6224272533711854, + "grad_norm": 7.179256439208984, + "learning_rate": 9.637927608232789e-05, + "loss": 0.10271693468093872, + "step": 25520 + }, + { + "epoch": 3.623846699787083, + "grad_norm": 2.660531997680664, + "learning_rate": 9.6377856635912e-05, + "loss": 0.11873768568038941, + "step": 25530 + }, + { + "epoch": 3.6252661462029807, + "grad_norm": 2.4319779872894287, + "learning_rate": 9.63764371894961e-05, + "loss": 0.05240858197212219, + "step": 25540 + }, + { + "epoch": 3.6266855926188786, + "grad_norm": 3.6984822750091553, + "learning_rate": 9.63750177430802e-05, + "loss": 0.05370069146156311, + "step": 25550 + }, + { + "epoch": 3.6281050390347764, + "grad_norm": 4.340889930725098, + "learning_rate": 9.637359829666431e-05, + "loss": 0.08285000324249267, + "step": 25560 + }, + { + "epoch": 
3.6295244854506743, + "grad_norm": 9.616756439208984, + "learning_rate": 9.63721788502484e-05, + "loss": 0.07987736463546753, + "step": 25570 + }, + { + "epoch": 3.630943931866572, + "grad_norm": 1.1144945621490479, + "learning_rate": 9.637075940383252e-05, + "loss": 0.08109164237976074, + "step": 25580 + }, + { + "epoch": 3.63236337828247, + "grad_norm": 5.484223365783691, + "learning_rate": 9.636933995741661e-05, + "loss": 0.1028984785079956, + "step": 25590 + }, + { + "epoch": 3.6337828246983674, + "grad_norm": 1.9237008094787598, + "learning_rate": 9.636792051100071e-05, + "loss": 0.11396080255508423, + "step": 25600 + }, + { + "epoch": 3.6352022711142653, + "grad_norm": 4.099696159362793, + "learning_rate": 9.636650106458481e-05, + "loss": 0.10055809020996094, + "step": 25610 + }, + { + "epoch": 3.636621717530163, + "grad_norm": 1.0165332555770874, + "learning_rate": 9.636508161816892e-05, + "loss": 0.03974857628345489, + "step": 25620 + }, + { + "epoch": 3.638041163946061, + "grad_norm": 1.9846147298812866, + "learning_rate": 9.636366217175302e-05, + "loss": 0.11356563568115234, + "step": 25630 + }, + { + "epoch": 3.639460610361959, + "grad_norm": 0.7101534008979797, + "learning_rate": 9.636224272533713e-05, + "loss": 0.043464869260787964, + "step": 25640 + }, + { + "epoch": 3.6408800567778568, + "grad_norm": 10.363176345825195, + "learning_rate": 9.636082327892122e-05, + "loss": 0.10717108249664306, + "step": 25650 + }, + { + "epoch": 3.6422995031937546, + "grad_norm": 9.283759117126465, + "learning_rate": 9.635940383250532e-05, + "loss": 0.11051251888275146, + "step": 25660 + }, + { + "epoch": 3.643718949609652, + "grad_norm": 0.6984942555427551, + "learning_rate": 9.635798438608943e-05, + "loss": 0.06172139048576355, + "step": 25670 + }, + { + "epoch": 3.64513839602555, + "grad_norm": 8.953624725341797, + "learning_rate": 9.635656493967353e-05, + "loss": 0.08708047866821289, + "step": 25680 + }, + { + "epoch": 3.6465578424414478, + "grad_norm": 
3.1702566146850586, + "learning_rate": 9.635514549325764e-05, + "loss": 0.10060502290725708, + "step": 25690 + }, + { + "epoch": 3.6479772888573456, + "grad_norm": 1.4352515935897827, + "learning_rate": 9.635372604684174e-05, + "loss": 0.05796252489089966, + "step": 25700 + }, + { + "epoch": 3.6493967352732435, + "grad_norm": 2.140181303024292, + "learning_rate": 9.635230660042584e-05, + "loss": 0.09123912453651428, + "step": 25710 + }, + { + "epoch": 3.6508161816891413, + "grad_norm": 0.572270929813385, + "learning_rate": 9.635088715400993e-05, + "loss": 0.060646504163742065, + "step": 25720 + }, + { + "epoch": 3.652235628105039, + "grad_norm": 0.5034018158912659, + "learning_rate": 9.634946770759404e-05, + "loss": 0.07777016758918762, + "step": 25730 + }, + { + "epoch": 3.6536550745209366, + "grad_norm": 1.8546146154403687, + "learning_rate": 9.634804826117814e-05, + "loss": 0.1366788387298584, + "step": 25740 + }, + { + "epoch": 3.6550745209368345, + "grad_norm": 1.924846887588501, + "learning_rate": 9.634662881476225e-05, + "loss": 0.0842927873134613, + "step": 25750 + }, + { + "epoch": 3.6564939673527324, + "grad_norm": 4.480966567993164, + "learning_rate": 9.634520936834635e-05, + "loss": 0.051267868280410765, + "step": 25760 + }, + { + "epoch": 3.65791341376863, + "grad_norm": 6.783929347991943, + "learning_rate": 9.634378992193045e-05, + "loss": 0.07230629920959472, + "step": 25770 + }, + { + "epoch": 3.659332860184528, + "grad_norm": 0.09694766253232956, + "learning_rate": 9.634237047551456e-05, + "loss": 0.032146582007408143, + "step": 25780 + }, + { + "epoch": 3.660752306600426, + "grad_norm": 1.94701087474823, + "learning_rate": 9.634095102909866e-05, + "loss": 0.08497151136398315, + "step": 25790 + }, + { + "epoch": 3.662171753016324, + "grad_norm": 4.432292461395264, + "learning_rate": 9.633953158268277e-05, + "loss": 0.06812145113945008, + "step": 25800 + }, + { + "epoch": 3.6635911994322212, + "grad_norm": 1.2834193706512451, + "learning_rate": 
9.633811213626685e-05, + "loss": 0.045406836271286014, + "step": 25810 + }, + { + "epoch": 3.665010645848119, + "grad_norm": 10.543720245361328, + "learning_rate": 9.633669268985096e-05, + "loss": 0.11493253707885742, + "step": 25820 + }, + { + "epoch": 3.666430092264017, + "grad_norm": 8.311552047729492, + "learning_rate": 9.633527324343506e-05, + "loss": 0.12239972352981568, + "step": 25830 + }, + { + "epoch": 3.667849538679915, + "grad_norm": 3.1987102031707764, + "learning_rate": 9.633385379701917e-05, + "loss": 0.0902472972869873, + "step": 25840 + }, + { + "epoch": 3.6692689850958127, + "grad_norm": 3.2787559032440186, + "learning_rate": 9.633243435060327e-05, + "loss": 0.09204012155532837, + "step": 25850 + }, + { + "epoch": 3.6706884315117105, + "grad_norm": 0.667934775352478, + "learning_rate": 9.633101490418736e-05, + "loss": 0.12997353076934814, + "step": 25860 + }, + { + "epoch": 3.6721078779276084, + "grad_norm": 0.9998012185096741, + "learning_rate": 9.632959545777148e-05, + "loss": 0.1022377371788025, + "step": 25870 + }, + { + "epoch": 3.673527324343506, + "grad_norm": 6.183043479919434, + "learning_rate": 9.632817601135557e-05, + "loss": 0.11707621812820435, + "step": 25880 + }, + { + "epoch": 3.6749467707594037, + "grad_norm": 2.924884080886841, + "learning_rate": 9.632675656493968e-05, + "loss": 0.06619247198104858, + "step": 25890 + }, + { + "epoch": 3.6763662171753015, + "grad_norm": 5.365716934204102, + "learning_rate": 9.632533711852378e-05, + "loss": 0.06179612874984741, + "step": 25900 + }, + { + "epoch": 3.6777856635911994, + "grad_norm": 1.3756598234176636, + "learning_rate": 9.632391767210788e-05, + "loss": 0.06040409207344055, + "step": 25910 + }, + { + "epoch": 3.6792051100070973, + "grad_norm": 2.291795015335083, + "learning_rate": 9.632249822569198e-05, + "loss": 0.0542915940284729, + "step": 25920 + }, + { + "epoch": 3.680624556422995, + "grad_norm": 11.811894416809082, + "learning_rate": 9.632107877927609e-05, + "loss": 
0.11134748458862305, + "step": 25930 + }, + { + "epoch": 3.682044002838893, + "grad_norm": 7.955319881439209, + "learning_rate": 9.631965933286018e-05, + "loss": 0.05479052066802979, + "step": 25940 + }, + { + "epoch": 3.6834634492547904, + "grad_norm": 4.885499954223633, + "learning_rate": 9.63182398864443e-05, + "loss": 0.07002484798431396, + "step": 25950 + }, + { + "epoch": 3.6848828956706883, + "grad_norm": 10.359343528747559, + "learning_rate": 9.631682044002839e-05, + "loss": 0.08828185796737671, + "step": 25960 + }, + { + "epoch": 3.686302342086586, + "grad_norm": 0.5266070365905762, + "learning_rate": 9.631540099361249e-05, + "loss": 0.06352581977844238, + "step": 25970 + }, + { + "epoch": 3.687721788502484, + "grad_norm": 4.642971515655518, + "learning_rate": 9.63139815471966e-05, + "loss": 0.10442217588424682, + "step": 25980 + }, + { + "epoch": 3.689141234918382, + "grad_norm": 9.688862800598145, + "learning_rate": 9.63125621007807e-05, + "loss": 0.11465591192245483, + "step": 25990 + }, + { + "epoch": 3.6905606813342797, + "grad_norm": 3.961071252822876, + "learning_rate": 9.631114265436481e-05, + "loss": 0.06878976821899414, + "step": 26000 + }, + { + "epoch": 3.6905606813342797, + "eval_accuracy": 0.9624213136643988, + "eval_loss": 0.10850615799427032, + "eval_runtime": 33.8166, + "eval_samples_per_second": 465.068, + "eval_steps_per_second": 14.549, + "step": 26000 + }, + { + "epoch": 3.6919801277501776, + "grad_norm": 4.773269176483154, + "learning_rate": 9.63097232079489e-05, + "loss": 0.1178863525390625, + "step": 26010 + }, + { + "epoch": 3.693399574166075, + "grad_norm": 2.3169288635253906, + "learning_rate": 9.6308303761533e-05, + "loss": 0.11722581386566162, + "step": 26020 + }, + { + "epoch": 3.694819020581973, + "grad_norm": 0.1308957189321518, + "learning_rate": 9.63068843151171e-05, + "loss": 0.10760715007781982, + "step": 26030 + }, + { + "epoch": 3.6962384669978707, + "grad_norm": 1.4586437940597534, + "learning_rate": 
9.630546486870121e-05, + "loss": 0.05567214488983154, + "step": 26040 + }, + { + "epoch": 3.6976579134137686, + "grad_norm": 2.1539924144744873, + "learning_rate": 9.630404542228532e-05, + "loss": 0.06053451895713806, + "step": 26050 + }, + { + "epoch": 3.6990773598296665, + "grad_norm": 4.069761276245117, + "learning_rate": 9.630262597586942e-05, + "loss": 0.08152814507484436, + "step": 26060 + }, + { + "epoch": 3.7004968062455643, + "grad_norm": 7.161952018737793, + "learning_rate": 9.630120652945352e-05, + "loss": 0.12550090551376342, + "step": 26070 + }, + { + "epoch": 3.701916252661462, + "grad_norm": 2.8434369564056396, + "learning_rate": 9.629978708303761e-05, + "loss": 0.07767623662948608, + "step": 26080 + }, + { + "epoch": 3.7033356990773596, + "grad_norm": 0.979110062122345, + "learning_rate": 9.629836763662173e-05, + "loss": 0.11028465032577514, + "step": 26090 + }, + { + "epoch": 3.704755145493258, + "grad_norm": 0.522153913974762, + "learning_rate": 9.629694819020582e-05, + "loss": 0.04352694153785706, + "step": 26100 + }, + { + "epoch": 3.7061745919091553, + "grad_norm": 12.08892822265625, + "learning_rate": 9.629552874378993e-05, + "loss": 0.0692097783088684, + "step": 26110 + }, + { + "epoch": 3.707594038325053, + "grad_norm": 1.5270307064056396, + "learning_rate": 9.629410929737402e-05, + "loss": 0.07068887948989869, + "step": 26120 + }, + { + "epoch": 3.709013484740951, + "grad_norm": 8.704063415527344, + "learning_rate": 9.629268985095813e-05, + "loss": 0.1482228994369507, + "step": 26130 + }, + { + "epoch": 3.710432931156849, + "grad_norm": 5.276047706604004, + "learning_rate": 9.629127040454224e-05, + "loss": 0.12095144987106324, + "step": 26140 + }, + { + "epoch": 3.7118523775727468, + "grad_norm": 6.4501566886901855, + "learning_rate": 9.628985095812634e-05, + "loss": 0.11314345598220825, + "step": 26150 + }, + { + "epoch": 3.713271823988644, + "grad_norm": 1.444300889968872, + "learning_rate": 9.628843151171045e-05, + "loss": 
0.15359948873519896, + "step": 26160 + }, + { + "epoch": 3.7146912704045425, + "grad_norm": 0.7862725853919983, + "learning_rate": 9.628701206529453e-05, + "loss": 0.04201339781284332, + "step": 26170 + }, + { + "epoch": 3.71611071682044, + "grad_norm": 4.127621650695801, + "learning_rate": 9.628559261887864e-05, + "loss": 0.08473769426345826, + "step": 26180 + }, + { + "epoch": 3.717530163236338, + "grad_norm": 1.7975977659225464, + "learning_rate": 9.628417317246274e-05, + "loss": 0.06359924674034119, + "step": 26190 + }, + { + "epoch": 3.7189496096522356, + "grad_norm": 3.1404027938842773, + "learning_rate": 9.628275372604685e-05, + "loss": 0.10828089714050293, + "step": 26200 + }, + { + "epoch": 3.7203690560681335, + "grad_norm": 10.088000297546387, + "learning_rate": 9.628133427963095e-05, + "loss": 0.0649361789226532, + "step": 26210 + }, + { + "epoch": 3.7217885024840314, + "grad_norm": 4.136114120483398, + "learning_rate": 9.627991483321505e-05, + "loss": 0.09006186723709106, + "step": 26220 + }, + { + "epoch": 3.723207948899929, + "grad_norm": 0.03255090489983559, + "learning_rate": 9.627849538679916e-05, + "loss": 0.11616590023040771, + "step": 26230 + }, + { + "epoch": 3.724627395315827, + "grad_norm": 1.3517063856124878, + "learning_rate": 9.627707594038325e-05, + "loss": 0.05594800710678101, + "step": 26240 + }, + { + "epoch": 3.7260468417317245, + "grad_norm": 0.47422000765800476, + "learning_rate": 9.627565649396737e-05, + "loss": 0.11615034341812133, + "step": 26250 + }, + { + "epoch": 3.7274662881476224, + "grad_norm": 4.764305591583252, + "learning_rate": 9.627423704755146e-05, + "loss": 0.09596173763275147, + "step": 26260 + }, + { + "epoch": 3.7288857345635202, + "grad_norm": 7.1653008460998535, + "learning_rate": 9.627281760113556e-05, + "loss": 0.09863389730453491, + "step": 26270 + }, + { + "epoch": 3.730305180979418, + "grad_norm": 6.6298418045043945, + "learning_rate": 9.627139815471966e-05, + "loss": 0.08772293925285339, + "step": 26280 + 
}, + { + "epoch": 3.731724627395316, + "grad_norm": 2.4701497554779053, + "learning_rate": 9.626997870830377e-05, + "loss": 0.05061078667640686, + "step": 26290 + }, + { + "epoch": 3.7331440738112134, + "grad_norm": 10.637267112731934, + "learning_rate": 9.626855926188787e-05, + "loss": 0.134352707862854, + "step": 26300 + }, + { + "epoch": 3.7345635202271117, + "grad_norm": 1.4602038860321045, + "learning_rate": 9.626713981547198e-05, + "loss": 0.0697918713092804, + "step": 26310 + }, + { + "epoch": 3.735982966643009, + "grad_norm": 2.9405529499053955, + "learning_rate": 9.626572036905607e-05, + "loss": 0.08264508247375488, + "step": 26320 + }, + { + "epoch": 3.737402413058907, + "grad_norm": 1.7835215330123901, + "learning_rate": 9.626430092264017e-05, + "loss": 0.08539316654205323, + "step": 26330 + }, + { + "epoch": 3.738821859474805, + "grad_norm": 4.365605354309082, + "learning_rate": 9.626288147622428e-05, + "loss": 0.07182769775390625, + "step": 26340 + }, + { + "epoch": 3.7402413058907027, + "grad_norm": 0.8774107098579407, + "learning_rate": 9.626146202980838e-05, + "loss": 0.06151903867721557, + "step": 26350 + }, + { + "epoch": 3.7416607523066006, + "grad_norm": 6.543043613433838, + "learning_rate": 9.626004258339249e-05, + "loss": 0.08403420448303223, + "step": 26360 + }, + { + "epoch": 3.743080198722498, + "grad_norm": 8.38376522064209, + "learning_rate": 9.625862313697659e-05, + "loss": 0.09423916339874268, + "step": 26370 + }, + { + "epoch": 3.7444996451383963, + "grad_norm": 0.7524133920669556, + "learning_rate": 9.625720369056069e-05, + "loss": 0.09477906823158264, + "step": 26380 + }, + { + "epoch": 3.7459190915542937, + "grad_norm": 4.932705879211426, + "learning_rate": 9.625578424414478e-05, + "loss": 0.09350728392601013, + "step": 26390 + }, + { + "epoch": 3.7473385379701916, + "grad_norm": 1.0093165636062622, + "learning_rate": 9.62543647977289e-05, + "loss": 0.08254989981651306, + "step": 26400 + }, + { + "epoch": 3.7487579843860894, + 
"grad_norm": 10.72624683380127, + "learning_rate": 9.625294535131299e-05, + "loss": 0.14923367500305176, + "step": 26410 + }, + { + "epoch": 3.7501774308019873, + "grad_norm": 2.8247926235198975, + "learning_rate": 9.62515259048971e-05, + "loss": 0.048303854465484616, + "step": 26420 + }, + { + "epoch": 3.751596877217885, + "grad_norm": 1.634414792060852, + "learning_rate": 9.62501064584812e-05, + "loss": 0.04281752109527588, + "step": 26430 + }, + { + "epoch": 3.7530163236337826, + "grad_norm": 7.190004825592041, + "learning_rate": 9.62486870120653e-05, + "loss": 0.12459969520568848, + "step": 26440 + }, + { + "epoch": 3.754435770049681, + "grad_norm": 0.9193140864372253, + "learning_rate": 9.624726756564941e-05, + "loss": 0.10240849256515502, + "step": 26450 + }, + { + "epoch": 3.7558552164655783, + "grad_norm": 5.948113918304443, + "learning_rate": 9.62458481192335e-05, + "loss": 0.07362242937088012, + "step": 26460 + }, + { + "epoch": 3.757274662881476, + "grad_norm": 6.859321117401123, + "learning_rate": 9.624442867281762e-05, + "loss": 0.06381948590278626, + "step": 26470 + }, + { + "epoch": 3.758694109297374, + "grad_norm": 8.806060791015625, + "learning_rate": 9.62430092264017e-05, + "loss": 0.10844473838806153, + "step": 26480 + }, + { + "epoch": 3.760113555713272, + "grad_norm": 6.027776718139648, + "learning_rate": 9.624158977998581e-05, + "loss": 0.044374221563339235, + "step": 26490 + }, + { + "epoch": 3.7615330021291697, + "grad_norm": 12.30217456817627, + "learning_rate": 9.624017033356991e-05, + "loss": 0.08910216689109803, + "step": 26500 + }, + { + "epoch": 3.7615330021291697, + "eval_accuracy": 0.9674445221593438, + "eval_loss": 0.09445588290691376, + "eval_runtime": 35.0795, + "eval_samples_per_second": 448.325, + "eval_steps_per_second": 14.025, + "step": 26500 + }, + { + "epoch": 3.762952448545067, + "grad_norm": 9.912172317504883, + "learning_rate": 9.623875088715402e-05, + "loss": 0.07598323225975037, + "step": 26510 + }, + { + "epoch": 
3.7643718949609655, + "grad_norm": 6.507425785064697, + "learning_rate": 9.623733144073812e-05, + "loss": 0.10796759128570557, + "step": 26520 + }, + { + "epoch": 3.765791341376863, + "grad_norm": 0.8920461535453796, + "learning_rate": 9.623591199432221e-05, + "loss": 0.0660994827747345, + "step": 26530 + }, + { + "epoch": 3.7672107877927608, + "grad_norm": 0.7246550917625427, + "learning_rate": 9.623449254790632e-05, + "loss": 0.0721944272518158, + "step": 26540 + }, + { + "epoch": 3.7686302342086586, + "grad_norm": 8.431264877319336, + "learning_rate": 9.623307310149042e-05, + "loss": 0.11602548360824586, + "step": 26550 + }, + { + "epoch": 3.7700496806245565, + "grad_norm": 2.8587067127227783, + "learning_rate": 9.623165365507453e-05, + "loss": 0.0831636905670166, + "step": 26560 + }, + { + "epoch": 3.7714691270404543, + "grad_norm": 6.571961402893066, + "learning_rate": 9.623023420865863e-05, + "loss": 0.08520074486732483, + "step": 26570 + }, + { + "epoch": 3.7728885734563518, + "grad_norm": 9.178510665893555, + "learning_rate": 9.622881476224273e-05, + "loss": 0.10961424112319947, + "step": 26580 + }, + { + "epoch": 3.77430801987225, + "grad_norm": 5.433183670043945, + "learning_rate": 9.622739531582682e-05, + "loss": 0.035625565052032473, + "step": 26590 + }, + { + "epoch": 3.7757274662881475, + "grad_norm": 3.318091869354248, + "learning_rate": 9.622597586941094e-05, + "loss": 0.08876525163650513, + "step": 26600 + }, + { + "epoch": 3.7771469127040453, + "grad_norm": 5.857662677764893, + "learning_rate": 9.622455642299503e-05, + "loss": 0.09441637992858887, + "step": 26610 + }, + { + "epoch": 3.778566359119943, + "grad_norm": 0.6901429891586304, + "learning_rate": 9.622313697657914e-05, + "loss": 0.1499311089515686, + "step": 26620 + }, + { + "epoch": 3.779985805535841, + "grad_norm": 4.66740608215332, + "learning_rate": 9.622171753016324e-05, + "loss": 0.08082123398780823, + "step": 26630 + }, + { + "epoch": 3.781405251951739, + "grad_norm": 
1.0753750801086426, + "learning_rate": 9.622029808374734e-05, + "loss": 0.11261454820632935, + "step": 26640 + }, + { + "epoch": 3.7828246983676364, + "grad_norm": 1.8048758506774902, + "learning_rate": 9.621887863733145e-05, + "loss": 0.04800321459770203, + "step": 26650 + }, + { + "epoch": 3.7842441447835347, + "grad_norm": 4.112722396850586, + "learning_rate": 9.621745919091555e-05, + "loss": 0.05345563292503357, + "step": 26660 + }, + { + "epoch": 3.785663591199432, + "grad_norm": 4.393552303314209, + "learning_rate": 9.621603974449966e-05, + "loss": 0.06797432899475098, + "step": 26670 + }, + { + "epoch": 3.78708303761533, + "grad_norm": 5.8782057762146, + "learning_rate": 9.621462029808376e-05, + "loss": 0.10248314142227173, + "step": 26680 + }, + { + "epoch": 3.788502484031228, + "grad_norm": 4.203194618225098, + "learning_rate": 9.621320085166785e-05, + "loss": 0.07673492431640624, + "step": 26690 + }, + { + "epoch": 3.7899219304471257, + "grad_norm": 6.538164138793945, + "learning_rate": 9.621178140525195e-05, + "loss": 0.14795901775360107, + "step": 26700 + }, + { + "epoch": 3.7913413768630235, + "grad_norm": 11.21495532989502, + "learning_rate": 9.621036195883606e-05, + "loss": 0.11105353832244873, + "step": 26710 + }, + { + "epoch": 3.792760823278921, + "grad_norm": 9.616037368774414, + "learning_rate": 9.620894251242016e-05, + "loss": 0.06156564950942993, + "step": 26720 + }, + { + "epoch": 3.7941802696948193, + "grad_norm": 1.3275951147079468, + "learning_rate": 9.620752306600427e-05, + "loss": 0.10542140007019044, + "step": 26730 + }, + { + "epoch": 3.7955997161107167, + "grad_norm": 0.3900807797908783, + "learning_rate": 9.620610361958837e-05, + "loss": 0.07363483309745789, + "step": 26740 + }, + { + "epoch": 3.7970191625266145, + "grad_norm": 0.9365026950836182, + "learning_rate": 9.620468417317246e-05, + "loss": 0.13672546148300171, + "step": 26750 + }, + { + "epoch": 3.7984386089425124, + "grad_norm": 5.348412990570068, + "learning_rate": 
9.620326472675658e-05, + "loss": 0.08223192691802979, + "step": 26760 + }, + { + "epoch": 3.7998580553584103, + "grad_norm": 5.438634872436523, + "learning_rate": 9.620184528034067e-05, + "loss": 0.0626322865486145, + "step": 26770 + }, + { + "epoch": 3.801277501774308, + "grad_norm": 3.877898693084717, + "learning_rate": 9.620042583392478e-05, + "loss": 0.08450109958648681, + "step": 26780 + }, + { + "epoch": 3.802696948190206, + "grad_norm": 1.7447080612182617, + "learning_rate": 9.619900638750887e-05, + "loss": 0.0839974820613861, + "step": 26790 + }, + { + "epoch": 3.804116394606104, + "grad_norm": 3.0330183506011963, + "learning_rate": 9.619758694109298e-05, + "loss": 0.09644001126289367, + "step": 26800 + }, + { + "epoch": 3.8055358410220013, + "grad_norm": 1.270749568939209, + "learning_rate": 9.619616749467708e-05, + "loss": 0.0933254063129425, + "step": 26810 + }, + { + "epoch": 3.806955287437899, + "grad_norm": 7.720582485198975, + "learning_rate": 9.619474804826119e-05, + "loss": 0.10009632110595704, + "step": 26820 + }, + { + "epoch": 3.808374733853797, + "grad_norm": 0.09740854054689407, + "learning_rate": 9.619332860184528e-05, + "loss": 0.055448722839355466, + "step": 26830 + }, + { + "epoch": 3.809794180269695, + "grad_norm": 7.304279327392578, + "learning_rate": 9.619190915542938e-05, + "loss": 0.07708572149276734, + "step": 26840 + }, + { + "epoch": 3.8112136266855927, + "grad_norm": 2.435711622238159, + "learning_rate": 9.619048970901349e-05, + "loss": 0.03736964464187622, + "step": 26850 + }, + { + "epoch": 3.8126330731014906, + "grad_norm": 1.575897455215454, + "learning_rate": 9.618907026259759e-05, + "loss": 0.08769443035125732, + "step": 26860 + }, + { + "epoch": 3.8140525195173884, + "grad_norm": 2.552058458328247, + "learning_rate": 9.61876508161817e-05, + "loss": 0.08449045419692994, + "step": 26870 + }, + { + "epoch": 3.815471965933286, + "grad_norm": 5.736693382263184, + "learning_rate": 9.61862313697658e-05, + "loss": 
0.037884673476219176, + "step": 26880 + }, + { + "epoch": 3.8168914123491837, + "grad_norm": 1.4216078519821167, + "learning_rate": 9.61848119233499e-05, + "loss": 0.08100056648254395, + "step": 26890 + }, + { + "epoch": 3.8183108587650816, + "grad_norm": 2.799546480178833, + "learning_rate": 9.618339247693399e-05, + "loss": 0.06523057818412781, + "step": 26900 + }, + { + "epoch": 3.8197303051809794, + "grad_norm": 1.2117537260055542, + "learning_rate": 9.61819730305181e-05, + "loss": 0.049738320708274844, + "step": 26910 + }, + { + "epoch": 3.8211497515968773, + "grad_norm": 4.893204212188721, + "learning_rate": 9.61805535841022e-05, + "loss": 0.07888569235801697, + "step": 26920 + }, + { + "epoch": 3.822569198012775, + "grad_norm": 7.409163475036621, + "learning_rate": 9.617913413768631e-05, + "loss": 0.07680698037147522, + "step": 26930 + }, + { + "epoch": 3.823988644428673, + "grad_norm": 3.720153331756592, + "learning_rate": 9.617771469127041e-05, + "loss": 0.09198516607284546, + "step": 26940 + }, + { + "epoch": 3.8254080908445705, + "grad_norm": 6.0060224533081055, + "learning_rate": 9.61762952448545e-05, + "loss": 0.07883418202400208, + "step": 26950 + }, + { + "epoch": 3.8268275372604683, + "grad_norm": 1.143011212348938, + "learning_rate": 9.617487579843862e-05, + "loss": 0.04225245714187622, + "step": 26960 + }, + { + "epoch": 3.828246983676366, + "grad_norm": 12.127354621887207, + "learning_rate": 9.617345635202271e-05, + "loss": 0.05237703919410706, + "step": 26970 + }, + { + "epoch": 3.829666430092264, + "grad_norm": 5.843609809875488, + "learning_rate": 9.617203690560683e-05, + "loss": 0.08967744708061218, + "step": 26980 + }, + { + "epoch": 3.831085876508162, + "grad_norm": 2.9347615242004395, + "learning_rate": 9.617061745919091e-05, + "loss": 0.08393052220344543, + "step": 26990 + }, + { + "epoch": 3.8325053229240598, + "grad_norm": 4.1316657066345215, + "learning_rate": 9.616919801277502e-05, + "loss": 0.09489677548408508, + "step": 27000 + }, + 
{ + "epoch": 3.8325053229240598, + "eval_accuracy": 0.9656641444649329, + "eval_loss": 0.10252257436513901, + "eval_runtime": 33.8687, + "eval_samples_per_second": 464.352, + "eval_steps_per_second": 14.527, + "step": 27000 + }, + { + "epoch": 3.8339247693399576, + "grad_norm": 2.383795976638794, + "learning_rate": 9.616777856635912e-05, + "loss": 0.062172305583953855, + "step": 27010 + }, + { + "epoch": 3.835344215755855, + "grad_norm": 4.4788737297058105, + "learning_rate": 9.616635911994323e-05, + "loss": 0.08685917258262635, + "step": 27020 + }, + { + "epoch": 3.836763662171753, + "grad_norm": 4.786855220794678, + "learning_rate": 9.616493967352733e-05, + "loss": 0.04608690142631531, + "step": 27030 + }, + { + "epoch": 3.8381831085876508, + "grad_norm": 5.334591388702393, + "learning_rate": 9.616352022711144e-05, + "loss": 0.08949378728866578, + "step": 27040 + }, + { + "epoch": 3.8396025550035486, + "grad_norm": 6.602955341339111, + "learning_rate": 9.616210078069553e-05, + "loss": 0.08425779342651367, + "step": 27050 + }, + { + "epoch": 3.8410220014194465, + "grad_norm": 6.772454261779785, + "learning_rate": 9.616068133427963e-05, + "loss": 0.08206725120544434, + "step": 27060 + }, + { + "epoch": 3.8424414478353444, + "grad_norm": 2.036407709121704, + "learning_rate": 9.615926188786374e-05, + "loss": 0.12470332384109498, + "step": 27070 + }, + { + "epoch": 3.843860894251242, + "grad_norm": 5.146540641784668, + "learning_rate": 9.615784244144784e-05, + "loss": 0.041440796852111814, + "step": 27080 + }, + { + "epoch": 3.8452803406671396, + "grad_norm": 4.734882831573486, + "learning_rate": 9.615642299503195e-05, + "loss": 0.10296415090560913, + "step": 27090 + }, + { + "epoch": 3.8466997870830375, + "grad_norm": 4.153848171234131, + "learning_rate": 9.615500354861604e-05, + "loss": 0.10323355197906495, + "step": 27100 + }, + { + "epoch": 3.8481192334989354, + "grad_norm": 3.4101874828338623, + "learning_rate": 9.615358410220015e-05, + "loss": 
0.05214914083480835, + "step": 27110 + }, + { + "epoch": 3.8495386799148332, + "grad_norm": 2.5987448692321777, + "learning_rate": 9.615216465578424e-05, + "loss": 0.06731126308441163, + "step": 27120 + }, + { + "epoch": 3.850958126330731, + "grad_norm": 0.9972496628761292, + "learning_rate": 9.615074520936835e-05, + "loss": 0.09721655249595643, + "step": 27130 + }, + { + "epoch": 3.852377572746629, + "grad_norm": 2.8008410930633545, + "learning_rate": 9.614932576295245e-05, + "loss": 0.09973838925361633, + "step": 27140 + }, + { + "epoch": 3.853797019162527, + "grad_norm": 6.48881721496582, + "learning_rate": 9.614790631653655e-05, + "loss": 0.09506261944770814, + "step": 27150 + }, + { + "epoch": 3.8552164655784242, + "grad_norm": 4.826530933380127, + "learning_rate": 9.614648687012066e-05, + "loss": 0.07496817111968994, + "step": 27160 + }, + { + "epoch": 3.856635911994322, + "grad_norm": 1.649975061416626, + "learning_rate": 9.614506742370476e-05, + "loss": 0.07133917212486267, + "step": 27170 + }, + { + "epoch": 3.85805535841022, + "grad_norm": 5.264161586761475, + "learning_rate": 9.614364797728887e-05, + "loss": 0.0537201464176178, + "step": 27180 + }, + { + "epoch": 3.859474804826118, + "grad_norm": 4.296278953552246, + "learning_rate": 9.614222853087297e-05, + "loss": 0.08852277994155884, + "step": 27190 + }, + { + "epoch": 3.8608942512420157, + "grad_norm": 3.727269172668457, + "learning_rate": 9.614080908445706e-05, + "loss": 0.12326380014419555, + "step": 27200 + }, + { + "epoch": 3.8623136976579135, + "grad_norm": 1.0287762880325317, + "learning_rate": 9.613938963804116e-05, + "loss": 0.08480355739593506, + "step": 27210 + }, + { + "epoch": 3.8637331440738114, + "grad_norm": 2.53420090675354, + "learning_rate": 9.613797019162527e-05, + "loss": 0.08443946838378906, + "step": 27220 + }, + { + "epoch": 3.865152590489709, + "grad_norm": 2.7054736614227295, + "learning_rate": 9.613655074520937e-05, + "loss": 0.12175383567810058, + "step": 27230 + }, + { + 
"epoch": 3.8665720369056067, + "grad_norm": 0.3844822943210602, + "learning_rate": 9.613513129879348e-05, + "loss": 0.05505117177963257, + "step": 27240 + }, + { + "epoch": 3.8679914833215046, + "grad_norm": 1.9096373319625854, + "learning_rate": 9.613371185237758e-05, + "loss": 0.08993933200836182, + "step": 27250 + }, + { + "epoch": 3.8694109297374024, + "grad_norm": 9.668591499328613, + "learning_rate": 9.613229240596167e-05, + "loss": 0.09555359482765198, + "step": 27260 + }, + { + "epoch": 3.8708303761533003, + "grad_norm": 6.2174787521362305, + "learning_rate": 9.613087295954579e-05, + "loss": 0.07780954241752625, + "step": 27270 + }, + { + "epoch": 3.872249822569198, + "grad_norm": 6.8190741539001465, + "learning_rate": 9.612945351312988e-05, + "loss": 0.06228452920913696, + "step": 27280 + }, + { + "epoch": 3.873669268985096, + "grad_norm": 5.052826881408691, + "learning_rate": 9.6128034066714e-05, + "loss": 0.05573546886444092, + "step": 27290 + }, + { + "epoch": 3.8750887154009934, + "grad_norm": 2.147706985473633, + "learning_rate": 9.612661462029808e-05, + "loss": 0.030921798944473267, + "step": 27300 + }, + { + "epoch": 3.8765081618168913, + "grad_norm": 1.844710350036621, + "learning_rate": 9.612519517388219e-05, + "loss": 0.02291310876607895, + "step": 27310 + }, + { + "epoch": 3.877927608232789, + "grad_norm": 4.277228355407715, + "learning_rate": 9.612377572746629e-05, + "loss": 0.09191072583198548, + "step": 27320 + }, + { + "epoch": 3.879347054648687, + "grad_norm": 0.5011488199234009, + "learning_rate": 9.61223562810504e-05, + "loss": 0.09018791913986206, + "step": 27330 + }, + { + "epoch": 3.880766501064585, + "grad_norm": 7.728168964385986, + "learning_rate": 9.61209368346345e-05, + "loss": 0.17614935636520385, + "step": 27340 + }, + { + "epoch": 3.8821859474804827, + "grad_norm": 7.640387058258057, + "learning_rate": 9.611951738821859e-05, + "loss": 0.053128105401992795, + "step": 27350 + }, + { + "epoch": 3.8836053938963806, + "grad_norm": 
4.586857795715332, + "learning_rate": 9.61180979418027e-05, + "loss": 0.07566349506378174, + "step": 27360 + }, + { + "epoch": 3.885024840312278, + "grad_norm": 9.557601928710938, + "learning_rate": 9.61166784953868e-05, + "loss": 0.16053431034088134, + "step": 27370 + }, + { + "epoch": 3.886444286728176, + "grad_norm": 2.9451968669891357, + "learning_rate": 9.61154009936125e-05, + "loss": 0.10314490795135497, + "step": 27380 + }, + { + "epoch": 3.8878637331440737, + "grad_norm": 2.856229305267334, + "learning_rate": 9.61139815471966e-05, + "loss": 0.03152236342430115, + "step": 27390 + }, + { + "epoch": 3.8892831795599716, + "grad_norm": 10.189471244812012, + "learning_rate": 9.61125621007807e-05, + "loss": 0.07608083486557007, + "step": 27400 + }, + { + "epoch": 3.8907026259758695, + "grad_norm": 4.597877025604248, + "learning_rate": 9.61111426543648e-05, + "loss": 0.09993425607681275, + "step": 27410 + }, + { + "epoch": 3.8921220723917673, + "grad_norm": 4.780374050140381, + "learning_rate": 9.610972320794891e-05, + "loss": 0.12308070659637452, + "step": 27420 + }, + { + "epoch": 3.893541518807665, + "grad_norm": 3.518791675567627, + "learning_rate": 9.6108303761533e-05, + "loss": 0.06599195003509521, + "step": 27430 + }, + { + "epoch": 3.8949609652235626, + "grad_norm": 0.6286087036132812, + "learning_rate": 9.610688431511711e-05, + "loss": 0.0612760066986084, + "step": 27440 + }, + { + "epoch": 3.8963804116394605, + "grad_norm": 4.189844608306885, + "learning_rate": 9.610546486870121e-05, + "loss": 0.09225243330001831, + "step": 27450 + }, + { + "epoch": 3.8977998580553583, + "grad_norm": 0.4884074032306671, + "learning_rate": 9.610404542228532e-05, + "loss": 0.09044739603996277, + "step": 27460 + }, + { + "epoch": 3.899219304471256, + "grad_norm": 4.86144495010376, + "learning_rate": 9.610262597586942e-05, + "loss": 0.09649211764335633, + "step": 27470 + }, + { + "epoch": 3.900638750887154, + "grad_norm": 0.6673234701156616, + "learning_rate": 
9.610120652945351e-05, + "loss": 0.06789471507072449, + "step": 27480 + }, + { + "epoch": 3.902058197303052, + "grad_norm": 5.458202362060547, + "learning_rate": 9.609978708303762e-05, + "loss": 0.06328256726264954, + "step": 27490 + }, + { + "epoch": 3.90347764371895, + "grad_norm": 3.2472307682037354, + "learning_rate": 9.609836763662172e-05, + "loss": 0.03994499444961548, + "step": 27500 + }, + { + "epoch": 3.90347764371895, + "eval_accuracy": 0.9700515037833026, + "eval_loss": 0.08563879877328873, + "eval_runtime": 35.5026, + "eval_samples_per_second": 442.982, + "eval_steps_per_second": 13.858, + "step": 27500 + }, + { + "epoch": 3.904897090134847, + "grad_norm": 6.66304874420166, + "learning_rate": 9.609694819020583e-05, + "loss": 0.10047676563262939, + "step": 27510 + }, + { + "epoch": 3.906316536550745, + "grad_norm": 4.747247219085693, + "learning_rate": 9.609552874378993e-05, + "loss": 0.0649915337562561, + "step": 27520 + }, + { + "epoch": 3.907735982966643, + "grad_norm": 6.686343669891357, + "learning_rate": 9.609410929737403e-05, + "loss": 0.07071633338928222, + "step": 27530 + }, + { + "epoch": 3.909155429382541, + "grad_norm": 6.8346266746521, + "learning_rate": 9.609268985095812e-05, + "loss": 0.08757308721542359, + "step": 27540 + }, + { + "epoch": 3.9105748757984387, + "grad_norm": 4.510685443878174, + "learning_rate": 9.609127040454223e-05, + "loss": 0.057412338256835935, + "step": 27550 + }, + { + "epoch": 3.9119943222143365, + "grad_norm": 3.572941541671753, + "learning_rate": 9.608985095812633e-05, + "loss": 0.11244267225265503, + "step": 27560 + }, + { + "epoch": 3.9134137686302344, + "grad_norm": 1.7603306770324707, + "learning_rate": 9.608843151171044e-05, + "loss": 0.08775643706321716, + "step": 27570 + }, + { + "epoch": 3.914833215046132, + "grad_norm": 1.7416644096374512, + "learning_rate": 9.608701206529454e-05, + "loss": 0.04108691513538361, + "step": 27580 + }, + { + "epoch": 3.9162526614620297, + "grad_norm": 11.181812286376953, + 
"learning_rate": 9.608559261887864e-05, + "loss": 0.09978156089782715, + "step": 27590 + }, + { + "epoch": 3.9176721078779275, + "grad_norm": 13.193679809570312, + "learning_rate": 9.608417317246275e-05, + "loss": 0.0903174638748169, + "step": 27600 + }, + { + "epoch": 3.9190915542938254, + "grad_norm": 4.218955993652344, + "learning_rate": 9.608275372604685e-05, + "loss": 0.0756367027759552, + "step": 27610 + }, + { + "epoch": 3.9205110007097232, + "grad_norm": 3.685628652572632, + "learning_rate": 9.608133427963096e-05, + "loss": 0.15287163257598876, + "step": 27620 + }, + { + "epoch": 3.921930447125621, + "grad_norm": 3.4429805278778076, + "learning_rate": 9.607991483321504e-05, + "loss": 0.06430520415306092, + "step": 27630 + }, + { + "epoch": 3.923349893541519, + "grad_norm": 2.0270931720733643, + "learning_rate": 9.607849538679915e-05, + "loss": 0.10801482200622559, + "step": 27640 + }, + { + "epoch": 3.9247693399574164, + "grad_norm": 6.879272937774658, + "learning_rate": 9.607707594038325e-05, + "loss": 0.06597111821174621, + "step": 27650 + }, + { + "epoch": 3.9261887863733143, + "grad_norm": 1.249678611755371, + "learning_rate": 9.607565649396736e-05, + "loss": 0.03797664046287537, + "step": 27660 + }, + { + "epoch": 3.927608232789212, + "grad_norm": 0.24534237384796143, + "learning_rate": 9.607423704755146e-05, + "loss": 0.02991785407066345, + "step": 27670 + }, + { + "epoch": 3.92902767920511, + "grad_norm": 0.6622792482376099, + "learning_rate": 9.607281760113555e-05, + "loss": 0.02254675328731537, + "step": 27680 + }, + { + "epoch": 3.930447125621008, + "grad_norm": 0.3865921199321747, + "learning_rate": 9.607139815471967e-05, + "loss": 0.05718799233436585, + "step": 27690 + }, + { + "epoch": 3.9318665720369057, + "grad_norm": 4.140193462371826, + "learning_rate": 9.606997870830376e-05, + "loss": 0.08032118082046509, + "step": 27700 + }, + { + "epoch": 3.9332860184528036, + "grad_norm": 4.738871097564697, + "learning_rate": 9.606855926188787e-05, + 
"loss": 0.04987369179725647, + "step": 27710 + }, + { + "epoch": 3.934705464868701, + "grad_norm": 9.0142183303833, + "learning_rate": 9.606713981547197e-05, + "loss": 0.08996903896331787, + "step": 27720 + }, + { + "epoch": 3.936124911284599, + "grad_norm": 0.1371709704399109, + "learning_rate": 9.606572036905608e-05, + "loss": 0.05264982581138611, + "step": 27730 + }, + { + "epoch": 3.9375443577004967, + "grad_norm": 4.215709209442139, + "learning_rate": 9.606430092264017e-05, + "loss": 0.09554726481437684, + "step": 27740 + }, + { + "epoch": 3.9389638041163946, + "grad_norm": 1.1737656593322754, + "learning_rate": 9.606288147622428e-05, + "loss": 0.06068928241729736, + "step": 27750 + }, + { + "epoch": 3.9403832505322924, + "grad_norm": 1.0329593420028687, + "learning_rate": 9.606146202980837e-05, + "loss": 0.0832507848739624, + "step": 27760 + }, + { + "epoch": 3.9418026969481903, + "grad_norm": 1.4225534200668335, + "learning_rate": 9.606004258339249e-05, + "loss": 0.10741715431213379, + "step": 27770 + }, + { + "epoch": 3.943222143364088, + "grad_norm": 2.706094741821289, + "learning_rate": 9.605862313697658e-05, + "loss": 0.09146757125854492, + "step": 27780 + }, + { + "epoch": 3.9446415897799856, + "grad_norm": 7.178990364074707, + "learning_rate": 9.605720369056068e-05, + "loss": 0.09984794855117798, + "step": 27790 + }, + { + "epoch": 3.9460610361958834, + "grad_norm": 3.8521437644958496, + "learning_rate": 9.605578424414479e-05, + "loss": 0.1205756664276123, + "step": 27800 + }, + { + "epoch": 3.9474804826117813, + "grad_norm": 5.163641929626465, + "learning_rate": 9.605436479772889e-05, + "loss": 0.029975688457489012, + "step": 27810 + }, + { + "epoch": 3.948899929027679, + "grad_norm": 2.2490506172180176, + "learning_rate": 9.605308729595457e-05, + "loss": 0.1587289094924927, + "step": 27820 + }, + { + "epoch": 3.950319375443577, + "grad_norm": 6.972119331359863, + "learning_rate": 9.605166784953868e-05, + "loss": 0.09115952849388123, + "step": 27830 + 
}, + { + "epoch": 3.951738821859475, + "grad_norm": 0.4664672017097473, + "learning_rate": 9.60502484031228e-05, + "loss": 0.07655965089797974, + "step": 27840 + }, + { + "epoch": 3.9531582682753728, + "grad_norm": 4.678897380828857, + "learning_rate": 9.604882895670689e-05, + "loss": 0.052338707447052005, + "step": 27850 + }, + { + "epoch": 3.95457771469127, + "grad_norm": 4.429093837738037, + "learning_rate": 9.604740951029099e-05, + "loss": 0.054144054651260376, + "step": 27860 + }, + { + "epoch": 3.9559971611071685, + "grad_norm": 2.022493600845337, + "learning_rate": 9.604599006387509e-05, + "loss": 0.012421280145645142, + "step": 27870 + }, + { + "epoch": 3.957416607523066, + "grad_norm": 8.385174751281738, + "learning_rate": 9.60445706174592e-05, + "loss": 0.0690504789352417, + "step": 27880 + }, + { + "epoch": 3.9588360539389638, + "grad_norm": 1.3558812141418457, + "learning_rate": 9.60431511710433e-05, + "loss": 0.051670241355896, + "step": 27890 + }, + { + "epoch": 3.9602555003548616, + "grad_norm": 7.869537830352783, + "learning_rate": 9.60417317246274e-05, + "loss": 0.14572601318359374, + "step": 27900 + }, + { + "epoch": 3.9616749467707595, + "grad_norm": 3.734588623046875, + "learning_rate": 9.604031227821149e-05, + "loss": 0.10068864822387695, + "step": 27910 + }, + { + "epoch": 3.9630943931866573, + "grad_norm": 6.552592754364014, + "learning_rate": 9.60388928317956e-05, + "loss": 0.06942902803421021, + "step": 27920 + }, + { + "epoch": 3.9645138396025548, + "grad_norm": 1.7775979042053223, + "learning_rate": 9.603747338537971e-05, + "loss": 0.07231731414794922, + "step": 27930 + }, + { + "epoch": 3.965933286018453, + "grad_norm": 8.738762855529785, + "learning_rate": 9.603605393896381e-05, + "loss": 0.07564049959182739, + "step": 27940 + }, + { + "epoch": 3.9673527324343505, + "grad_norm": 3.3592703342437744, + "learning_rate": 9.603463449254792e-05, + "loss": 0.09379636645317077, + "step": 27950 + }, + { + "epoch": 3.9687721788502484, + 
"grad_norm": 7.767439842224121, + "learning_rate": 9.6033215046132e-05, + "loss": 0.09403069615364075, + "step": 27960 + }, + { + "epoch": 3.970191625266146, + "grad_norm": 11.272348403930664, + "learning_rate": 9.603179559971612e-05, + "loss": 0.18936721086502076, + "step": 27970 + }, + { + "epoch": 3.971611071682044, + "grad_norm": 5.963432312011719, + "learning_rate": 9.603037615330021e-05, + "loss": 0.12643344402313234, + "step": 27980 + }, + { + "epoch": 3.973030518097942, + "grad_norm": 6.23541784286499, + "learning_rate": 9.602895670688432e-05, + "loss": 0.13420791625976564, + "step": 27990 + }, + { + "epoch": 3.9744499645138394, + "grad_norm": 6.839860439300537, + "learning_rate": 9.602753726046842e-05, + "loss": 0.07564538717269897, + "step": 28000 + }, + { + "epoch": 3.9744499645138394, + "eval_accuracy": 0.9512939530743307, + "eval_loss": 0.14611348509788513, + "eval_runtime": 35.5209, + "eval_samples_per_second": 442.754, + "eval_steps_per_second": 13.851, + "step": 28000 + }, + { + "epoch": 3.9758694109297377, + "grad_norm": 5.825727939605713, + "learning_rate": 9.602611781405252e-05, + "loss": 0.1116330862045288, + "step": 28010 + }, + { + "epoch": 3.977288857345635, + "grad_norm": 2.08404541015625, + "learning_rate": 9.602469836763663e-05, + "loss": 0.046886110305786134, + "step": 28020 + }, + { + "epoch": 3.978708303761533, + "grad_norm": 16.22827911376953, + "learning_rate": 9.602327892122073e-05, + "loss": 0.0687228798866272, + "step": 28030 + }, + { + "epoch": 3.980127750177431, + "grad_norm": 4.588597774505615, + "learning_rate": 9.602185947480484e-05, + "loss": 0.09896028637886048, + "step": 28040 + }, + { + "epoch": 3.9815471965933287, + "grad_norm": 0.9230074882507324, + "learning_rate": 9.602044002838894e-05, + "loss": 0.07626842856407165, + "step": 28050 + }, + { + "epoch": 3.9829666430092265, + "grad_norm": 5.006918907165527, + "learning_rate": 9.601902058197305e-05, + "loss": 0.06256378293037415, + "step": 28060 + }, + { + "epoch": 
3.984386089425124, + "grad_norm": 0.970280647277832, + "learning_rate": 9.601760113555713e-05, + "loss": 0.05959609746932983, + "step": 28070 + }, + { + "epoch": 3.9858055358410223, + "grad_norm": 1.083361268043518, + "learning_rate": 9.601618168914124e-05, + "loss": 0.10017790794372558, + "step": 28080 + }, + { + "epoch": 3.9872249822569197, + "grad_norm": 1.0272605419158936, + "learning_rate": 9.601476224272534e-05, + "loss": 0.10394766330718994, + "step": 28090 + }, + { + "epoch": 3.9886444286728175, + "grad_norm": 2.8747732639312744, + "learning_rate": 9.601334279630945e-05, + "loss": 0.08298658728599548, + "step": 28100 + }, + { + "epoch": 3.9900638750887154, + "grad_norm": 1.3981420993804932, + "learning_rate": 9.601192334989355e-05, + "loss": 0.09550594687461852, + "step": 28110 + }, + { + "epoch": 3.9914833215046133, + "grad_norm": 2.6726458072662354, + "learning_rate": 9.601050390347764e-05, + "loss": 0.06938835978507996, + "step": 28120 + }, + { + "epoch": 3.992902767920511, + "grad_norm": 8.076497077941895, + "learning_rate": 9.600908445706175e-05, + "loss": 0.12478160858154297, + "step": 28130 + }, + { + "epoch": 3.9943222143364085, + "grad_norm": 4.511760234832764, + "learning_rate": 9.600766501064585e-05, + "loss": 0.040727350115776065, + "step": 28140 + }, + { + "epoch": 3.995741660752307, + "grad_norm": 5.094786167144775, + "learning_rate": 9.600624556422996e-05, + "loss": 0.046387788653373715, + "step": 28150 + }, + { + "epoch": 3.9971611071682043, + "grad_norm": 1.6641865968704224, + "learning_rate": 9.600482611781406e-05, + "loss": 0.09861783981323242, + "step": 28160 + }, + { + "epoch": 3.998580553584102, + "grad_norm": 1.3645977973937988, + "learning_rate": 9.600340667139816e-05, + "loss": 0.046815165877342226, + "step": 28170 + }, + { + "epoch": 4.0, + "grad_norm": 3.4103214740753174, + "learning_rate": 9.600198722498226e-05, + "loss": 0.07011319398880005, + "step": 28180 + }, + { + "epoch": 4.001419446415897, + "grad_norm": 4.563676834106445, 
+ "learning_rate": 9.600056777856637e-05, + "loss": 0.08069857358932495, + "step": 28190 + }, + { + "epoch": 4.002838892831796, + "grad_norm": 7.2024407386779785, + "learning_rate": 9.599914833215046e-05, + "loss": 0.07496459484100342, + "step": 28200 + }, + { + "epoch": 4.004258339247693, + "grad_norm": 6.962199687957764, + "learning_rate": 9.599772888573457e-05, + "loss": 0.04304870367050171, + "step": 28210 + }, + { + "epoch": 4.0056777856635915, + "grad_norm": 3.1831023693084717, + "learning_rate": 9.599630943931867e-05, + "loss": 0.03946310579776764, + "step": 28220 + }, + { + "epoch": 4.007097232079489, + "grad_norm": 6.079113006591797, + "learning_rate": 9.599488999290277e-05, + "loss": 0.11042921543121338, + "step": 28230 + }, + { + "epoch": 4.008516678495387, + "grad_norm": 4.9173688888549805, + "learning_rate": 9.599347054648688e-05, + "loss": 0.09752132892608642, + "step": 28240 + }, + { + "epoch": 4.009936124911285, + "grad_norm": 7.992283821105957, + "learning_rate": 9.599205110007098e-05, + "loss": 0.08114546537399292, + "step": 28250 + }, + { + "epoch": 4.011355571327182, + "grad_norm": 10.832806587219238, + "learning_rate": 9.599063165365509e-05, + "loss": 0.0714793860912323, + "step": 28260 + }, + { + "epoch": 4.01277501774308, + "grad_norm": 8.594995498657227, + "learning_rate": 9.598921220723917e-05, + "loss": 0.1365175724029541, + "step": 28270 + }, + { + "epoch": 4.014194464158978, + "grad_norm": 3.8112146854400635, + "learning_rate": 9.598779276082328e-05, + "loss": 0.09683020114898681, + "step": 28280 + }, + { + "epoch": 4.015613910574876, + "grad_norm": 3.5960965156555176, + "learning_rate": 9.598637331440738e-05, + "loss": 0.07476208806037903, + "step": 28290 + }, + { + "epoch": 4.0170333569907735, + "grad_norm": 7.426855564117432, + "learning_rate": 9.598495386799149e-05, + "loss": 0.04781417846679688, + "step": 28300 + }, + { + "epoch": 4.018452803406672, + "grad_norm": 3.124751091003418, + "learning_rate": 9.598353442157559e-05, + 
"loss": 0.020880359411239623, + "step": 28310 + }, + { + "epoch": 4.019872249822569, + "grad_norm": 0.4350399672985077, + "learning_rate": 9.598211497515969e-05, + "loss": 0.031051820516586302, + "step": 28320 + }, + { + "epoch": 4.021291696238467, + "grad_norm": 4.507806301116943, + "learning_rate": 9.59806955287438e-05, + "loss": 0.07177828550338745, + "step": 28330 + }, + { + "epoch": 4.022711142654365, + "grad_norm": 3.546592950820923, + "learning_rate": 9.59792760823279e-05, + "loss": 0.07378742098808289, + "step": 28340 + }, + { + "epoch": 4.024130589070262, + "grad_norm": 0.4405544102191925, + "learning_rate": 9.5977856635912e-05, + "loss": 0.056821930408477786, + "step": 28350 + }, + { + "epoch": 4.025550035486161, + "grad_norm": 5.474564075469971, + "learning_rate": 9.59764371894961e-05, + "loss": 0.07283146381378174, + "step": 28360 + }, + { + "epoch": 4.026969481902058, + "grad_norm": 0.8094128966331482, + "learning_rate": 9.59750177430802e-05, + "loss": 0.05205709934234619, + "step": 28370 + }, + { + "epoch": 4.028388928317956, + "grad_norm": 1.3830199241638184, + "learning_rate": 9.59735982966643e-05, + "loss": 0.035252746939659116, + "step": 28380 + }, + { + "epoch": 4.029808374733854, + "grad_norm": 0.34760722517967224, + "learning_rate": 9.597217885024841e-05, + "loss": 0.06989213228225707, + "step": 28390 + }, + { + "epoch": 4.031227821149751, + "grad_norm": 2.2948784828186035, + "learning_rate": 9.59707594038325e-05, + "loss": 0.09394473433494568, + "step": 28400 + }, + { + "epoch": 4.0326472675656495, + "grad_norm": 0.513525664806366, + "learning_rate": 9.596933995741662e-05, + "loss": 0.07552390694618225, + "step": 28410 + }, + { + "epoch": 4.034066713981547, + "grad_norm": 10.444016456604004, + "learning_rate": 9.596792051100071e-05, + "loss": 0.09061986804008484, + "step": 28420 + }, + { + "epoch": 4.035486160397445, + "grad_norm": 1.8548258543014526, + "learning_rate": 9.596650106458481e-05, + "loss": 0.05329904556274414, + "step": 28430 + }, 
+ { + "epoch": 4.036905606813343, + "grad_norm": 6.657663345336914, + "learning_rate": 9.596508161816892e-05, + "loss": 0.09772533774375916, + "step": 28440 + }, + { + "epoch": 4.038325053229241, + "grad_norm": 0.976019561290741, + "learning_rate": 9.596366217175302e-05, + "loss": 0.06526297330856323, + "step": 28450 + }, + { + "epoch": 4.039744499645138, + "grad_norm": 0.7490736246109009, + "learning_rate": 9.596224272533713e-05, + "loss": 0.057075291872024536, + "step": 28460 + }, + { + "epoch": 4.041163946061036, + "grad_norm": 0.49151191115379333, + "learning_rate": 9.596082327892123e-05, + "loss": 0.03589344024658203, + "step": 28470 + }, + { + "epoch": 4.042583392476934, + "grad_norm": 1.0588443279266357, + "learning_rate": 9.595940383250533e-05, + "loss": 0.06447117924690246, + "step": 28480 + }, + { + "epoch": 4.0440028388928315, + "grad_norm": 0.5945330262184143, + "learning_rate": 9.595798438608942e-05, + "loss": 0.031689152121543884, + "step": 28490 + }, + { + "epoch": 4.04542228530873, + "grad_norm": 2.2356131076812744, + "learning_rate": 9.595656493967353e-05, + "loss": 0.042199242115020755, + "step": 28500 + }, + { + "epoch": 4.04542228530873, + "eval_accuracy": 0.9701150887009601, + "eval_loss": 0.09118034690618515, + "eval_runtime": 34.2314, + "eval_samples_per_second": 459.432, + "eval_steps_per_second": 14.373, + "step": 28500 + }, + { + "epoch": 4.046841731724627, + "grad_norm": 7.246707439422607, + "learning_rate": 9.595514549325763e-05, + "loss": 0.07293486595153809, + "step": 28510 + }, + { + "epoch": 4.0482611781405256, + "grad_norm": 7.857562065124512, + "learning_rate": 9.595372604684174e-05, + "loss": 0.07997156977653504, + "step": 28520 + }, + { + "epoch": 4.049680624556423, + "grad_norm": 6.5758795738220215, + "learning_rate": 9.595230660042584e-05, + "loss": 0.03434431552886963, + "step": 28530 + }, + { + "epoch": 4.05110007097232, + "grad_norm": 4.938292980194092, + "learning_rate": 9.595088715400994e-05, + "loss": 0.03129143714904785, 
+ "step": 28540 + }, + { + "epoch": 4.052519517388219, + "grad_norm": 4.353482246398926, + "learning_rate": 9.594946770759405e-05, + "loss": 0.05996691584587097, + "step": 28550 + }, + { + "epoch": 4.053938963804116, + "grad_norm": 2.334413528442383, + "learning_rate": 9.594804826117815e-05, + "loss": 0.08024400472640991, + "step": 28560 + }, + { + "epoch": 4.055358410220014, + "grad_norm": 8.563901901245117, + "learning_rate": 9.594662881476226e-05, + "loss": 0.10906833410263062, + "step": 28570 + }, + { + "epoch": 4.056777856635912, + "grad_norm": 6.423247814178467, + "learning_rate": 9.594520936834634e-05, + "loss": 0.058406251668930056, + "step": 28580 + }, + { + "epoch": 4.05819730305181, + "grad_norm": 7.802096843719482, + "learning_rate": 9.594378992193045e-05, + "loss": 0.07235055565834045, + "step": 28590 + }, + { + "epoch": 4.059616749467708, + "grad_norm": 2.2815730571746826, + "learning_rate": 9.594237047551455e-05, + "loss": 0.09480493068695069, + "step": 28600 + }, + { + "epoch": 4.061036195883605, + "grad_norm": 0.11063522100448608, + "learning_rate": 9.594095102909866e-05, + "loss": 0.028444766998291016, + "step": 28610 + }, + { + "epoch": 4.062455642299503, + "grad_norm": 4.243860244750977, + "learning_rate": 9.593953158268276e-05, + "loss": 0.07847819924354553, + "step": 28620 + }, + { + "epoch": 4.063875088715401, + "grad_norm": 0.3890465795993805, + "learning_rate": 9.593811213626685e-05, + "loss": 0.09325323104858399, + "step": 28630 + }, + { + "epoch": 4.065294535131299, + "grad_norm": 6.783263206481934, + "learning_rate": 9.593669268985096e-05, + "loss": 0.052077090740203856, + "step": 28640 + }, + { + "epoch": 4.066713981547196, + "grad_norm": 0.21521657705307007, + "learning_rate": 9.593527324343506e-05, + "loss": 0.03206589519977569, + "step": 28650 + }, + { + "epoch": 4.068133427963095, + "grad_norm": 2.6312716007232666, + "learning_rate": 9.593385379701917e-05, + "loss": 0.049369195103645326, + "step": 28660 + }, + { + "epoch": 
4.069552874378992, + "grad_norm": 3.080799102783203, + "learning_rate": 9.593243435060327e-05, + "loss": 0.0701833188533783, + "step": 28670 + }, + { + "epoch": 4.07097232079489, + "grad_norm": 3.443047285079956, + "learning_rate": 9.593101490418737e-05, + "loss": 0.06615790724754333, + "step": 28680 + }, + { + "epoch": 4.072391767210788, + "grad_norm": 0.24704548716545105, + "learning_rate": 9.592959545777147e-05, + "loss": 0.08545000553131103, + "step": 28690 + }, + { + "epoch": 4.073811213626685, + "grad_norm": 6.151279449462891, + "learning_rate": 9.592817601135558e-05, + "loss": 0.034337058663368225, + "step": 28700 + }, + { + "epoch": 4.075230660042584, + "grad_norm": 3.560441017150879, + "learning_rate": 9.592675656493967e-05, + "loss": 0.051172637939453126, + "step": 28710 + }, + { + "epoch": 4.076650106458481, + "grad_norm": 2.4762368202209473, + "learning_rate": 9.592533711852378e-05, + "loss": 0.07133134603500366, + "step": 28720 + }, + { + "epoch": 4.078069552874379, + "grad_norm": 6.441526889801025, + "learning_rate": 9.592391767210788e-05, + "loss": 0.07345221638679504, + "step": 28730 + }, + { + "epoch": 4.079488999290277, + "grad_norm": 2.405494213104248, + "learning_rate": 9.592249822569198e-05, + "loss": 0.048948812484741214, + "step": 28740 + }, + { + "epoch": 4.080908445706174, + "grad_norm": 0.5363373160362244, + "learning_rate": 9.592107877927609e-05, + "loss": 0.07454119324684143, + "step": 28750 + }, + { + "epoch": 4.0823278921220725, + "grad_norm": 0.1072138175368309, + "learning_rate": 9.591965933286019e-05, + "loss": 0.05140770673751831, + "step": 28760 + }, + { + "epoch": 4.08374733853797, + "grad_norm": 0.17401359975337982, + "learning_rate": 9.59182398864443e-05, + "loss": 0.08340293169021606, + "step": 28770 + }, + { + "epoch": 4.085166784953868, + "grad_norm": 3.663013458251953, + "learning_rate": 9.59168204400284e-05, + "loss": 0.06304811835289001, + "step": 28780 + }, + { + "epoch": 4.086586231369766, + "grad_norm": 
6.273396968841553, + "learning_rate": 9.59154009936125e-05, + "loss": 0.12202763557434082, + "step": 28790 + }, + { + "epoch": 4.088005677785664, + "grad_norm": 9.233271598815918, + "learning_rate": 9.591398154719659e-05, + "loss": 0.13653677701950073, + "step": 28800 + }, + { + "epoch": 4.089425124201561, + "grad_norm": 1.3640882968902588, + "learning_rate": 9.59125621007807e-05, + "loss": 0.06013572812080383, + "step": 28810 + }, + { + "epoch": 4.090844570617459, + "grad_norm": 10.00602912902832, + "learning_rate": 9.59111426543648e-05, + "loss": 0.10673294067382813, + "step": 28820 + }, + { + "epoch": 4.092264017033357, + "grad_norm": 4.630044460296631, + "learning_rate": 9.590972320794891e-05, + "loss": 0.13828219175338746, + "step": 28830 + }, + { + "epoch": 4.0936834634492545, + "grad_norm": 5.386570930480957, + "learning_rate": 9.590830376153301e-05, + "loss": 0.05448143482208252, + "step": 28840 + }, + { + "epoch": 4.095102909865153, + "grad_norm": 1.139168620109558, + "learning_rate": 9.59068843151171e-05, + "loss": 0.04355934262275696, + "step": 28850 + }, + { + "epoch": 4.09652235628105, + "grad_norm": 3.865908145904541, + "learning_rate": 9.590546486870122e-05, + "loss": 0.025386843085289, + "step": 28860 + }, + { + "epoch": 4.0979418026969485, + "grad_norm": 0.1300564855337143, + "learning_rate": 9.590404542228531e-05, + "loss": 0.08789493441581726, + "step": 28870 + }, + { + "epoch": 4.099361249112846, + "grad_norm": 9.505627632141113, + "learning_rate": 9.590262597586942e-05, + "loss": 0.06938197016716004, + "step": 28880 + }, + { + "epoch": 4.100780695528743, + "grad_norm": 5.77639102935791, + "learning_rate": 9.590120652945351e-05, + "loss": 0.08774335384368896, + "step": 28890 + }, + { + "epoch": 4.102200141944642, + "grad_norm": 7.798095703125, + "learning_rate": 9.589978708303762e-05, + "loss": 0.049100473523139954, + "step": 28900 + }, + { + "epoch": 4.103619588360539, + "grad_norm": 4.503985404968262, + "learning_rate": 9.589836763662172e-05, 
+ "loss": 0.12068946361541748, + "step": 28910 + }, + { + "epoch": 4.105039034776437, + "grad_norm": 0.352927565574646, + "learning_rate": 9.589694819020583e-05, + "loss": 0.09867768883705139, + "step": 28920 + }, + { + "epoch": 4.106458481192335, + "grad_norm": 8.994272232055664, + "learning_rate": 9.589552874378992e-05, + "loss": 0.0767949104309082, + "step": 28930 + }, + { + "epoch": 4.107877927608233, + "grad_norm": 10.914958953857422, + "learning_rate": 9.589410929737402e-05, + "loss": 0.11344790458679199, + "step": 28940 + }, + { + "epoch": 4.1092973740241305, + "grad_norm": 4.673663139343262, + "learning_rate": 9.589268985095813e-05, + "loss": 0.09324707984924316, + "step": 28950 + }, + { + "epoch": 4.110716820440028, + "grad_norm": 0.63322913646698, + "learning_rate": 9.589127040454223e-05, + "loss": 0.09688594937324524, + "step": 28960 + }, + { + "epoch": 4.112136266855926, + "grad_norm": 1.9962878227233887, + "learning_rate": 9.588985095812634e-05, + "loss": 0.10838373899459838, + "step": 28970 + }, + { + "epoch": 4.113555713271824, + "grad_norm": 0.5679922699928284, + "learning_rate": 9.588843151171044e-05, + "loss": 0.04196344316005707, + "step": 28980 + }, + { + "epoch": 4.114975159687722, + "grad_norm": 1.6682865619659424, + "learning_rate": 9.588701206529454e-05, + "loss": 0.0966885507106781, + "step": 28990 + }, + { + "epoch": 4.116394606103619, + "grad_norm": 1.2570011615753174, + "learning_rate": 9.588559261887863e-05, + "loss": 0.0969314455986023, + "step": 29000 + }, + { + "epoch": 4.116394606103619, + "eval_accuracy": 0.9755198067018503, + "eval_loss": 0.06681427359580994, + "eval_runtime": 34.1546, + "eval_samples_per_second": 460.466, + "eval_steps_per_second": 14.405, + "step": 29000 + }, + { + "epoch": 4.117814052519518, + "grad_norm": 1.8310188055038452, + "learning_rate": 9.588417317246274e-05, + "loss": 0.03523149788379669, + "step": 29010 + }, + { + "epoch": 4.119233498935415, + "grad_norm": 2.215496778488159, + "learning_rate": 
9.588275372604684e-05, + "loss": 0.06970539093017578, + "step": 29020 + }, + { + "epoch": 4.120652945351313, + "grad_norm": 1.687964677810669, + "learning_rate": 9.588133427963095e-05, + "loss": 0.07161665558815003, + "step": 29030 + }, + { + "epoch": 4.122072391767211, + "grad_norm": 4.007593154907227, + "learning_rate": 9.587991483321505e-05, + "loss": 0.02978900372982025, + "step": 29040 + }, + { + "epoch": 4.123491838183108, + "grad_norm": 0.38011181354522705, + "learning_rate": 9.587849538679915e-05, + "loss": 0.06865530610084533, + "step": 29050 + }, + { + "epoch": 4.124911284599007, + "grad_norm": 3.137512683868408, + "learning_rate": 9.587707594038326e-05, + "loss": 0.03670695722103119, + "step": 29060 + }, + { + "epoch": 4.126330731014904, + "grad_norm": 7.8316874504089355, + "learning_rate": 9.587565649396736e-05, + "loss": 0.08162494897842407, + "step": 29070 + }, + { + "epoch": 4.127750177430802, + "grad_norm": 0.1684914380311966, + "learning_rate": 9.587423704755147e-05, + "loss": 0.07897425889968872, + "step": 29080 + }, + { + "epoch": 4.1291696238467, + "grad_norm": 2.3186583518981934, + "learning_rate": 9.587281760113556e-05, + "loss": 0.07271428108215332, + "step": 29090 + }, + { + "epoch": 4.130589070262598, + "grad_norm": 0.6711585521697998, + "learning_rate": 9.587139815471966e-05, + "loss": 0.12461971044540406, + "step": 29100 + }, + { + "epoch": 4.1320085166784954, + "grad_norm": 2.4304239749908447, + "learning_rate": 9.586997870830376e-05, + "loss": 0.0756750762462616, + "step": 29110 + }, + { + "epoch": 4.133427963094393, + "grad_norm": 3.95882248878479, + "learning_rate": 9.586855926188787e-05, + "loss": 0.1011937141418457, + "step": 29120 + }, + { + "epoch": 4.134847409510291, + "grad_norm": 1.7454277276992798, + "learning_rate": 9.586713981547198e-05, + "loss": 0.06007967591285705, + "step": 29130 + }, + { + "epoch": 4.136266855926189, + "grad_norm": 2.151155710220337, + "learning_rate": 9.586572036905608e-05, + "loss": 
0.07004793882369995, + "step": 29140 + }, + { + "epoch": 4.137686302342087, + "grad_norm": 3.816464900970459, + "learning_rate": 9.586430092264018e-05, + "loss": 0.11677614450454712, + "step": 29150 + }, + { + "epoch": 4.139105748757984, + "grad_norm": 1.8618497848510742, + "learning_rate": 9.586288147622427e-05, + "loss": 0.03786920607089996, + "step": 29160 + }, + { + "epoch": 4.140525195173883, + "grad_norm": 5.731049060821533, + "learning_rate": 9.586146202980838e-05, + "loss": 0.07805479168891907, + "step": 29170 + }, + { + "epoch": 4.14194464158978, + "grad_norm": 5.20023775100708, + "learning_rate": 9.586004258339248e-05, + "loss": 0.047267353534698485, + "step": 29180 + }, + { + "epoch": 4.1433640880056775, + "grad_norm": 2.555107355117798, + "learning_rate": 9.585862313697659e-05, + "loss": 0.052921724319458005, + "step": 29190 + }, + { + "epoch": 4.144783534421576, + "grad_norm": 6.4647650718688965, + "learning_rate": 9.585720369056068e-05, + "loss": 0.05531458258628845, + "step": 29200 + }, + { + "epoch": 4.146202980837473, + "grad_norm": 2.4424901008605957, + "learning_rate": 9.585578424414479e-05, + "loss": 0.06641347408294677, + "step": 29210 + }, + { + "epoch": 4.1476224272533715, + "grad_norm": 3.6472041606903076, + "learning_rate": 9.58543647977289e-05, + "loss": 0.0659880816936493, + "step": 29220 + }, + { + "epoch": 4.149041873669269, + "grad_norm": 7.028792381286621, + "learning_rate": 9.5852945351313e-05, + "loss": 0.07326681613922119, + "step": 29230 + }, + { + "epoch": 4.150461320085167, + "grad_norm": 5.249692916870117, + "learning_rate": 9.58515259048971e-05, + "loss": 0.0626168668270111, + "step": 29240 + }, + { + "epoch": 4.151880766501065, + "grad_norm": 1.918457269668579, + "learning_rate": 9.585010645848119e-05, + "loss": 0.0697676658630371, + "step": 29250 + }, + { + "epoch": 4.153300212916962, + "grad_norm": 3.7116684913635254, + "learning_rate": 9.58486870120653e-05, + "loss": 0.05905971527099609, + "step": 29260 + }, + { + "epoch": 
4.15471965933286, + "grad_norm": 2.3320891857147217, + "learning_rate": 9.58472675656494e-05, + "loss": 0.04564954340457916, + "step": 29270 + }, + { + "epoch": 4.156139105748758, + "grad_norm": 0.35267460346221924, + "learning_rate": 9.584584811923351e-05, + "loss": 0.07400223016738891, + "step": 29280 + }, + { + "epoch": 4.157558552164656, + "grad_norm": 0.4503205120563507, + "learning_rate": 9.58444286728176e-05, + "loss": 0.04973529279232025, + "step": 29290 + }, + { + "epoch": 4.1589779985805535, + "grad_norm": 1.6949446201324463, + "learning_rate": 9.58430092264017e-05, + "loss": 0.0649847149848938, + "step": 29300 + }, + { + "epoch": 4.160397444996452, + "grad_norm": 3.4858663082122803, + "learning_rate": 9.58415897799858e-05, + "loss": 0.045249027013778684, + "step": 29310 + }, + { + "epoch": 4.161816891412349, + "grad_norm": 1.861283302307129, + "learning_rate": 9.584017033356991e-05, + "loss": 0.04219701588153839, + "step": 29320 + }, + { + "epoch": 4.163236337828247, + "grad_norm": 1.1282949447631836, + "learning_rate": 9.583875088715402e-05, + "loss": 0.06556923389434814, + "step": 29330 + }, + { + "epoch": 4.164655784244145, + "grad_norm": 3.5685386657714844, + "learning_rate": 9.583733144073812e-05, + "loss": 0.07912471294403076, + "step": 29340 + }, + { + "epoch": 4.166075230660042, + "grad_norm": 0.43480032682418823, + "learning_rate": 9.583591199432222e-05, + "loss": 0.06162703037261963, + "step": 29350 + }, + { + "epoch": 4.167494677075941, + "grad_norm": 14.023776054382324, + "learning_rate": 9.583449254790631e-05, + "loss": 0.06618826985359191, + "step": 29360 + }, + { + "epoch": 4.168914123491838, + "grad_norm": 5.220623016357422, + "learning_rate": 9.583307310149043e-05, + "loss": 0.03890301287174225, + "step": 29370 + }, + { + "epoch": 4.170333569907736, + "grad_norm": 2.791459560394287, + "learning_rate": 9.583165365507452e-05, + "loss": 0.0788652777671814, + "step": 29380 + }, + { + "epoch": 4.171753016323634, + "grad_norm": 
6.876852035522461, + "learning_rate": 9.583023420865863e-05, + "loss": 0.05101839303970337, + "step": 29390 + }, + { + "epoch": 4.173172462739531, + "grad_norm": 3.1492698192596436, + "learning_rate": 9.582881476224272e-05, + "loss": 0.02205464094877243, + "step": 29400 + }, + { + "epoch": 4.1745919091554295, + "grad_norm": 1.2465417385101318, + "learning_rate": 9.582739531582683e-05, + "loss": 0.07565943002700806, + "step": 29410 + }, + { + "epoch": 4.176011355571327, + "grad_norm": 3.89546275138855, + "learning_rate": 9.582597586941094e-05, + "loss": 0.048539546132087705, + "step": 29420 + }, + { + "epoch": 4.177430801987225, + "grad_norm": 6.265598773956299, + "learning_rate": 9.582455642299504e-05, + "loss": 0.03970089554786682, + "step": 29430 + }, + { + "epoch": 4.178850248403123, + "grad_norm": 3.304530143737793, + "learning_rate": 9.582313697657915e-05, + "loss": 0.0281475305557251, + "step": 29440 + }, + { + "epoch": 4.180269694819021, + "grad_norm": 3.612251043319702, + "learning_rate": 9.582171753016325e-05, + "loss": 0.040489718317985535, + "step": 29450 + }, + { + "epoch": 4.181689141234918, + "grad_norm": 5.561727523803711, + "learning_rate": 9.582029808374734e-05, + "loss": 0.028734493255615234, + "step": 29460 + }, + { + "epoch": 4.183108587650816, + "grad_norm": 8.159090042114258, + "learning_rate": 9.581887863733144e-05, + "loss": 0.06577027440071107, + "step": 29470 + }, + { + "epoch": 4.184528034066714, + "grad_norm": 9.581611633300781, + "learning_rate": 9.581745919091555e-05, + "loss": 0.10467482805252075, + "step": 29480 + }, + { + "epoch": 4.185947480482612, + "grad_norm": 0.0830698311328888, + "learning_rate": 9.581603974449965e-05, + "loss": 0.06047796607017517, + "step": 29490 + }, + { + "epoch": 4.18736692689851, + "grad_norm": 10.371894836425781, + "learning_rate": 9.581462029808376e-05, + "loss": 0.08939828872680664, + "step": 29500 + }, + { + "epoch": 4.18736692689851, + "eval_accuracy": 0.9636294270998919, + "eval_loss": 
0.1139441579580307, + "eval_runtime": 32.7881, + "eval_samples_per_second": 479.656, + "eval_steps_per_second": 15.005, + "step": 29500 + }, + { + "epoch": 4.188786373314407, + "grad_norm": 3.3707029819488525, + "learning_rate": 9.581320085166786e-05, + "loss": 0.13373112678527832, + "step": 29510 + }, + { + "epoch": 4.190205819730306, + "grad_norm": 8.710700988769531, + "learning_rate": 9.581178140525195e-05, + "loss": 0.08344519138336182, + "step": 29520 + }, + { + "epoch": 4.191625266146203, + "grad_norm": 4.869105339050293, + "learning_rate": 9.581036195883607e-05, + "loss": 0.027658921480178834, + "step": 29530 + }, + { + "epoch": 4.1930447125621, + "grad_norm": 5.363608360290527, + "learning_rate": 9.580894251242016e-05, + "loss": 0.10114845037460327, + "step": 29540 + }, + { + "epoch": 4.194464158977999, + "grad_norm": 5.194397926330566, + "learning_rate": 9.580752306600427e-05, + "loss": 0.06530258655548096, + "step": 29550 + }, + { + "epoch": 4.195883605393896, + "grad_norm": 7.159872531890869, + "learning_rate": 9.580610361958836e-05, + "loss": 0.049175983667373656, + "step": 29560 + }, + { + "epoch": 4.1973030518097945, + "grad_norm": 1.903594970703125, + "learning_rate": 9.580468417317247e-05, + "loss": 0.04481973648071289, + "step": 29570 + }, + { + "epoch": 4.198722498225692, + "grad_norm": 0.6888973116874695, + "learning_rate": 9.580326472675657e-05, + "loss": 0.05650795102119446, + "step": 29580 + }, + { + "epoch": 4.20014194464159, + "grad_norm": 1.0285824537277222, + "learning_rate": 9.580184528034068e-05, + "loss": 0.05506778359413147, + "step": 29590 + }, + { + "epoch": 4.201561391057488, + "grad_norm": 0.3579859733581543, + "learning_rate": 9.580042583392477e-05, + "loss": 0.08136498928070068, + "step": 29600 + }, + { + "epoch": 4.202980837473385, + "grad_norm": 7.7916483879089355, + "learning_rate": 9.579900638750887e-05, + "loss": 0.11614023447036743, + "step": 29610 + }, + { + "epoch": 4.204400283889283, + "grad_norm": 2.5529000759124756, + 
"learning_rate": 9.579758694109298e-05, + "loss": 0.06429266929626465, + "step": 29620 + }, + { + "epoch": 4.205819730305181, + "grad_norm": 2.441297769546509, + "learning_rate": 9.579616749467708e-05, + "loss": 0.06408776044845581, + "step": 29630 + }, + { + "epoch": 4.207239176721079, + "grad_norm": 0.708437979221344, + "learning_rate": 9.579474804826119e-05, + "loss": 0.042554271221160886, + "step": 29640 + }, + { + "epoch": 4.2086586231369765, + "grad_norm": 2.543992757797241, + "learning_rate": 9.579332860184529e-05, + "loss": 0.03778640329837799, + "step": 29650 + }, + { + "epoch": 4.210078069552875, + "grad_norm": 2.1270813941955566, + "learning_rate": 9.579190915542939e-05, + "loss": 0.059389245510101316, + "step": 29660 + }, + { + "epoch": 4.211497515968772, + "grad_norm": 5.076752662658691, + "learning_rate": 9.579048970901348e-05, + "loss": 0.097792249917984, + "step": 29670 + }, + { + "epoch": 4.21291696238467, + "grad_norm": 0.8046549558639526, + "learning_rate": 9.57890702625976e-05, + "loss": 0.08327121138572693, + "step": 29680 + }, + { + "epoch": 4.214336408800568, + "grad_norm": 5.163249969482422, + "learning_rate": 9.578765081618169e-05, + "loss": 0.0650902271270752, + "step": 29690 + }, + { + "epoch": 4.215755855216465, + "grad_norm": 6.078125, + "learning_rate": 9.57862313697658e-05, + "loss": 0.02894149422645569, + "step": 29700 + }, + { + "epoch": 4.217175301632364, + "grad_norm": 2.5657360553741455, + "learning_rate": 9.57848119233499e-05, + "loss": 0.0827182650566101, + "step": 29710 + }, + { + "epoch": 4.218594748048261, + "grad_norm": 11.177501678466797, + "learning_rate": 9.5783392476934e-05, + "loss": 0.10848512649536132, + "step": 29720 + }, + { + "epoch": 4.220014194464159, + "grad_norm": 0.30279305577278137, + "learning_rate": 9.578197303051811e-05, + "loss": 0.04139010012149811, + "step": 29730 + }, + { + "epoch": 4.221433640880057, + "grad_norm": 5.513749599456787, + "learning_rate": 9.57805535841022e-05, + "loss": 
0.07512015104293823, + "step": 29740 + }, + { + "epoch": 4.222853087295954, + "grad_norm": 0.548879086971283, + "learning_rate": 9.577913413768632e-05, + "loss": 0.0479542076587677, + "step": 29750 + }, + { + "epoch": 4.2242725337118525, + "grad_norm": 4.692379951477051, + "learning_rate": 9.57777146912704e-05, + "loss": 0.050175410509109494, + "step": 29760 + }, + { + "epoch": 4.22569198012775, + "grad_norm": 5.73818302154541, + "learning_rate": 9.577629524485451e-05, + "loss": 0.04169095158576965, + "step": 29770 + }, + { + "epoch": 4.227111426543648, + "grad_norm": 2.4854674339294434, + "learning_rate": 9.577487579843861e-05, + "loss": 0.11341255903244019, + "step": 29780 + }, + { + "epoch": 4.228530872959546, + "grad_norm": 5.217370510101318, + "learning_rate": 9.577345635202272e-05, + "loss": 0.08126076459884643, + "step": 29790 + }, + { + "epoch": 4.229950319375444, + "grad_norm": 4.68181037902832, + "learning_rate": 9.577203690560682e-05, + "loss": 0.07262782454490661, + "step": 29800 + }, + { + "epoch": 4.231369765791341, + "grad_norm": 5.919559955596924, + "learning_rate": 9.577061745919093e-05, + "loss": 0.0595442533493042, + "step": 29810 + }, + { + "epoch": 4.232789212207239, + "grad_norm": 2.606187582015991, + "learning_rate": 9.576919801277502e-05, + "loss": 0.07997668385505677, + "step": 29820 + }, + { + "epoch": 4.234208658623137, + "grad_norm": 8.676187515258789, + "learning_rate": 9.576777856635912e-05, + "loss": 0.09488874673843384, + "step": 29830 + }, + { + "epoch": 4.2356281050390345, + "grad_norm": 0.4818950295448303, + "learning_rate": 9.576635911994323e-05, + "loss": 0.04851639866828918, + "step": 29840 + }, + { + "epoch": 4.237047551454933, + "grad_norm": 2.5943143367767334, + "learning_rate": 9.576493967352733e-05, + "loss": 0.07941715121269226, + "step": 29850 + }, + { + "epoch": 4.23846699787083, + "grad_norm": 2.3599934577941895, + "learning_rate": 9.576352022711144e-05, + "loss": 0.048725086450576785, + "step": 29860 + }, + { + 
"epoch": 4.239886444286729, + "grad_norm": 3.0058441162109375, + "learning_rate": 9.576210078069552e-05, + "loss": 0.057837444543838504, + "step": 29870 + }, + { + "epoch": 4.241305890702626, + "grad_norm": 6.7187700271606445, + "learning_rate": 9.576068133427964e-05, + "loss": 0.0750263512134552, + "step": 29880 + }, + { + "epoch": 4.242725337118523, + "grad_norm": 8.70337200164795, + "learning_rate": 9.575926188786373e-05, + "loss": 0.10019056797027588, + "step": 29890 + }, + { + "epoch": 4.244144783534422, + "grad_norm": 6.477367401123047, + "learning_rate": 9.575784244144784e-05, + "loss": 0.0946097731590271, + "step": 29900 + }, + { + "epoch": 4.245564229950319, + "grad_norm": 2.6386449337005615, + "learning_rate": 9.575642299503194e-05, + "loss": 0.04361373782157898, + "step": 29910 + }, + { + "epoch": 4.246983676366217, + "grad_norm": 1.9583840370178223, + "learning_rate": 9.575500354861604e-05, + "loss": 0.08449437022209168, + "step": 29920 + }, + { + "epoch": 4.248403122782115, + "grad_norm": 3.64117693901062, + "learning_rate": 9.575358410220015e-05, + "loss": 0.05209539532661438, + "step": 29930 + }, + { + "epoch": 4.249822569198013, + "grad_norm": 0.24194148182868958, + "learning_rate": 9.575216465578425e-05, + "loss": 0.06707227230072021, + "step": 29940 + }, + { + "epoch": 4.251242015613911, + "grad_norm": 0.11548645794391632, + "learning_rate": 9.575074520936836e-05, + "loss": 0.04010304808616638, + "step": 29950 + }, + { + "epoch": 4.252661462029808, + "grad_norm": 5.685769557952881, + "learning_rate": 9.574932576295246e-05, + "loss": 0.049138793349266054, + "step": 29960 + }, + { + "epoch": 4.254080908445706, + "grad_norm": 4.332538604736328, + "learning_rate": 9.574790631653655e-05, + "loss": 0.05995388031005859, + "step": 29970 + }, + { + "epoch": 4.255500354861604, + "grad_norm": 7.861778259277344, + "learning_rate": 9.574648687012065e-05, + "loss": 0.10826575756072998, + "step": 29980 + }, + { + "epoch": 4.256919801277502, + "grad_norm": 
3.978131055831909, + "learning_rate": 9.574506742370476e-05, + "loss": 0.03147943317890167, + "step": 29990 + }, + { + "epoch": 4.258339247693399, + "grad_norm": 2.1925299167633057, + "learning_rate": 9.574364797728886e-05, + "loss": 0.025776213407516478, + "step": 30000 + }, + { + "epoch": 4.258339247693399, + "eval_accuracy": 0.9660456539708782, + "eval_loss": 0.09928029775619507, + "eval_runtime": 35.5615, + "eval_samples_per_second": 442.248, + "eval_steps_per_second": 13.835, + "step": 30000 + }, + { + "epoch": 4.259758694109298, + "grad_norm": 0.1545456051826477, + "learning_rate": 9.574222853087297e-05, + "loss": 0.04762240052223206, + "step": 30010 + }, + { + "epoch": 4.261178140525195, + "grad_norm": 1.7185860872268677, + "learning_rate": 9.574080908445707e-05, + "loss": 0.05865336060523987, + "step": 30020 + }, + { + "epoch": 4.262597586941093, + "grad_norm": 8.793628692626953, + "learning_rate": 9.573938963804116e-05, + "loss": 0.0683655321598053, + "step": 30030 + }, + { + "epoch": 4.264017033356991, + "grad_norm": 0.2601637840270996, + "learning_rate": 9.573797019162528e-05, + "loss": 0.06883899569511413, + "step": 30040 + }, + { + "epoch": 4.265436479772888, + "grad_norm": 3.019463062286377, + "learning_rate": 9.573655074520937e-05, + "loss": 0.061128252744674684, + "step": 30050 + }, + { + "epoch": 4.266855926188787, + "grad_norm": 5.62221622467041, + "learning_rate": 9.573513129879348e-05, + "loss": 0.04778895676136017, + "step": 30060 + }, + { + "epoch": 4.268275372604684, + "grad_norm": 3.3797640800476074, + "learning_rate": 9.573371185237757e-05, + "loss": 0.03307014107704163, + "step": 30070 + }, + { + "epoch": 4.269694819020582, + "grad_norm": 1.279465675354004, + "learning_rate": 9.573229240596168e-05, + "loss": 0.06789458394050599, + "step": 30080 + }, + { + "epoch": 4.27111426543648, + "grad_norm": 7.307963848114014, + "learning_rate": 9.573087295954578e-05, + "loss": 0.060098963975906375, + "step": 30090 + }, + { + "epoch": 
4.272533711852377, + "grad_norm": 1.0080957412719727, + "learning_rate": 9.572945351312989e-05, + "loss": 0.048102378845214844, + "step": 30100 + }, + { + "epoch": 4.2739531582682755, + "grad_norm": 3.0274055004119873, + "learning_rate": 9.572803406671398e-05, + "loss": 0.026803615689277648, + "step": 30110 + }, + { + "epoch": 4.275372604684173, + "grad_norm": 10.846437454223633, + "learning_rate": 9.572661462029808e-05, + "loss": 0.06357570886611938, + "step": 30120 + }, + { + "epoch": 4.276792051100071, + "grad_norm": 4.810959815979004, + "learning_rate": 9.572519517388219e-05, + "loss": 0.13068749904632568, + "step": 30130 + }, + { + "epoch": 4.278211497515969, + "grad_norm": 9.625565528869629, + "learning_rate": 9.572377572746629e-05, + "loss": 0.12544605731964112, + "step": 30140 + }, + { + "epoch": 4.279630943931867, + "grad_norm": 6.736867904663086, + "learning_rate": 9.57223562810504e-05, + "loss": 0.060517168045043944, + "step": 30150 + }, + { + "epoch": 4.281050390347764, + "grad_norm": 7.1672868728637695, + "learning_rate": 9.57209368346345e-05, + "loss": 0.10456200838088989, + "step": 30160 + }, + { + "epoch": 4.282469836763662, + "grad_norm": 2.916855812072754, + "learning_rate": 9.571951738821861e-05, + "loss": 0.09013462662696839, + "step": 30170 + }, + { + "epoch": 4.28388928317956, + "grad_norm": 0.9611710906028748, + "learning_rate": 9.571809794180269e-05, + "loss": 0.052769911289215085, + "step": 30180 + }, + { + "epoch": 4.2853087295954575, + "grad_norm": 8.947700500488281, + "learning_rate": 9.57166784953868e-05, + "loss": 0.07875499725341797, + "step": 30190 + }, + { + "epoch": 4.286728176011356, + "grad_norm": 11.802430152893066, + "learning_rate": 9.57152590489709e-05, + "loss": 0.08428643941879273, + "step": 30200 + }, + { + "epoch": 4.288147622427253, + "grad_norm": 8.532755851745605, + "learning_rate": 9.571383960255501e-05, + "loss": 0.061253076791763304, + "step": 30210 + }, + { + "epoch": 4.2895670688431515, + "grad_norm": 
7.606026649475098, + "learning_rate": 9.571242015613911e-05, + "loss": 0.04498938620090485, + "step": 30220 + }, + { + "epoch": 4.290986515259049, + "grad_norm": 5.194207191467285, + "learning_rate": 9.57110007097232e-05, + "loss": 0.053921067714691163, + "step": 30230 + }, + { + "epoch": 4.292405961674946, + "grad_norm": 2.436835527420044, + "learning_rate": 9.570958126330732e-05, + "loss": 0.0879688322544098, + "step": 30240 + }, + { + "epoch": 4.293825408090845, + "grad_norm": 5.799166202545166, + "learning_rate": 9.570816181689141e-05, + "loss": 0.027118155360221864, + "step": 30250 + }, + { + "epoch": 4.295244854506742, + "grad_norm": 7.482603073120117, + "learning_rate": 9.570674237047553e-05, + "loss": 0.07994829416275025, + "step": 30260 + }, + { + "epoch": 4.29666430092264, + "grad_norm": 0.9970318675041199, + "learning_rate": 9.570532292405962e-05, + "loss": 0.10803250074386597, + "step": 30270 + }, + { + "epoch": 4.298083747338538, + "grad_norm": 8.933618545532227, + "learning_rate": 9.570390347764372e-05, + "loss": 0.07788187265396118, + "step": 30280 + }, + { + "epoch": 4.299503193754436, + "grad_norm": 3.686373472213745, + "learning_rate": 9.570248403122782e-05, + "loss": 0.05900847911834717, + "step": 30290 + }, + { + "epoch": 4.3009226401703335, + "grad_norm": 5.501690864562988, + "learning_rate": 9.570106458481193e-05, + "loss": 0.09661505222320557, + "step": 30300 + }, + { + "epoch": 4.302342086586231, + "grad_norm": 6.418631553649902, + "learning_rate": 9.569964513839603e-05, + "loss": 0.07957556247711181, + "step": 30310 + }, + { + "epoch": 4.303761533002129, + "grad_norm": 6.4076032638549805, + "learning_rate": 9.569822569198014e-05, + "loss": 0.07458949685096741, + "step": 30320 + }, + { + "epoch": 4.305180979418027, + "grad_norm": 0.4291207790374756, + "learning_rate": 9.569680624556423e-05, + "loss": 0.0629243791103363, + "step": 30330 + }, + { + "epoch": 4.306600425833925, + "grad_norm": 1.803011178970337, + "learning_rate": 
9.569538679914833e-05, + "loss": 0.07003772854804993, + "step": 30340 + }, + { + "epoch": 4.308019872249822, + "grad_norm": 2.3013916015625, + "learning_rate": 9.569396735273244e-05, + "loss": 0.058683961629867554, + "step": 30350 + }, + { + "epoch": 4.309439318665721, + "grad_norm": 1.1245123147964478, + "learning_rate": 9.569254790631654e-05, + "loss": 0.06056886911392212, + "step": 30360 + }, + { + "epoch": 4.310858765081618, + "grad_norm": 0.9130068421363831, + "learning_rate": 9.569112845990065e-05, + "loss": 0.03365835845470429, + "step": 30370 + }, + { + "epoch": 4.312278211497516, + "grad_norm": 14.019575119018555, + "learning_rate": 9.568970901348473e-05, + "loss": 0.08868294954299927, + "step": 30380 + }, + { + "epoch": 4.313697657913414, + "grad_norm": 5.392879009246826, + "learning_rate": 9.568828956706885e-05, + "loss": 0.09387872219085694, + "step": 30390 + }, + { + "epoch": 4.315117104329311, + "grad_norm": 1.6492738723754883, + "learning_rate": 9.568687012065294e-05, + "loss": 0.041343361139297485, + "step": 30400 + }, + { + "epoch": 4.31653655074521, + "grad_norm": 12.153800010681152, + "learning_rate": 9.568545067423705e-05, + "loss": 0.12785712480545045, + "step": 30410 + }, + { + "epoch": 4.317955997161107, + "grad_norm": 0.7066358923912048, + "learning_rate": 9.568403122782115e-05, + "loss": 0.048977088928222653, + "step": 30420 + }, + { + "epoch": 4.319375443577005, + "grad_norm": 4.7776055335998535, + "learning_rate": 9.568261178140525e-05, + "loss": 0.13408685922622682, + "step": 30430 + }, + { + "epoch": 4.320794889992903, + "grad_norm": 0.4777391850948334, + "learning_rate": 9.568119233498936e-05, + "loss": 0.03871457576751709, + "step": 30440 + }, + { + "epoch": 4.3222143364088, + "grad_norm": 1.3897401094436646, + "learning_rate": 9.567977288857346e-05, + "loss": 0.08946848511695862, + "step": 30450 + }, + { + "epoch": 4.3236337828246985, + "grad_norm": 4.937811851501465, + "learning_rate": 9.567835344215757e-05, + "loss": 
0.06665077805519104, + "step": 30460 + }, + { + "epoch": 4.325053229240596, + "grad_norm": 6.3543195724487305, + "learning_rate": 9.567693399574167e-05, + "loss": 0.05886696577072144, + "step": 30470 + }, + { + "epoch": 4.326472675656494, + "grad_norm": 0.3217252194881439, + "learning_rate": 9.567551454932576e-05, + "loss": 0.06166156530380249, + "step": 30480 + }, + { + "epoch": 4.327892122072392, + "grad_norm": 7.3612494468688965, + "learning_rate": 9.567409510290986e-05, + "loss": 0.06469403505325318, + "step": 30490 + }, + { + "epoch": 4.32931156848829, + "grad_norm": 5.631328582763672, + "learning_rate": 9.567267565649397e-05, + "loss": 0.05832824110984802, + "step": 30500 + }, + { + "epoch": 4.32931156848829, + "eval_accuracy": 0.9713232021364532, + "eval_loss": 0.08604324609041214, + "eval_runtime": 34.5081, + "eval_samples_per_second": 455.749, + "eval_steps_per_second": 14.258, + "step": 30500 + }, + { + "epoch": 4.330731014904187, + "grad_norm": 2.4201743602752686, + "learning_rate": 9.567125621007807e-05, + "loss": 0.052916485071182254, + "step": 30510 + }, + { + "epoch": 4.332150461320085, + "grad_norm": 4.678896427154541, + "learning_rate": 9.566983676366218e-05, + "loss": 0.06698215603828431, + "step": 30520 + }, + { + "epoch": 4.333569907735983, + "grad_norm": 8.06701946258545, + "learning_rate": 9.566841731724629e-05, + "loss": 0.09104944467544555, + "step": 30530 + }, + { + "epoch": 4.3349893541518805, + "grad_norm": 3.8929500579833984, + "learning_rate": 9.566699787083037e-05, + "loss": 0.0670436143875122, + "step": 30540 + }, + { + "epoch": 4.336408800567779, + "grad_norm": 7.440217018127441, + "learning_rate": 9.566557842441449e-05, + "loss": 0.08899651765823365, + "step": 30550 + }, + { + "epoch": 4.337828246983676, + "grad_norm": 0.19266436994075775, + "learning_rate": 9.566415897799858e-05, + "loss": 0.1018686056137085, + "step": 30560 + }, + { + "epoch": 4.3392476933995745, + "grad_norm": 6.425552845001221, + "learning_rate": 
9.56627395315827e-05, + "loss": 0.06009323000907898, + "step": 30570 + }, + { + "epoch": 4.340667139815472, + "grad_norm": 1.528925895690918, + "learning_rate": 9.566132008516679e-05, + "loss": 0.0655640721321106, + "step": 30580 + }, + { + "epoch": 4.342086586231369, + "grad_norm": 4.253498077392578, + "learning_rate": 9.565990063875089e-05, + "loss": 0.039850252866745, + "step": 30590 + }, + { + "epoch": 4.343506032647268, + "grad_norm": 3.8556296825408936, + "learning_rate": 9.565848119233499e-05, + "loss": 0.09143298268318176, + "step": 30600 + }, + { + "epoch": 4.344925479063165, + "grad_norm": 3.4295029640197754, + "learning_rate": 9.56570617459191e-05, + "loss": 0.06332918405532836, + "step": 30610 + }, + { + "epoch": 4.346344925479063, + "grad_norm": 6.733371734619141, + "learning_rate": 9.565564229950321e-05, + "loss": 0.054583460092544556, + "step": 30620 + }, + { + "epoch": 4.347764371894961, + "grad_norm": 6.199682235717773, + "learning_rate": 9.56542228530873e-05, + "loss": 0.04864825010299682, + "step": 30630 + }, + { + "epoch": 4.349183818310859, + "grad_norm": 1.4338998794555664, + "learning_rate": 9.56528034066714e-05, + "loss": 0.0877656877040863, + "step": 30640 + }, + { + "epoch": 4.3506032647267565, + "grad_norm": 0.36631831526756287, + "learning_rate": 9.56513839602555e-05, + "loss": 0.06484183073043823, + "step": 30650 + }, + { + "epoch": 4.352022711142654, + "grad_norm": 1.5596531629562378, + "learning_rate": 9.564996451383961e-05, + "loss": 0.08244015574455262, + "step": 30660 + }, + { + "epoch": 4.353442157558552, + "grad_norm": 3.677886962890625, + "learning_rate": 9.564854506742371e-05, + "loss": 0.09137284755706787, + "step": 30670 + }, + { + "epoch": 4.35486160397445, + "grad_norm": 0.5668706297874451, + "learning_rate": 9.564712562100782e-05, + "loss": 0.08606833815574647, + "step": 30680 + }, + { + "epoch": 4.356281050390348, + "grad_norm": 2.088705062866211, + "learning_rate": 9.56457061745919e-05, + "loss": 0.053562283515930176, + 
"step": 30690 + }, + { + "epoch": 4.357700496806245, + "grad_norm": 6.448215484619141, + "learning_rate": 9.564428672817601e-05, + "loss": 0.06233048439025879, + "step": 30700 + }, + { + "epoch": 4.359119943222144, + "grad_norm": 1.7031103372573853, + "learning_rate": 9.564286728176012e-05, + "loss": 0.035029086470603946, + "step": 30710 + }, + { + "epoch": 4.360539389638041, + "grad_norm": 7.886801719665527, + "learning_rate": 9.564144783534422e-05, + "loss": 0.13027390241622924, + "step": 30720 + }, + { + "epoch": 4.3619588360539385, + "grad_norm": 3.4515767097473145, + "learning_rate": 9.564002838892833e-05, + "loss": 0.08356902599334717, + "step": 30730 + }, + { + "epoch": 4.363378282469837, + "grad_norm": 2.99210786819458, + "learning_rate": 9.563860894251242e-05, + "loss": 0.09152278900146485, + "step": 30740 + }, + { + "epoch": 4.364797728885734, + "grad_norm": 2.528543710708618, + "learning_rate": 9.563718949609653e-05, + "loss": 0.029031559824943542, + "step": 30750 + }, + { + "epoch": 4.366217175301633, + "grad_norm": 1.8579683303833008, + "learning_rate": 9.563577004968063e-05, + "loss": 0.0629112720489502, + "step": 30760 + }, + { + "epoch": 4.36763662171753, + "grad_norm": 8.009541511535645, + "learning_rate": 9.563435060326474e-05, + "loss": 0.06843248009681702, + "step": 30770 + }, + { + "epoch": 4.369056068133428, + "grad_norm": 9.022368431091309, + "learning_rate": 9.563293115684883e-05, + "loss": 0.07185935974121094, + "step": 30780 + }, + { + "epoch": 4.370475514549326, + "grad_norm": 10.9826021194458, + "learning_rate": 9.563151171043293e-05, + "loss": 0.05564273595809936, + "step": 30790 + }, + { + "epoch": 4.371894960965223, + "grad_norm": 4.034165859222412, + "learning_rate": 9.563009226401704e-05, + "loss": 0.07726762294769288, + "step": 30800 + }, + { + "epoch": 4.373314407381121, + "grad_norm": 1.4694591760635376, + "learning_rate": 9.562867281760114e-05, + "loss": 0.044573986530303956, + "step": 30810 + }, + { + "epoch": 
4.374733853797019, + "grad_norm": 8.172507286071777, + "learning_rate": 9.562725337118525e-05, + "loss": 0.06269959807395935, + "step": 30820 + }, + { + "epoch": 4.376153300212917, + "grad_norm": 2.3326151371002197, + "learning_rate": 9.562583392476935e-05, + "loss": 0.03975181579589844, + "step": 30830 + }, + { + "epoch": 4.377572746628815, + "grad_norm": 1.3543591499328613, + "learning_rate": 9.562441447835344e-05, + "loss": 0.062531578540802, + "step": 30840 + }, + { + "epoch": 4.378992193044713, + "grad_norm": 0.6076768636703491, + "learning_rate": 9.562299503193754e-05, + "loss": 0.06527051329612732, + "step": 30850 + }, + { + "epoch": 4.38041163946061, + "grad_norm": 2.1021621227264404, + "learning_rate": 9.562157558552165e-05, + "loss": 0.03416385054588318, + "step": 30860 + }, + { + "epoch": 4.381831085876508, + "grad_norm": 0.2831692099571228, + "learning_rate": 9.562015613910575e-05, + "loss": 0.051375854015350345, + "step": 30870 + }, + { + "epoch": 4.383250532292406, + "grad_norm": 4.700906753540039, + "learning_rate": 9.561873669268986e-05, + "loss": 0.055076032876968384, + "step": 30880 + }, + { + "epoch": 4.384669978708303, + "grad_norm": 8.541457176208496, + "learning_rate": 9.561731724627396e-05, + "loss": 0.09593217372894287, + "step": 30890 + }, + { + "epoch": 4.386089425124202, + "grad_norm": 3.044889450073242, + "learning_rate": 9.561589779985806e-05, + "loss": 0.08643177151679993, + "step": 30900 + }, + { + "epoch": 4.387508871540099, + "grad_norm": 5.985223293304443, + "learning_rate": 9.561447835344217e-05, + "loss": 0.0780254065990448, + "step": 30910 + }, + { + "epoch": 4.3889283179559975, + "grad_norm": 10.373763084411621, + "learning_rate": 9.561305890702626e-05, + "loss": 0.06224080324172974, + "step": 30920 + }, + { + "epoch": 4.390347764371895, + "grad_norm": 0.9211413264274597, + "learning_rate": 9.561163946061038e-05, + "loss": 0.049274078011512755, + "step": 30930 + }, + { + "epoch": 4.391767210787792, + "grad_norm": 
6.857527732849121, + "learning_rate": 9.561022001419447e-05, + "loss": 0.08677098751068116, + "step": 30940 + }, + { + "epoch": 4.393186657203691, + "grad_norm": 4.365863800048828, + "learning_rate": 9.560880056777857e-05, + "loss": 0.09109071493148804, + "step": 30950 + }, + { + "epoch": 4.394606103619588, + "grad_norm": 0.203486829996109, + "learning_rate": 9.560738112136267e-05, + "loss": 0.044797495007514954, + "step": 30960 + }, + { + "epoch": 4.396025550035486, + "grad_norm": 0.31215134263038635, + "learning_rate": 9.560596167494678e-05, + "loss": 0.0890379548072815, + "step": 30970 + }, + { + "epoch": 4.397444996451384, + "grad_norm": 0.34271034598350525, + "learning_rate": 9.560454222853088e-05, + "loss": 0.07275915145874023, + "step": 30980 + }, + { + "epoch": 4.398864442867282, + "grad_norm": 4.180636882781982, + "learning_rate": 9.560312278211499e-05, + "loss": 0.03852737843990326, + "step": 30990 + }, + { + "epoch": 4.4002838892831795, + "grad_norm": 0.41002199053764343, + "learning_rate": 9.560170333569908e-05, + "loss": 0.07415003180503846, + "step": 31000 + }, + { + "epoch": 4.4002838892831795, + "eval_accuracy": 0.9713232021364532, + "eval_loss": 0.0869520977139473, + "eval_runtime": 34.925, + "eval_samples_per_second": 450.307, + "eval_steps_per_second": 14.087, + "step": 31000 + }, + { + "epoch": 4.401703335699077, + "grad_norm": 0.16246958076953888, + "learning_rate": 9.560028388928318e-05, + "loss": 0.04245265126228333, + "step": 31010 + }, + { + "epoch": 4.403122782114975, + "grad_norm": 3.3791964054107666, + "learning_rate": 9.559886444286729e-05, + "loss": 0.08227800130844116, + "step": 31020 + }, + { + "epoch": 4.404542228530873, + "grad_norm": 0.25004979968070984, + "learning_rate": 9.559744499645139e-05, + "loss": 0.05259315967559815, + "step": 31030 + }, + { + "epoch": 4.405961674946771, + "grad_norm": 5.847822666168213, + "learning_rate": 9.55960255500355e-05, + "loss": 0.08936032056808471, + "step": 31040 + }, + { + "epoch": 
4.407381121362668, + "grad_norm": 0.43625226616859436, + "learning_rate": 9.559460610361958e-05, + "loss": 0.06444858312606812, + "step": 31050 + }, + { + "epoch": 4.408800567778567, + "grad_norm": 6.678699493408203, + "learning_rate": 9.55931866572037e-05, + "loss": 0.09537352323532104, + "step": 31060 + }, + { + "epoch": 4.410220014194464, + "grad_norm": 11.696027755737305, + "learning_rate": 9.559176721078779e-05, + "loss": 0.11680700778961181, + "step": 31070 + }, + { + "epoch": 4.4116394606103615, + "grad_norm": 6.337532043457031, + "learning_rate": 9.55903477643719e-05, + "loss": 0.04228192269802093, + "step": 31080 + }, + { + "epoch": 4.41305890702626, + "grad_norm": 1.4329833984375, + "learning_rate": 9.5588928317956e-05, + "loss": 0.03295584321022034, + "step": 31090 + }, + { + "epoch": 4.414478353442157, + "grad_norm": 7.40330171585083, + "learning_rate": 9.55875088715401e-05, + "loss": 0.07505521774291993, + "step": 31100 + }, + { + "epoch": 4.4158977998580555, + "grad_norm": 0.8913317918777466, + "learning_rate": 9.558608942512421e-05, + "loss": 0.11474095582962036, + "step": 31110 + }, + { + "epoch": 4.417317246273953, + "grad_norm": 2.8797833919525146, + "learning_rate": 9.55846699787083e-05, + "loss": 0.04045186638832092, + "step": 31120 + }, + { + "epoch": 4.418736692689851, + "grad_norm": 7.083196640014648, + "learning_rate": 9.558325053229242e-05, + "loss": 0.08086020350456238, + "step": 31130 + }, + { + "epoch": 4.420156139105749, + "grad_norm": 7.3477349281311035, + "learning_rate": 9.558183108587652e-05, + "loss": 0.09126662015914917, + "step": 31140 + }, + { + "epoch": 4.421575585521646, + "grad_norm": 2.6615936756134033, + "learning_rate": 9.558041163946061e-05, + "loss": 0.11547311544418334, + "step": 31150 + }, + { + "epoch": 4.422995031937544, + "grad_norm": 0.38098084926605225, + "learning_rate": 9.557899219304471e-05, + "loss": 0.09409705996513366, + "step": 31160 + }, + { + "epoch": 4.424414478353442, + "grad_norm": 4.258413314819336, + 
"learning_rate": 9.557757274662882e-05, + "loss": 0.07088276147842407, + "step": 31170 + }, + { + "epoch": 4.42583392476934, + "grad_norm": 0.6552639603614807, + "learning_rate": 9.557615330021292e-05, + "loss": 0.0480211079120636, + "step": 31180 + }, + { + "epoch": 4.4272533711852375, + "grad_norm": 0.469436377286911, + "learning_rate": 9.557473385379703e-05, + "loss": 0.03103916049003601, + "step": 31190 + }, + { + "epoch": 4.428672817601136, + "grad_norm": 0.1175021231174469, + "learning_rate": 9.557331440738113e-05, + "loss": 0.050581169128417966, + "step": 31200 + }, + { + "epoch": 4.430092264017033, + "grad_norm": 2.0325958728790283, + "learning_rate": 9.557189496096522e-05, + "loss": 0.08802063465118408, + "step": 31210 + }, + { + "epoch": 4.431511710432932, + "grad_norm": 3.3236446380615234, + "learning_rate": 9.557047551454933e-05, + "loss": 0.040520912408828734, + "step": 31220 + }, + { + "epoch": 4.432931156848829, + "grad_norm": 6.1703596115112305, + "learning_rate": 9.556905606813343e-05, + "loss": 0.051608556509017946, + "step": 31230 + }, + { + "epoch": 4.434350603264726, + "grad_norm": 4.604761123657227, + "learning_rate": 9.556763662171754e-05, + "loss": 0.06144062280654907, + "step": 31240 + }, + { + "epoch": 4.435770049680625, + "grad_norm": 11.01284408569336, + "learning_rate": 9.556621717530164e-05, + "loss": 0.16597810983657837, + "step": 31250 + }, + { + "epoch": 4.437189496096522, + "grad_norm": 8.154959678649902, + "learning_rate": 9.556479772888574e-05, + "loss": 0.048260855674743655, + "step": 31260 + }, + { + "epoch": 4.43860894251242, + "grad_norm": 8.059030532836914, + "learning_rate": 9.556337828246984e-05, + "loss": 0.1349133014678955, + "step": 31270 + }, + { + "epoch": 4.440028388928318, + "grad_norm": 3.50076961517334, + "learning_rate": 9.556195883605395e-05, + "loss": 0.1305612087249756, + "step": 31280 + }, + { + "epoch": 4.441447835344216, + "grad_norm": 2.289435386657715, + "learning_rate": 9.556053938963804e-05, + "loss": 
0.05613391995429993, + "step": 31290 + }, + { + "epoch": 4.442867281760114, + "grad_norm": 1.1918927431106567, + "learning_rate": 9.555911994322215e-05, + "loss": 0.03883382380008697, + "step": 31300 + }, + { + "epoch": 4.444286728176011, + "grad_norm": 2.9775784015655518, + "learning_rate": 9.555770049680625e-05, + "loss": 0.06229335069656372, + "step": 31310 + }, + { + "epoch": 4.445706174591909, + "grad_norm": 11.473634719848633, + "learning_rate": 9.555628105039035e-05, + "loss": 0.10125520229339599, + "step": 31320 + }, + { + "epoch": 4.447125621007807, + "grad_norm": 0.14798550307750702, + "learning_rate": 9.555486160397446e-05, + "loss": 0.042588675022125246, + "step": 31330 + }, + { + "epoch": 4.448545067423705, + "grad_norm": 0.5757963061332703, + "learning_rate": 9.555344215755856e-05, + "loss": 0.04882889091968536, + "step": 31340 + }, + { + "epoch": 4.4499645138396025, + "grad_norm": 3.5108330249786377, + "learning_rate": 9.555202271114267e-05, + "loss": 0.11374999284744262, + "step": 31350 + }, + { + "epoch": 4.451383960255501, + "grad_norm": 0.21131829917430878, + "learning_rate": 9.555060326472675e-05, + "loss": 0.06701189875602723, + "step": 31360 + }, + { + "epoch": 4.452803406671398, + "grad_norm": 0.859306812286377, + "learning_rate": 9.554918381831086e-05, + "loss": 0.059961211681365964, + "step": 31370 + }, + { + "epoch": 4.454222853087296, + "grad_norm": 10.532944679260254, + "learning_rate": 9.554776437189496e-05, + "loss": 0.07974779605865479, + "step": 31380 + }, + { + "epoch": 4.455642299503194, + "grad_norm": 16.47031593322754, + "learning_rate": 9.554634492547907e-05, + "loss": 0.0871815800666809, + "step": 31390 + }, + { + "epoch": 4.457061745919091, + "grad_norm": 1.5741618871688843, + "learning_rate": 9.554492547906317e-05, + "loss": 0.036674460768699645, + "step": 31400 + }, + { + "epoch": 4.45848119233499, + "grad_norm": 0.6788744330406189, + "learning_rate": 9.554350603264727e-05, + "loss": 0.06158609390258789, + "step": 31410 + }, 
+ { + "epoch": 4.459900638750887, + "grad_norm": 8.353523254394531, + "learning_rate": 9.554208658623138e-05, + "loss": 0.037455737590789795, + "step": 31420 + }, + { + "epoch": 4.461320085166785, + "grad_norm": 5.952844142913818, + "learning_rate": 9.554066713981547e-05, + "loss": 0.06949056386947632, + "step": 31430 + }, + { + "epoch": 4.462739531582683, + "grad_norm": 6.8104143142700195, + "learning_rate": 9.553924769339959e-05, + "loss": 0.04634210467338562, + "step": 31440 + }, + { + "epoch": 4.46415897799858, + "grad_norm": 1.9237818717956543, + "learning_rate": 9.553782824698368e-05, + "loss": 0.045291933417320254, + "step": 31450 + }, + { + "epoch": 4.4655784244144785, + "grad_norm": 1.631089448928833, + "learning_rate": 9.553640880056778e-05, + "loss": 0.05780811309814453, + "step": 31460 + }, + { + "epoch": 4.466997870830376, + "grad_norm": 1.2360000610351562, + "learning_rate": 9.553498935415188e-05, + "loss": 0.047595706582069394, + "step": 31470 + }, + { + "epoch": 4.468417317246274, + "grad_norm": 4.273767471313477, + "learning_rate": 9.553356990773599e-05, + "loss": 0.029149368405342102, + "step": 31480 + }, + { + "epoch": 4.469836763662172, + "grad_norm": 5.198413848876953, + "learning_rate": 9.553215046132009e-05, + "loss": 0.08129512071609497, + "step": 31490 + }, + { + "epoch": 4.47125621007807, + "grad_norm": 1.2532944679260254, + "learning_rate": 9.55307310149042e-05, + "loss": 0.06522347927093505, + "step": 31500 + }, + { + "epoch": 4.47125621007807, + "eval_accuracy": 0.9718318814777135, + "eval_loss": 0.08366374671459198, + "eval_runtime": 34.8059, + "eval_samples_per_second": 451.849, + "eval_steps_per_second": 14.136, + "step": 31500 + }, + { + "epoch": 4.472675656493967, + "grad_norm": 2.549074411392212, + "learning_rate": 9.55293115684883e-05, + "loss": 0.07184516191482544, + "step": 31510 + }, + { + "epoch": 4.474095102909865, + "grad_norm": 0.25022396445274353, + "learning_rate": 9.552789212207239e-05, + "loss": 0.10292150974273681, + 
"step": 31520 + }, + { + "epoch": 4.475514549325763, + "grad_norm": 3.1270029544830322, + "learning_rate": 9.55264726756565e-05, + "loss": 0.04953309595584869, + "step": 31530 + }, + { + "epoch": 4.4769339957416605, + "grad_norm": 0.2376168668270111, + "learning_rate": 9.55250532292406e-05, + "loss": 0.029788446426391602, + "step": 31540 + }, + { + "epoch": 4.478353442157559, + "grad_norm": 0.9615975618362427, + "learning_rate": 9.552363378282471e-05, + "loss": 0.04785667657852173, + "step": 31550 + }, + { + "epoch": 4.479772888573456, + "grad_norm": 3.677119255065918, + "learning_rate": 9.552221433640881e-05, + "loss": 0.04802153706550598, + "step": 31560 + }, + { + "epoch": 4.4811923349893545, + "grad_norm": 5.188876152038574, + "learning_rate": 9.55207948899929e-05, + "loss": 0.054722541570663454, + "step": 31570 + }, + { + "epoch": 4.482611781405252, + "grad_norm": 4.769455909729004, + "learning_rate": 9.5519375443577e-05, + "loss": 0.116557776927948, + "step": 31580 + }, + { + "epoch": 4.484031227821149, + "grad_norm": 9.027771949768066, + "learning_rate": 9.551795599716111e-05, + "loss": 0.13705774545669555, + "step": 31590 + }, + { + "epoch": 4.485450674237048, + "grad_norm": 5.028712749481201, + "learning_rate": 9.551653655074521e-05, + "loss": 0.07994774580001832, + "step": 31600 + }, + { + "epoch": 4.486870120652945, + "grad_norm": 3.692356824874878, + "learning_rate": 9.551511710432932e-05, + "loss": 0.049330982565879825, + "step": 31610 + }, + { + "epoch": 4.488289567068843, + "grad_norm": 3.6784050464630127, + "learning_rate": 9.551369765791342e-05, + "loss": 0.03654472827911377, + "step": 31620 + }, + { + "epoch": 4.489709013484741, + "grad_norm": 2.1743340492248535, + "learning_rate": 9.551227821149752e-05, + "loss": 0.11464802026748658, + "step": 31630 + }, + { + "epoch": 4.491128459900639, + "grad_norm": 9.176986694335938, + "learning_rate": 9.551085876508163e-05, + "loss": 0.11709423065185547, + "step": 31640 + }, + { + "epoch": 
4.4925479063165366, + "grad_norm": 4.8150811195373535, + "learning_rate": 9.550943931866573e-05, + "loss": 0.06142100691795349, + "step": 31650 + }, + { + "epoch": 4.493967352732434, + "grad_norm": 1.1660048961639404, + "learning_rate": 9.550801987224984e-05, + "loss": 0.05719171762466431, + "step": 31660 + }, + { + "epoch": 4.495386799148332, + "grad_norm": 0.7825644016265869, + "learning_rate": 9.550660042583392e-05, + "loss": 0.08388499021530152, + "step": 31670 + }, + { + "epoch": 4.49680624556423, + "grad_norm": 2.4386682510375977, + "learning_rate": 9.550518097941803e-05, + "loss": 0.08802780508995056, + "step": 31680 + }, + { + "epoch": 4.498225691980128, + "grad_norm": 2.295851230621338, + "learning_rate": 9.550376153300213e-05, + "loss": 0.055353093147277835, + "step": 31690 + }, + { + "epoch": 4.499645138396025, + "grad_norm": 6.695101737976074, + "learning_rate": 9.550234208658624e-05, + "loss": 0.11167588233947753, + "step": 31700 + }, + { + "epoch": 4.501064584811924, + "grad_norm": 2.08304500579834, + "learning_rate": 9.550092264017034e-05, + "loss": 0.07536173462867737, + "step": 31710 + }, + { + "epoch": 4.502484031227821, + "grad_norm": 6.953207969665527, + "learning_rate": 9.549950319375443e-05, + "loss": 0.06825330853462219, + "step": 31720 + }, + { + "epoch": 4.503903477643719, + "grad_norm": 2.6377944946289062, + "learning_rate": 9.549808374733854e-05, + "loss": 0.06334355473518372, + "step": 31730 + }, + { + "epoch": 4.505322924059617, + "grad_norm": 3.363705635070801, + "learning_rate": 9.549666430092264e-05, + "loss": 0.18799341917037965, + "step": 31740 + }, + { + "epoch": 4.506742370475514, + "grad_norm": 1.8969464302062988, + "learning_rate": 9.549524485450675e-05, + "loss": 0.04989268779754639, + "step": 31750 + }, + { + "epoch": 4.508161816891413, + "grad_norm": 1.185655951499939, + "learning_rate": 9.549382540809085e-05, + "loss": 0.08766244649887085, + "step": 31760 + }, + { + "epoch": 4.50958126330731, + "grad_norm": 
2.129251003265381, + "learning_rate": 9.549240596167495e-05, + "loss": 0.06428298950195313, + "step": 31770 + }, + { + "epoch": 4.511000709723208, + "grad_norm": 1.2739909887313843, + "learning_rate": 9.549098651525905e-05, + "loss": 0.08616969585418702, + "step": 31780 + }, + { + "epoch": 4.512420156139106, + "grad_norm": 7.885573863983154, + "learning_rate": 9.548956706884316e-05, + "loss": 0.07006771564483642, + "step": 31790 + }, + { + "epoch": 4.513839602555003, + "grad_norm": 4.777563095092773, + "learning_rate": 9.548814762242725e-05, + "loss": 0.05692494511604309, + "step": 31800 + }, + { + "epoch": 4.5152590489709015, + "grad_norm": 1.3167340755462646, + "learning_rate": 9.548672817601136e-05, + "loss": 0.07449572682380676, + "step": 31810 + }, + { + "epoch": 4.516678495386799, + "grad_norm": 5.693206310272217, + "learning_rate": 9.548530872959546e-05, + "loss": 0.05288561582565308, + "step": 31820 + }, + { + "epoch": 4.518097941802697, + "grad_norm": 5.757571220397949, + "learning_rate": 9.548403122782116e-05, + "loss": 0.07998919486999512, + "step": 31830 + }, + { + "epoch": 4.519517388218595, + "grad_norm": 4.328059196472168, + "learning_rate": 9.548261178140526e-05, + "loss": 0.05517913699150086, + "step": 31840 + }, + { + "epoch": 4.520936834634493, + "grad_norm": 0.796028733253479, + "learning_rate": 9.548119233498936e-05, + "loss": 0.047811633348464964, + "step": 31850 + }, + { + "epoch": 4.52235628105039, + "grad_norm": 5.320539951324463, + "learning_rate": 9.547977288857347e-05, + "loss": 0.06639637351036072, + "step": 31860 + }, + { + "epoch": 4.523775727466289, + "grad_norm": 3.544234275817871, + "learning_rate": 9.547835344215756e-05, + "loss": 0.11271839141845703, + "step": 31870 + }, + { + "epoch": 4.525195173882186, + "grad_norm": 3.894627094268799, + "learning_rate": 9.547693399574167e-05, + "loss": 0.0395465612411499, + "step": 31880 + }, + { + "epoch": 4.5266146202980835, + "grad_norm": 4.7489752769470215, + "learning_rate": 
9.547551454932577e-05, + "loss": 0.15020207166671753, + "step": 31890 + }, + { + "epoch": 4.528034066713982, + "grad_norm": 2.853905200958252, + "learning_rate": 9.547409510290987e-05, + "loss": 0.08104996681213379, + "step": 31900 + }, + { + "epoch": 4.529453513129879, + "grad_norm": 2.5537586212158203, + "learning_rate": 9.547267565649397e-05, + "loss": 0.08797919750213623, + "step": 31910 + }, + { + "epoch": 4.5308729595457775, + "grad_norm": 3.720608711242676, + "learning_rate": 9.547125621007808e-05, + "loss": 0.12364702224731446, + "step": 31920 + }, + { + "epoch": 4.532292405961675, + "grad_norm": 5.040463447570801, + "learning_rate": 9.546983676366217e-05, + "loss": 0.05389441847801209, + "step": 31930 + }, + { + "epoch": 4.533711852377573, + "grad_norm": 9.118657112121582, + "learning_rate": 9.546841731724629e-05, + "loss": 0.10287294387817383, + "step": 31940 + }, + { + "epoch": 4.535131298793471, + "grad_norm": 9.760089874267578, + "learning_rate": 9.546699787083038e-05, + "loss": 0.10598251819610596, + "step": 31950 + }, + { + "epoch": 4.536550745209368, + "grad_norm": 5.44436502456665, + "learning_rate": 9.546557842441448e-05, + "loss": 0.062908136844635, + "step": 31960 + }, + { + "epoch": 4.537970191625266, + "grad_norm": 7.934990882873535, + "learning_rate": 9.546415897799859e-05, + "loss": 0.06037212014198303, + "step": 31970 + }, + { + "epoch": 4.539389638041164, + "grad_norm": 6.5772480964660645, + "learning_rate": 9.546273953158269e-05, + "loss": 0.07325562834739685, + "step": 31980 + }, + { + "epoch": 4.540809084457062, + "grad_norm": 6.199497699737549, + "learning_rate": 9.54613200851668e-05, + "loss": 0.06814472675323487, + "step": 31990 + }, + { + "epoch": 4.5422285308729595, + "grad_norm": 1.220271348953247, + "learning_rate": 9.545990063875088e-05, + "loss": 0.11154254674911498, + "step": 32000 + }, + { + "epoch": 4.5422285308729595, + "eval_accuracy": 0.9696699942773574, + "eval_loss": 0.09096228331327438, + "eval_runtime": 35.0606, + 
"eval_samples_per_second": 448.566, + "eval_steps_per_second": 14.033, + "step": 32000 + }, + { + "epoch": 4.543647977288858, + "grad_norm": 1.179792046546936, + "learning_rate": 9.5458481192335e-05, + "loss": 0.033092746138572694, + "step": 32010 + }, + { + "epoch": 4.545067423704755, + "grad_norm": 4.7662553787231445, + "learning_rate": 9.545706174591909e-05, + "loss": 0.05603760480880737, + "step": 32020 + }, + { + "epoch": 4.546486870120653, + "grad_norm": 4.585840225219727, + "learning_rate": 9.54556422995032e-05, + "loss": 0.07328543066978455, + "step": 32030 + }, + { + "epoch": 4.547906316536551, + "grad_norm": 8.541946411132812, + "learning_rate": 9.54542228530873e-05, + "loss": 0.06177548766136169, + "step": 32040 + }, + { + "epoch": 4.549325762952448, + "grad_norm": 6.825544834136963, + "learning_rate": 9.54528034066714e-05, + "loss": 0.03726685047149658, + "step": 32050 + }, + { + "epoch": 4.550745209368347, + "grad_norm": 6.649820804595947, + "learning_rate": 9.545138396025551e-05, + "loss": 0.05233697891235352, + "step": 32060 + }, + { + "epoch": 4.552164655784244, + "grad_norm": 5.592628479003906, + "learning_rate": 9.54499645138396e-05, + "loss": 0.09716169834136963, + "step": 32070 + }, + { + "epoch": 4.553584102200142, + "grad_norm": 7.0894975662231445, + "learning_rate": 9.544854506742372e-05, + "loss": 0.08125547766685486, + "step": 32080 + }, + { + "epoch": 4.55500354861604, + "grad_norm": 8.739372253417969, + "learning_rate": 9.544712562100781e-05, + "loss": 0.13002804517745972, + "step": 32090 + }, + { + "epoch": 4.556422995031937, + "grad_norm": 6.633364677429199, + "learning_rate": 9.544570617459191e-05, + "loss": 0.0681275725364685, + "step": 32100 + }, + { + "epoch": 4.557842441447836, + "grad_norm": 6.804000377655029, + "learning_rate": 9.544428672817601e-05, + "loss": 0.04156455099582672, + "step": 32110 + }, + { + "epoch": 4.559261887863733, + "grad_norm": 0.5662475228309631, + "learning_rate": 9.544286728176012e-05, + "loss": 
0.02463839203119278, + "step": 32120 + }, + { + "epoch": 4.560681334279631, + "grad_norm": 0.864768385887146, + "learning_rate": 9.544144783534422e-05, + "loss": 0.0701153576374054, + "step": 32130 + }, + { + "epoch": 4.562100780695529, + "grad_norm": 1.1445688009262085, + "learning_rate": 9.544002838892833e-05, + "loss": 0.023342382907867432, + "step": 32140 + }, + { + "epoch": 4.563520227111427, + "grad_norm": 1.1171879768371582, + "learning_rate": 9.543860894251243e-05, + "loss": 0.04527752697467804, + "step": 32150 + }, + { + "epoch": 4.564939673527324, + "grad_norm": 5.467402458190918, + "learning_rate": 9.543718949609652e-05, + "loss": 0.05509437322616577, + "step": 32160 + }, + { + "epoch": 4.566359119943222, + "grad_norm": 2.1214330196380615, + "learning_rate": 9.543577004968063e-05, + "loss": 0.03169718384742737, + "step": 32170 + }, + { + "epoch": 4.56777856635912, + "grad_norm": 6.447783946990967, + "learning_rate": 9.543435060326473e-05, + "loss": 0.07171815037727355, + "step": 32180 + }, + { + "epoch": 4.569198012775018, + "grad_norm": 1.2676538228988647, + "learning_rate": 9.543293115684884e-05, + "loss": 0.06441798210144042, + "step": 32190 + }, + { + "epoch": 4.570617459190916, + "grad_norm": 3.4596071243286133, + "learning_rate": 9.543151171043293e-05, + "loss": 0.07644802331924438, + "step": 32200 + }, + { + "epoch": 4.572036905606813, + "grad_norm": 4.619117259979248, + "learning_rate": 9.543009226401704e-05, + "loss": 0.04569814503192902, + "step": 32210 + }, + { + "epoch": 4.573456352022712, + "grad_norm": 0.3759884834289551, + "learning_rate": 9.542867281760113e-05, + "loss": 0.11063082218170166, + "step": 32220 + }, + { + "epoch": 4.574875798438609, + "grad_norm": 0.39691707491874695, + "learning_rate": 9.542725337118525e-05, + "loss": 0.06378236413002014, + "step": 32230 + }, + { + "epoch": 4.5762952448545064, + "grad_norm": 2.744389533996582, + "learning_rate": 9.542583392476934e-05, + "loss": 0.11052513122558594, + "step": 32240 + }, + { + 
"epoch": 4.577714691270405, + "grad_norm": 7.465341091156006, + "learning_rate": 9.542441447835345e-05, + "loss": 0.07533521056175232, + "step": 32250 + }, + { + "epoch": 4.579134137686302, + "grad_norm": 1.2778067588806152, + "learning_rate": 9.542299503193755e-05, + "loss": 0.05032462477684021, + "step": 32260 + }, + { + "epoch": 4.5805535841022005, + "grad_norm": 5.098127365112305, + "learning_rate": 9.542157558552165e-05, + "loss": 0.061452174186706544, + "step": 32270 + }, + { + "epoch": 4.581973030518098, + "grad_norm": 11.057262420654297, + "learning_rate": 9.542015613910576e-05, + "loss": 0.08994705080986024, + "step": 32280 + }, + { + "epoch": 4.583392476933996, + "grad_norm": 0.9384304881095886, + "learning_rate": 9.541873669268986e-05, + "loss": 0.05673693418502808, + "step": 32290 + }, + { + "epoch": 4.584811923349894, + "grad_norm": 7.694385528564453, + "learning_rate": 9.541731724627397e-05, + "loss": 0.1050719141960144, + "step": 32300 + }, + { + "epoch": 4.586231369765791, + "grad_norm": 1.4772675037384033, + "learning_rate": 9.541589779985805e-05, + "loss": 0.0497598260641098, + "step": 32310 + }, + { + "epoch": 4.587650816181689, + "grad_norm": 0.905949056148529, + "learning_rate": 9.541447835344216e-05, + "loss": 0.05204020738601685, + "step": 32320 + }, + { + "epoch": 4.589070262597587, + "grad_norm": 5.82963228225708, + "learning_rate": 9.541305890702626e-05, + "loss": 0.04697149097919464, + "step": 32330 + }, + { + "epoch": 4.590489709013485, + "grad_norm": 5.241878032684326, + "learning_rate": 9.541163946061037e-05, + "loss": 0.06778921484947205, + "step": 32340 + }, + { + "epoch": 4.5919091554293825, + "grad_norm": 0.09300513565540314, + "learning_rate": 9.541022001419447e-05, + "loss": 0.09513026475906372, + "step": 32350 + }, + { + "epoch": 4.593328601845281, + "grad_norm": 4.091070175170898, + "learning_rate": 9.540880056777857e-05, + "loss": 0.048994243144989014, + "step": 32360 + }, + { + "epoch": 4.594748048261178, + "grad_norm": 
7.305341720581055, + "learning_rate": 9.540738112136268e-05, + "loss": 0.11231958866119385, + "step": 32370 + }, + { + "epoch": 4.596167494677076, + "grad_norm": 1.8714683055877686, + "learning_rate": 9.540596167494677e-05, + "loss": 0.10454151630401612, + "step": 32380 + }, + { + "epoch": 4.597586941092974, + "grad_norm": 7.5027618408203125, + "learning_rate": 9.540454222853088e-05, + "loss": 0.0959784746170044, + "step": 32390 + }, + { + "epoch": 4.599006387508871, + "grad_norm": 11.03409481048584, + "learning_rate": 9.540312278211498e-05, + "loss": 0.05968649983406067, + "step": 32400 + }, + { + "epoch": 4.60042583392477, + "grad_norm": 4.425593852996826, + "learning_rate": 9.540170333569908e-05, + "loss": 0.05584191679954529, + "step": 32410 + }, + { + "epoch": 4.601845280340667, + "grad_norm": 0.5005938410758972, + "learning_rate": 9.540028388928318e-05, + "loss": 0.07238389253616333, + "step": 32420 + }, + { + "epoch": 4.603264726756565, + "grad_norm": 9.244565963745117, + "learning_rate": 9.539886444286729e-05, + "loss": 0.05424014329910278, + "step": 32430 + }, + { + "epoch": 4.604684173172463, + "grad_norm": 5.8151445388793945, + "learning_rate": 9.539744499645138e-05, + "loss": 0.08108783960342407, + "step": 32440 + }, + { + "epoch": 4.60610361958836, + "grad_norm": 0.5076402425765991, + "learning_rate": 9.53960255500355e-05, + "loss": 0.11353771686553955, + "step": 32450 + }, + { + "epoch": 4.6075230660042585, + "grad_norm": 7.542196750640869, + "learning_rate": 9.539460610361959e-05, + "loss": 0.07132243514060974, + "step": 32460 + }, + { + "epoch": 4.608942512420156, + "grad_norm": 1.686920404434204, + "learning_rate": 9.539318665720369e-05, + "loss": 0.054832571744918825, + "step": 32470 + }, + { + "epoch": 4.610361958836054, + "grad_norm": 8.699249267578125, + "learning_rate": 9.53917672107878e-05, + "loss": 0.08153939843177796, + "step": 32480 + }, + { + "epoch": 4.611781405251952, + "grad_norm": 0.41042953729629517, + "learning_rate": 
9.53903477643719e-05, + "loss": 0.054275840520858765, + "step": 32490 + }, + { + "epoch": 4.61320085166785, + "grad_norm": 10.090995788574219, + "learning_rate": 9.538892831795601e-05, + "loss": 0.13572399616241454, + "step": 32500 + }, + { + "epoch": 4.61320085166785, + "eval_accuracy": 0.9771094296432886, + "eval_loss": 0.06673765182495117, + "eval_runtime": 36.3248, + "eval_samples_per_second": 432.955, + "eval_steps_per_second": 13.544, + "step": 32500 + }, + { + "epoch": 4.614620298083747, + "grad_norm": 4.7882819175720215, + "learning_rate": 9.53875088715401e-05, + "loss": 0.05213452577590942, + "step": 32510 + }, + { + "epoch": 4.616039744499645, + "grad_norm": 3.912700653076172, + "learning_rate": 9.53860894251242e-05, + "loss": 0.05222201347351074, + "step": 32520 + }, + { + "epoch": 4.617459190915543, + "grad_norm": 4.201282024383545, + "learning_rate": 9.53846699787083e-05, + "loss": 0.07659255862236022, + "step": 32530 + }, + { + "epoch": 4.6188786373314406, + "grad_norm": 0.3415137529373169, + "learning_rate": 9.538325053229241e-05, + "loss": 0.07810273170471191, + "step": 32540 + }, + { + "epoch": 4.620298083747339, + "grad_norm": 2.1429755687713623, + "learning_rate": 9.538183108587651e-05, + "loss": 0.10537341833114625, + "step": 32550 + }, + { + "epoch": 4.621717530163236, + "grad_norm": 5.316372394561768, + "learning_rate": 9.538041163946061e-05, + "loss": 0.04490730464458466, + "step": 32560 + }, + { + "epoch": 4.623136976579135, + "grad_norm": 11.811970710754395, + "learning_rate": 9.537899219304472e-05, + "loss": 0.08929653763771057, + "step": 32570 + }, + { + "epoch": 4.624556422995032, + "grad_norm": 1.3656327724456787, + "learning_rate": 9.537757274662882e-05, + "loss": 0.028151947259902953, + "step": 32580 + }, + { + "epoch": 4.625975869410929, + "grad_norm": 4.044888973236084, + "learning_rate": 9.537615330021293e-05, + "loss": 0.09239206910133362, + "step": 32590 + }, + { + "epoch": 4.627395315826828, + "grad_norm": 0.7978225350379944, + 
"learning_rate": 9.537473385379702e-05, + "loss": 0.030372977256774902, + "step": 32600 + }, + { + "epoch": 4.628814762242725, + "grad_norm": 0.3269311785697937, + "learning_rate": 9.537331440738114e-05, + "loss": 0.06625695824623108, + "step": 32610 + }, + { + "epoch": 4.6302342086586235, + "grad_norm": 0.9531834721565247, + "learning_rate": 9.537189496096522e-05, + "loss": 0.02768568992614746, + "step": 32620 + }, + { + "epoch": 4.631653655074521, + "grad_norm": 11.829211235046387, + "learning_rate": 9.537047551454933e-05, + "loss": 0.07944384813308716, + "step": 32630 + }, + { + "epoch": 4.633073101490419, + "grad_norm": 1.9724338054656982, + "learning_rate": 9.536905606813343e-05, + "loss": 0.05105016231536865, + "step": 32640 + }, + { + "epoch": 4.634492547906317, + "grad_norm": 5.076968669891357, + "learning_rate": 9.536763662171754e-05, + "loss": 0.03969843983650208, + "step": 32650 + }, + { + "epoch": 4.635911994322214, + "grad_norm": 5.4567437171936035, + "learning_rate": 9.536621717530164e-05, + "loss": 0.07335436344146729, + "step": 32660 + }, + { + "epoch": 4.637331440738112, + "grad_norm": 0.4687504470348358, + "learning_rate": 9.536479772888573e-05, + "loss": 0.08072584271430969, + "step": 32670 + }, + { + "epoch": 4.63875088715401, + "grad_norm": 10.989912986755371, + "learning_rate": 9.536337828246984e-05, + "loss": 0.13782428503036498, + "step": 32680 + }, + { + "epoch": 4.640170333569908, + "grad_norm": 10.409384727478027, + "learning_rate": 9.536195883605394e-05, + "loss": 0.07241227626800537, + "step": 32690 + }, + { + "epoch": 4.6415897799858055, + "grad_norm": 5.019614219665527, + "learning_rate": 9.536053938963805e-05, + "loss": 0.053052467107772824, + "step": 32700 + }, + { + "epoch": 4.643009226401704, + "grad_norm": 5.739016056060791, + "learning_rate": 9.535911994322215e-05, + "loss": 0.09582682847976684, + "step": 32710 + }, + { + "epoch": 4.644428672817601, + "grad_norm": 7.529049396514893, + "learning_rate": 9.535770049680625e-05, + 
"loss": 0.025439244508743287, + "step": 32720 + }, + { + "epoch": 4.645848119233499, + "grad_norm": 4.5299835205078125, + "learning_rate": 9.535628105039034e-05, + "loss": 0.04896810054779053, + "step": 32730 + }, + { + "epoch": 4.647267565649397, + "grad_norm": 1.1995518207550049, + "learning_rate": 9.535486160397446e-05, + "loss": 0.05114521980285645, + "step": 32740 + }, + { + "epoch": 4.648687012065294, + "grad_norm": 5.880154609680176, + "learning_rate": 9.535344215755855e-05, + "loss": 0.09556569457054138, + "step": 32750 + }, + { + "epoch": 4.650106458481193, + "grad_norm": 5.677839279174805, + "learning_rate": 9.535202271114266e-05, + "loss": 0.0534260630607605, + "step": 32760 + }, + { + "epoch": 4.65152590489709, + "grad_norm": 0.137897327542305, + "learning_rate": 9.535060326472676e-05, + "loss": 0.07294431924819947, + "step": 32770 + }, + { + "epoch": 4.652945351312988, + "grad_norm": 2.987269639968872, + "learning_rate": 9.534918381831086e-05, + "loss": 0.05872194766998291, + "step": 32780 + }, + { + "epoch": 4.654364797728886, + "grad_norm": 9.59643840789795, + "learning_rate": 9.534776437189497e-05, + "loss": 0.09373766779899598, + "step": 32790 + }, + { + "epoch": 4.655784244144783, + "grad_norm": 5.86336088180542, + "learning_rate": 9.534634492547907e-05, + "loss": 0.09089065194129944, + "step": 32800 + }, + { + "epoch": 4.6572036905606815, + "grad_norm": 9.171552658081055, + "learning_rate": 9.534492547906318e-05, + "loss": 0.04677242934703827, + "step": 32810 + }, + { + "epoch": 4.658623136976579, + "grad_norm": 8.410407066345215, + "learning_rate": 9.534350603264726e-05, + "loss": 0.045021438598632814, + "step": 32820 + }, + { + "epoch": 4.660042583392477, + "grad_norm": 8.053443908691406, + "learning_rate": 9.534208658623137e-05, + "loss": 0.06634845733642578, + "step": 32830 + }, + { + "epoch": 4.661462029808375, + "grad_norm": 10.485276222229004, + "learning_rate": 9.534066713981547e-05, + "loss": 0.092935448884964, + "step": 32840 + }, + { + 
"epoch": 4.662881476224273, + "grad_norm": 0.24282492697238922, + "learning_rate": 9.533924769339958e-05, + "loss": 0.046061572432518, + "step": 32850 + }, + { + "epoch": 4.66430092264017, + "grad_norm": 6.648067474365234, + "learning_rate": 9.533782824698369e-05, + "loss": 0.039595258235931394, + "step": 32860 + }, + { + "epoch": 4.665720369056068, + "grad_norm": 0.3748902678489685, + "learning_rate": 9.533640880056778e-05, + "loss": 0.06567533612251282, + "step": 32870 + }, + { + "epoch": 4.667139815471966, + "grad_norm": 1.109091877937317, + "learning_rate": 9.533498935415189e-05, + "loss": 0.07075968980789185, + "step": 32880 + }, + { + "epoch": 4.6685592618878635, + "grad_norm": 0.4307088255882263, + "learning_rate": 9.533356990773598e-05, + "loss": 0.03807471692562103, + "step": 32890 + }, + { + "epoch": 4.669978708303762, + "grad_norm": 11.682064056396484, + "learning_rate": 9.53321504613201e-05, + "loss": 0.06912864446640014, + "step": 32900 + }, + { + "epoch": 4.671398154719659, + "grad_norm": 4.182003974914551, + "learning_rate": 9.533073101490419e-05, + "loss": 0.04695388674736023, + "step": 32910 + }, + { + "epoch": 4.6728176011355576, + "grad_norm": 12.598423957824707, + "learning_rate": 9.532931156848829e-05, + "loss": 0.1024010419845581, + "step": 32920 + }, + { + "epoch": 4.674237047551455, + "grad_norm": 1.3875524997711182, + "learning_rate": 9.532789212207239e-05, + "loss": 0.06368948221206665, + "step": 32930 + }, + { + "epoch": 4.675656493967352, + "grad_norm": 2.8007404804229736, + "learning_rate": 9.53264726756565e-05, + "loss": 0.1000246286392212, + "step": 32940 + }, + { + "epoch": 4.677075940383251, + "grad_norm": 1.2782186269760132, + "learning_rate": 9.532505322924061e-05, + "loss": 0.10685174465179444, + "step": 32950 + }, + { + "epoch": 4.678495386799148, + "grad_norm": 1.8826717138290405, + "learning_rate": 9.53236337828247e-05, + "loss": 0.08655181527137756, + "step": 32960 + }, + { + "epoch": 4.679914833215046, + "grad_norm": 
4.107776641845703, + "learning_rate": 9.532221433640882e-05, + "loss": 0.0805110514163971, + "step": 32970 + }, + { + "epoch": 4.681334279630944, + "grad_norm": 5.16588830947876, + "learning_rate": 9.53207948899929e-05, + "loss": 0.0815092146396637, + "step": 32980 + }, + { + "epoch": 4.682753726046842, + "grad_norm": 2.872464179992676, + "learning_rate": 9.531937544357701e-05, + "loss": 0.03411953449249268, + "step": 32990 + }, + { + "epoch": 4.68417317246274, + "grad_norm": 0.06958237290382385, + "learning_rate": 9.531795599716111e-05, + "loss": 0.02626045048236847, + "step": 33000 + }, + { + "epoch": 4.68417317246274, + "eval_accuracy": 0.9627392382526865, + "eval_loss": 0.1083284318447113, + "eval_runtime": 36.6163, + "eval_samples_per_second": 429.509, + "eval_steps_per_second": 13.437, + "step": 33000 + }, + { + "epoch": 4.685592618878637, + "grad_norm": 3.2690742015838623, + "learning_rate": 9.531653655074522e-05, + "loss": 0.08817251324653626, + "step": 33010 + }, + { + "epoch": 4.687012065294535, + "grad_norm": 5.778581619262695, + "learning_rate": 9.531511710432932e-05, + "loss": 0.0841533899307251, + "step": 33020 + }, + { + "epoch": 4.688431511710433, + "grad_norm": 0.5700046420097351, + "learning_rate": 9.531369765791341e-05, + "loss": 0.08917995095252991, + "step": 33030 + }, + { + "epoch": 4.689850958126331, + "grad_norm": 1.3010187149047852, + "learning_rate": 9.531227821149753e-05, + "loss": 0.03966775238513946, + "step": 33040 + }, + { + "epoch": 4.691270404542228, + "grad_norm": 1.8215919733047485, + "learning_rate": 9.531085876508162e-05, + "loss": 0.029767876863479613, + "step": 33050 + }, + { + "epoch": 4.692689850958127, + "grad_norm": 7.450717926025391, + "learning_rate": 9.530943931866573e-05, + "loss": 0.07962870597839355, + "step": 33060 + }, + { + "epoch": 4.694109297374024, + "grad_norm": 4.49182653427124, + "learning_rate": 9.530801987224983e-05, + "loss": 0.09771084785461426, + "step": 33070 + }, + { + "epoch": 4.695528743789922, + 
"grad_norm": 2.3871476650238037, + "learning_rate": 9.530660042583393e-05, + "loss": 0.03243844509124756, + "step": 33080 + }, + { + "epoch": 4.69694819020582, + "grad_norm": 6.932815074920654, + "learning_rate": 9.530518097941803e-05, + "loss": 0.10589628219604492, + "step": 33090 + }, + { + "epoch": 4.698367636621717, + "grad_norm": 5.599147796630859, + "learning_rate": 9.530376153300214e-05, + "loss": 0.0905950427055359, + "step": 33100 + }, + { + "epoch": 4.699787083037616, + "grad_norm": 4.64316463470459, + "learning_rate": 9.530234208658623e-05, + "loss": 0.06359080076217652, + "step": 33110 + }, + { + "epoch": 4.701206529453513, + "grad_norm": 6.096071243286133, + "learning_rate": 9.530092264017035e-05, + "loss": 0.048346295952796936, + "step": 33120 + }, + { + "epoch": 4.702625975869411, + "grad_norm": 1.707812786102295, + "learning_rate": 9.529950319375444e-05, + "loss": 0.044125860929489134, + "step": 33130 + }, + { + "epoch": 4.704045422285309, + "grad_norm": 2.5328729152679443, + "learning_rate": 9.529808374733854e-05, + "loss": 0.0620121955871582, + "step": 33140 + }, + { + "epoch": 4.705464868701206, + "grad_norm": 2.9145994186401367, + "learning_rate": 9.529666430092265e-05, + "loss": 0.047010570764541626, + "step": 33150 + }, + { + "epoch": 4.7068843151171045, + "grad_norm": 4.588191509246826, + "learning_rate": 9.529524485450675e-05, + "loss": 0.037869596481323244, + "step": 33160 + }, + { + "epoch": 4.708303761533002, + "grad_norm": 8.48548698425293, + "learning_rate": 9.529382540809086e-05, + "loss": 0.08703064918518066, + "step": 33170 + }, + { + "epoch": 4.7097232079489, + "grad_norm": 0.5679498910903931, + "learning_rate": 9.529240596167494e-05, + "loss": 0.0526730477809906, + "step": 33180 + }, + { + "epoch": 4.711142654364798, + "grad_norm": 3.060013771057129, + "learning_rate": 9.529098651525905e-05, + "loss": 0.08811714053153992, + "step": 33190 + }, + { + "epoch": 4.712562100780696, + "grad_norm": 0.47054776549339294, + "learning_rate": 
9.528956706884315e-05, + "loss": 0.06622061729431153, + "step": 33200 + }, + { + "epoch": 4.713981547196593, + "grad_norm": 1.364973783493042, + "learning_rate": 9.528814762242726e-05, + "loss": 0.03104795217514038, + "step": 33210 + }, + { + "epoch": 4.715400993612491, + "grad_norm": 1.5248595476150513, + "learning_rate": 9.528672817601136e-05, + "loss": 0.06729813814163207, + "step": 33220 + }, + { + "epoch": 4.716820440028389, + "grad_norm": 0.41258639097213745, + "learning_rate": 9.528530872959546e-05, + "loss": 0.05668038725852966, + "step": 33230 + }, + { + "epoch": 4.7182398864442865, + "grad_norm": 3.3833577632904053, + "learning_rate": 9.528388928317957e-05, + "loss": 0.0714464783668518, + "step": 33240 + }, + { + "epoch": 4.719659332860185, + "grad_norm": 5.934215545654297, + "learning_rate": 9.528246983676367e-05, + "loss": 0.06380202770233154, + "step": 33250 + }, + { + "epoch": 4.721078779276082, + "grad_norm": 0.9173101186752319, + "learning_rate": 9.528119233498935e-05, + "loss": 0.064630788564682, + "step": 33260 + }, + { + "epoch": 4.7224982256919805, + "grad_norm": 4.4683709144592285, + "learning_rate": 9.527977288857346e-05, + "loss": 0.030341708660125734, + "step": 33270 + }, + { + "epoch": 4.723917672107878, + "grad_norm": 4.554975509643555, + "learning_rate": 9.527835344215756e-05, + "loss": 0.058397090435028075, + "step": 33280 + }, + { + "epoch": 4.725337118523775, + "grad_norm": 2.262747049331665, + "learning_rate": 9.527693399574167e-05, + "loss": 0.06394088268280029, + "step": 33290 + }, + { + "epoch": 4.726756564939674, + "grad_norm": 8.058440208435059, + "learning_rate": 9.527551454932577e-05, + "loss": 0.05340535044670105, + "step": 33300 + }, + { + "epoch": 4.728176011355571, + "grad_norm": 0.3765454888343811, + "learning_rate": 9.527409510290986e-05, + "loss": 0.0275752991437912, + "step": 33310 + }, + { + "epoch": 4.729595457771469, + "grad_norm": 5.589868545532227, + "learning_rate": 9.527267565649398e-05, + "loss": 
0.08953121900558472, + "step": 33320 + }, + { + "epoch": 4.731014904187367, + "grad_norm": 7.95912504196167, + "learning_rate": 9.527125621007807e-05, + "loss": 0.06377414464950562, + "step": 33330 + }, + { + "epoch": 4.732434350603265, + "grad_norm": 1.7751505374908447, + "learning_rate": 9.526983676366218e-05, + "loss": 0.12168240547180176, + "step": 33340 + }, + { + "epoch": 4.7338537970191625, + "grad_norm": 0.1998690515756607, + "learning_rate": 9.526841731724628e-05, + "loss": 0.07634644508361817, + "step": 33350 + }, + { + "epoch": 4.73527324343506, + "grad_norm": 6.665310382843018, + "learning_rate": 9.526699787083038e-05, + "loss": 0.0845515251159668, + "step": 33360 + }, + { + "epoch": 4.736692689850958, + "grad_norm": 6.774913311004639, + "learning_rate": 9.526557842441448e-05, + "loss": 0.07210742831230163, + "step": 33370 + }, + { + "epoch": 4.738112136266856, + "grad_norm": 10.878942489624023, + "learning_rate": 9.526415897799859e-05, + "loss": 0.09000579714775085, + "step": 33380 + }, + { + "epoch": 4.739531582682754, + "grad_norm": 11.169490814208984, + "learning_rate": 9.526273953158268e-05, + "loss": 0.09856179356575012, + "step": 33390 + }, + { + "epoch": 4.740951029098651, + "grad_norm": 3.1584696769714355, + "learning_rate": 9.52613200851668e-05, + "loss": 0.06054343581199646, + "step": 33400 + }, + { + "epoch": 4.74237047551455, + "grad_norm": 2.1179540157318115, + "learning_rate": 9.525990063875089e-05, + "loss": 0.08376701474189759, + "step": 33410 + }, + { + "epoch": 4.743789921930447, + "grad_norm": 7.283944606781006, + "learning_rate": 9.525848119233499e-05, + "loss": 0.0680974006652832, + "step": 33420 + }, + { + "epoch": 4.7452093683463445, + "grad_norm": 1.1434763669967651, + "learning_rate": 9.52570617459191e-05, + "loss": 0.04679420590400696, + "step": 33430 + }, + { + "epoch": 4.746628814762243, + "grad_norm": 2.1488308906555176, + "learning_rate": 9.52556422995032e-05, + "loss": 0.02475724071264267, + "step": 33440 + }, + { + 
"epoch": 4.74804826117814, + "grad_norm": 1.1518933773040771, + "learning_rate": 9.525422285308731e-05, + "loss": 0.0383320152759552, + "step": 33450 + }, + { + "epoch": 4.749467707594039, + "grad_norm": 3.2653729915618896, + "learning_rate": 9.525280340667139e-05, + "loss": 0.04667494595050812, + "step": 33460 + }, + { + "epoch": 4.750887154009936, + "grad_norm": 4.017123699188232, + "learning_rate": 9.52513839602555e-05, + "loss": 0.05600497722625732, + "step": 33470 + }, + { + "epoch": 4.752306600425834, + "grad_norm": 7.016024589538574, + "learning_rate": 9.52499645138396e-05, + "loss": 0.08545212745666504, + "step": 33480 + }, + { + "epoch": 4.753726046841732, + "grad_norm": 3.9017140865325928, + "learning_rate": 9.524854506742371e-05, + "loss": 0.042384487390518186, + "step": 33490 + }, + { + "epoch": 4.755145493257629, + "grad_norm": 4.046632289886475, + "learning_rate": 9.524712562100781e-05, + "loss": 0.10414813756942749, + "step": 33500 + }, + { + "epoch": 4.755145493257629, + "eval_accuracy": 0.9612767851465632, + "eval_loss": 0.12837225198745728, + "eval_runtime": 35.9316, + "eval_samples_per_second": 437.693, + "eval_steps_per_second": 13.693, + "step": 33500 + }, + { + "epoch": 4.7565649396735274, + "grad_norm": 4.732762813568115, + "learning_rate": 9.52457061745919e-05, + "loss": 0.07615302801132202, + "step": 33510 + }, + { + "epoch": 4.757984386089425, + "grad_norm": 1.0631474256515503, + "learning_rate": 9.524428672817602e-05, + "loss": 0.06459469199180604, + "step": 33520 + }, + { + "epoch": 4.759403832505323, + "grad_norm": 10.640356063842773, + "learning_rate": 9.524286728176011e-05, + "loss": 0.12269865274429322, + "step": 33530 + }, + { + "epoch": 4.760823278921221, + "grad_norm": 4.477070331573486, + "learning_rate": 9.524144783534423e-05, + "loss": 0.0742661714553833, + "step": 33540 + }, + { + "epoch": 4.762242725337119, + "grad_norm": 2.8761069774627686, + "learning_rate": 9.524002838892832e-05, + "loss": 0.07635018825531006, + "step": 
33550 + }, + { + "epoch": 4.763662171753016, + "grad_norm": 2.838202476501465, + "learning_rate": 9.523860894251242e-05, + "loss": 0.030411535501480104, + "step": 33560 + }, + { + "epoch": 4.765081618168914, + "grad_norm": 5.472594261169434, + "learning_rate": 9.523718949609652e-05, + "loss": 0.07412365674972535, + "step": 33570 + }, + { + "epoch": 4.766501064584812, + "grad_norm": 5.381545543670654, + "learning_rate": 9.523577004968063e-05, + "loss": 0.059850108623504636, + "step": 33580 + }, + { + "epoch": 4.7679205110007095, + "grad_norm": 3.6640396118164062, + "learning_rate": 9.523435060326473e-05, + "loss": 0.08601389527320862, + "step": 33590 + }, + { + "epoch": 4.769339957416608, + "grad_norm": 1.533301830291748, + "learning_rate": 9.523293115684884e-05, + "loss": 0.05381173491477966, + "step": 33600 + }, + { + "epoch": 4.770759403832505, + "grad_norm": 4.691380023956299, + "learning_rate": 9.523151171043293e-05, + "loss": 0.08660387992858887, + "step": 33610 + }, + { + "epoch": 4.7721788502484035, + "grad_norm": 0.0661584809422493, + "learning_rate": 9.523009226401703e-05, + "loss": 0.0433504581451416, + "step": 33620 + }, + { + "epoch": 4.773598296664301, + "grad_norm": 5.291693687438965, + "learning_rate": 9.522867281760114e-05, + "loss": 0.08284536600112916, + "step": 33630 + }, + { + "epoch": 4.775017743080198, + "grad_norm": 4.733127117156982, + "learning_rate": 9.522725337118524e-05, + "loss": 0.08289542198181152, + "step": 33640 + }, + { + "epoch": 4.776437189496097, + "grad_norm": 10.965649604797363, + "learning_rate": 9.522583392476935e-05, + "loss": 0.09481858015060425, + "step": 33650 + }, + { + "epoch": 4.777856635911994, + "grad_norm": 1.8173820972442627, + "learning_rate": 9.522441447835345e-05, + "loss": 0.06447114944458007, + "step": 33660 + }, + { + "epoch": 4.779276082327892, + "grad_norm": 9.157655715942383, + "learning_rate": 9.522299503193755e-05, + "loss": 0.10042716264724731, + "step": 33670 + }, + { + "epoch": 4.78069552874379, + 
"grad_norm": 1.8306244611740112, + "learning_rate": 9.522157558552164e-05, + "loss": 0.06390793323516845, + "step": 33680 + }, + { + "epoch": 4.782114975159688, + "grad_norm": 8.975295066833496, + "learning_rate": 9.522015613910575e-05, + "loss": 0.07697048783302307, + "step": 33690 + }, + { + "epoch": 4.7835344215755855, + "grad_norm": 6.4800543785095215, + "learning_rate": 9.521873669268985e-05, + "loss": 0.04765602946281433, + "step": 33700 + }, + { + "epoch": 4.784953867991483, + "grad_norm": 0.37081560492515564, + "learning_rate": 9.521731724627396e-05, + "loss": 0.055331355333328246, + "step": 33710 + }, + { + "epoch": 4.786373314407381, + "grad_norm": 0.17576156556606293, + "learning_rate": 9.521589779985806e-05, + "loss": 0.042742720246315, + "step": 33720 + }, + { + "epoch": 4.787792760823279, + "grad_norm": 5.604384422302246, + "learning_rate": 9.521447835344216e-05, + "loss": 0.1510116219520569, + "step": 33730 + }, + { + "epoch": 4.789212207239177, + "grad_norm": 1.1139140129089355, + "learning_rate": 9.521305890702627e-05, + "loss": 0.08178800940513611, + "step": 33740 + }, + { + "epoch": 4.790631653655074, + "grad_norm": 6.628493785858154, + "learning_rate": 9.521163946061037e-05, + "loss": 0.105251944065094, + "step": 33750 + }, + { + "epoch": 4.792051100070973, + "grad_norm": 6.405097484588623, + "learning_rate": 9.521022001419448e-05, + "loss": 0.03591142892837525, + "step": 33760 + }, + { + "epoch": 4.79347054648687, + "grad_norm": 1.633231282234192, + "learning_rate": 9.520880056777856e-05, + "loss": 0.04583961963653564, + "step": 33770 + }, + { + "epoch": 4.7948899929027675, + "grad_norm": 2.0780718326568604, + "learning_rate": 9.520738112136267e-05, + "loss": 0.06329174041748047, + "step": 33780 + }, + { + "epoch": 4.796309439318666, + "grad_norm": 7.537708759307861, + "learning_rate": 9.520596167494677e-05, + "loss": 0.07685714960098267, + "step": 33790 + }, + { + "epoch": 4.797728885734563, + "grad_norm": 0.6330421566963196, + 
"learning_rate": 9.520454222853088e-05, + "loss": 0.025817760825157167, + "step": 33800 + }, + { + "epoch": 4.7991483321504615, + "grad_norm": 1.9811617136001587, + "learning_rate": 9.520312278211499e-05, + "loss": 0.12827333211898803, + "step": 33810 + }, + { + "epoch": 4.800567778566359, + "grad_norm": 0.6081200838088989, + "learning_rate": 9.520170333569907e-05, + "loss": 0.033173537254333495, + "step": 33820 + }, + { + "epoch": 4.801987224982257, + "grad_norm": 4.308277130126953, + "learning_rate": 9.520028388928319e-05, + "loss": 0.03841128945350647, + "step": 33830 + }, + { + "epoch": 4.803406671398155, + "grad_norm": 3.689964532852173, + "learning_rate": 9.519886444286728e-05, + "loss": 0.06301182508468628, + "step": 33840 + }, + { + "epoch": 4.804826117814052, + "grad_norm": 3.0603621006011963, + "learning_rate": 9.51974449964514e-05, + "loss": 0.029316383600234985, + "step": 33850 + }, + { + "epoch": 4.80624556422995, + "grad_norm": 0.7296545505523682, + "learning_rate": 9.519602555003549e-05, + "loss": 0.07017461061477662, + "step": 33860 + }, + { + "epoch": 4.807665010645848, + "grad_norm": 0.6221994161605835, + "learning_rate": 9.519460610361959e-05, + "loss": 0.06105865836143494, + "step": 33870 + }, + { + "epoch": 4.809084457061746, + "grad_norm": 0.613722026348114, + "learning_rate": 9.519318665720369e-05, + "loss": 0.04575749039649964, + "step": 33880 + }, + { + "epoch": 4.810503903477644, + "grad_norm": 0.2880522310733795, + "learning_rate": 9.51917672107878e-05, + "loss": 0.035099685192108154, + "step": 33890 + }, + { + "epoch": 4.811923349893542, + "grad_norm": 0.7631796598434448, + "learning_rate": 9.519034776437191e-05, + "loss": 0.09773088097572327, + "step": 33900 + }, + { + "epoch": 4.813342796309439, + "grad_norm": 5.633825778961182, + "learning_rate": 9.5188928317956e-05, + "loss": 0.07228602170944214, + "step": 33910 + }, + { + "epoch": 4.814762242725337, + "grad_norm": 1.5585920810699463, + "learning_rate": 9.51875088715401e-05, + 
"loss": 0.04053950607776642, + "step": 33920 + }, + { + "epoch": 4.816181689141235, + "grad_norm": 1.0823962688446045, + "learning_rate": 9.51860894251242e-05, + "loss": 0.04932558238506317, + "step": 33930 + }, + { + "epoch": 4.817601135557132, + "grad_norm": 7.6794586181640625, + "learning_rate": 9.518466997870831e-05, + "loss": 0.049499303102493286, + "step": 33940 + }, + { + "epoch": 4.819020581973031, + "grad_norm": 2.137556314468384, + "learning_rate": 9.518325053229241e-05, + "loss": 0.048341837525367734, + "step": 33950 + }, + { + "epoch": 4.820440028388928, + "grad_norm": 3.8483757972717285, + "learning_rate": 9.518183108587652e-05, + "loss": 0.060046988725662234, + "step": 33960 + }, + { + "epoch": 4.8218594748048265, + "grad_norm": 6.301821708679199, + "learning_rate": 9.518041163946062e-05, + "loss": 0.10900142192840576, + "step": 33970 + }, + { + "epoch": 4.823278921220724, + "grad_norm": 8.898664474487305, + "learning_rate": 9.517899219304471e-05, + "loss": 0.05514363646507263, + "step": 33980 + }, + { + "epoch": 4.824698367636621, + "grad_norm": 12.41129207611084, + "learning_rate": 9.517757274662882e-05, + "loss": 0.12840945720672609, + "step": 33990 + }, + { + "epoch": 4.82611781405252, + "grad_norm": 2.5299203395843506, + "learning_rate": 9.517615330021292e-05, + "loss": 0.05092512369155884, + "step": 34000 + }, + { + "epoch": 4.82611781405252, + "eval_accuracy": 0.9595599923698099, + "eval_loss": 0.12192901968955994, + "eval_runtime": 35.0097, + "eval_samples_per_second": 449.218, + "eval_steps_per_second": 14.053, + "step": 34000 + }, + { + "epoch": 4.827537260468417, + "grad_norm": 0.23498234152793884, + "learning_rate": 9.517473385379703e-05, + "loss": 0.07249274253845214, + "step": 34010 + }, + { + "epoch": 4.828956706884315, + "grad_norm": 3.2055423259735107, + "learning_rate": 9.517331440738113e-05, + "loss": 0.05897048711776733, + "step": 34020 + }, + { + "epoch": 4.830376153300213, + "grad_norm": 1.8760758638381958, + "learning_rate": 
9.517189496096523e-05, + "loss": 0.05109198689460755, + "step": 34030 + }, + { + "epoch": 4.831795599716111, + "grad_norm": 6.842248916625977, + "learning_rate": 9.517047551454932e-05, + "loss": 0.07260550260543823, + "step": 34040 + }, + { + "epoch": 4.8332150461320085, + "grad_norm": 3.05704927444458, + "learning_rate": 9.516905606813344e-05, + "loss": 0.06829613447189331, + "step": 34050 + }, + { + "epoch": 4.834634492547906, + "grad_norm": 9.268196105957031, + "learning_rate": 9.516763662171753e-05, + "loss": 0.10247148275375366, + "step": 34060 + }, + { + "epoch": 4.836053938963804, + "grad_norm": 8.238924980163574, + "learning_rate": 9.516621717530164e-05, + "loss": 0.09007470607757569, + "step": 34070 + }, + { + "epoch": 4.837473385379702, + "grad_norm": 7.473556995391846, + "learning_rate": 9.516479772888574e-05, + "loss": 0.09785959720611573, + "step": 34080 + }, + { + "epoch": 4.8388928317956, + "grad_norm": 6.157594680786133, + "learning_rate": 9.516337828246984e-05, + "loss": 0.06533307433128357, + "step": 34090 + }, + { + "epoch": 4.840312278211497, + "grad_norm": 7.398861885070801, + "learning_rate": 9.516195883605395e-05, + "loss": 0.03795735538005829, + "step": 34100 + }, + { + "epoch": 4.841731724627396, + "grad_norm": 4.990109920501709, + "learning_rate": 9.516053938963805e-05, + "loss": 0.061510467529296876, + "step": 34110 + }, + { + "epoch": 4.843151171043293, + "grad_norm": 6.985716819763184, + "learning_rate": 9.515911994322216e-05, + "loss": 0.08015224933624268, + "step": 34120 + }, + { + "epoch": 4.8445706174591905, + "grad_norm": 0.4228348135948181, + "learning_rate": 9.515770049680624e-05, + "loss": 0.07228795886039734, + "step": 34130 + }, + { + "epoch": 4.845990063875089, + "grad_norm": 2.4188504219055176, + "learning_rate": 9.515628105039035e-05, + "loss": 0.038450679183006285, + "step": 34140 + }, + { + "epoch": 4.847409510290986, + "grad_norm": 0.691714882850647, + "learning_rate": 9.515486160397445e-05, + "loss": 
0.04819165468215943, + "step": 34150 + }, + { + "epoch": 4.8488289567068845, + "grad_norm": 4.017334938049316, + "learning_rate": 9.515344215755856e-05, + "loss": 0.03329123556613922, + "step": 34160 + }, + { + "epoch": 4.850248403122782, + "grad_norm": 5.272017478942871, + "learning_rate": 9.515202271114266e-05, + "loss": 0.07376419901847839, + "step": 34170 + }, + { + "epoch": 4.85166784953868, + "grad_norm": 4.051192760467529, + "learning_rate": 9.515060326472676e-05, + "loss": 0.06695409417152405, + "step": 34180 + }, + { + "epoch": 4.853087295954578, + "grad_norm": 1.7410699129104614, + "learning_rate": 9.514918381831087e-05, + "loss": 0.1089176893234253, + "step": 34190 + }, + { + "epoch": 4.854506742370475, + "grad_norm": 1.8117938041687012, + "learning_rate": 9.514776437189496e-05, + "loss": 0.10838677883148193, + "step": 34200 + }, + { + "epoch": 4.855926188786373, + "grad_norm": 2.6073532104492188, + "learning_rate": 9.514634492547908e-05, + "loss": 0.05355033278465271, + "step": 34210 + }, + { + "epoch": 4.857345635202271, + "grad_norm": 5.410396099090576, + "learning_rate": 9.514492547906317e-05, + "loss": 0.06793102025985717, + "step": 34220 + }, + { + "epoch": 4.858765081618169, + "grad_norm": 6.987936019897461, + "learning_rate": 9.514350603264727e-05, + "loss": 0.07279337644577026, + "step": 34230 + }, + { + "epoch": 4.8601845280340665, + "grad_norm": 8.075188636779785, + "learning_rate": 9.514208658623137e-05, + "loss": 0.040511229634284975, + "step": 34240 + }, + { + "epoch": 4.861603974449965, + "grad_norm": 1.9225119352340698, + "learning_rate": 9.514066713981548e-05, + "loss": 0.0797190546989441, + "step": 34250 + }, + { + "epoch": 4.863023420865862, + "grad_norm": 4.796272277832031, + "learning_rate": 9.513924769339958e-05, + "loss": 0.03872755765914917, + "step": 34260 + }, + { + "epoch": 4.86444286728176, + "grad_norm": 12.368243217468262, + "learning_rate": 9.513782824698369e-05, + "loss": 0.05745645761489868, + "step": 34270 + }, + { + 
"epoch": 4.865862313697658, + "grad_norm": 0.11512895673513412, + "learning_rate": 9.513640880056778e-05, + "loss": 0.040886858105659486, + "step": 34280 + }, + { + "epoch": 4.867281760113555, + "grad_norm": 8.322936058044434, + "learning_rate": 9.513498935415188e-05, + "loss": 0.0884483814239502, + "step": 34290 + }, + { + "epoch": 4.868701206529454, + "grad_norm": 7.160366535186768, + "learning_rate": 9.513356990773599e-05, + "loss": 0.09933829307556152, + "step": 34300 + }, + { + "epoch": 4.870120652945351, + "grad_norm": 0.4381280243396759, + "learning_rate": 9.513215046132009e-05, + "loss": 0.049053031206130984, + "step": 34310 + }, + { + "epoch": 4.871540099361249, + "grad_norm": 3.679687738418579, + "learning_rate": 9.51307310149042e-05, + "loss": 0.08237858414649964, + "step": 34320 + }, + { + "epoch": 4.872959545777147, + "grad_norm": 0.3392900228500366, + "learning_rate": 9.51293115684883e-05, + "loss": 0.07077882885932922, + "step": 34330 + }, + { + "epoch": 4.874378992193044, + "grad_norm": 14.285609245300293, + "learning_rate": 9.51278921220724e-05, + "loss": 0.13571418523788453, + "step": 34340 + }, + { + "epoch": 4.875798438608943, + "grad_norm": 5.558465957641602, + "learning_rate": 9.512647267565649e-05, + "loss": 0.11963750123977661, + "step": 34350 + }, + { + "epoch": 4.87721788502484, + "grad_norm": 5.402983665466309, + "learning_rate": 9.51250532292406e-05, + "loss": 0.06695018410682678, + "step": 34360 + }, + { + "epoch": 4.878637331440738, + "grad_norm": 4.293505668640137, + "learning_rate": 9.51236337828247e-05, + "loss": 0.04254850745201111, + "step": 34370 + }, + { + "epoch": 4.880056777856636, + "grad_norm": 6.603433609008789, + "learning_rate": 9.512221433640881e-05, + "loss": 0.06627193689346314, + "step": 34380 + }, + { + "epoch": 4.881476224272534, + "grad_norm": 2.177635431289673, + "learning_rate": 9.512079488999291e-05, + "loss": 0.03508804738521576, + "step": 34390 + }, + { + "epoch": 4.8828956706884314, + "grad_norm": 
0.21414713561534882, + "learning_rate": 9.5119375443577e-05, + "loss": 0.07482952475547791, + "step": 34400 + }, + { + "epoch": 4.884315117104329, + "grad_norm": 1.777644157409668, + "learning_rate": 9.511795599716112e-05, + "loss": 0.03625571131706238, + "step": 34410 + }, + { + "epoch": 4.885734563520227, + "grad_norm": 1.2347965240478516, + "learning_rate": 9.511653655074521e-05, + "loss": 0.04854299426078797, + "step": 34420 + }, + { + "epoch": 4.887154009936125, + "grad_norm": 1.3775534629821777, + "learning_rate": 9.511511710432933e-05, + "loss": 0.03984416425228119, + "step": 34430 + }, + { + "epoch": 4.888573456352023, + "grad_norm": 2.4557316303253174, + "learning_rate": 9.511369765791341e-05, + "loss": 0.0749228298664093, + "step": 34440 + }, + { + "epoch": 4.88999290276792, + "grad_norm": 9.009647369384766, + "learning_rate": 9.511227821149752e-05, + "loss": 0.09757251739501953, + "step": 34450 + }, + { + "epoch": 4.891412349183819, + "grad_norm": 0.7110661864280701, + "learning_rate": 9.511085876508162e-05, + "loss": 0.08272533416748047, + "step": 34460 + }, + { + "epoch": 4.892831795599716, + "grad_norm": 2.4361939430236816, + "learning_rate": 9.510943931866573e-05, + "loss": 0.042311400175094604, + "step": 34470 + }, + { + "epoch": 4.8942512420156135, + "grad_norm": 0.9503983855247498, + "learning_rate": 9.510801987224983e-05, + "loss": 0.026289787888526917, + "step": 34480 + }, + { + "epoch": 4.895670688431512, + "grad_norm": 0.559457004070282, + "learning_rate": 9.510660042583392e-05, + "loss": 0.028341618180274964, + "step": 34490 + }, + { + "epoch": 4.897090134847409, + "grad_norm": 9.252950668334961, + "learning_rate": 9.510518097941803e-05, + "loss": 0.11836056709289551, + "step": 34500 + }, + { + "epoch": 4.897090134847409, + "eval_accuracy": 0.9775545240668914, + "eval_loss": 0.06881918758153915, + "eval_runtime": 34.9972, + "eval_samples_per_second": 449.378, + "eval_steps_per_second": 14.058, + "step": 34500 + }, + { + "epoch": 
4.8985095812633075, + "grad_norm": 1.470131516456604, + "learning_rate": 9.510376153300213e-05, + "loss": 0.06065631508827209, + "step": 34510 + }, + { + "epoch": 4.899929027679205, + "grad_norm": 0.8848506212234497, + "learning_rate": 9.510234208658624e-05, + "loss": 0.10929383039474487, + "step": 34520 + }, + { + "epoch": 4.901348474095103, + "grad_norm": 1.5781108140945435, + "learning_rate": 9.510092264017034e-05, + "loss": 0.055705088376998904, + "step": 34530 + }, + { + "epoch": 4.902767920511001, + "grad_norm": 4.346358299255371, + "learning_rate": 9.509950319375444e-05, + "loss": 0.040107375383377074, + "step": 34540 + }, + { + "epoch": 4.904187366926898, + "grad_norm": 3.9102401733398438, + "learning_rate": 9.509808374733854e-05, + "loss": 0.07414058446884156, + "step": 34550 + }, + { + "epoch": 4.905606813342796, + "grad_norm": 1.8277835845947266, + "learning_rate": 9.509666430092265e-05, + "loss": 0.04326414465904236, + "step": 34560 + }, + { + "epoch": 4.907026259758694, + "grad_norm": 5.239001750946045, + "learning_rate": 9.509524485450674e-05, + "loss": 0.07349801659584046, + "step": 34570 + }, + { + "epoch": 4.908445706174592, + "grad_norm": 0.15750083327293396, + "learning_rate": 9.509382540809085e-05, + "loss": 0.06307926177978515, + "step": 34580 + }, + { + "epoch": 4.9098651525904895, + "grad_norm": 1.198663353919983, + "learning_rate": 9.509240596167495e-05, + "loss": 0.05986272692680359, + "step": 34590 + }, + { + "epoch": 4.911284599006388, + "grad_norm": 0.8945283889770508, + "learning_rate": 9.509098651525905e-05, + "loss": 0.050188446044921876, + "step": 34600 + }, + { + "epoch": 4.912704045422285, + "grad_norm": 5.8996124267578125, + "learning_rate": 9.508956706884316e-05, + "loss": 0.12322105169296264, + "step": 34610 + }, + { + "epoch": 4.914123491838183, + "grad_norm": 4.365360260009766, + "learning_rate": 9.508814762242726e-05, + "loss": 0.06297153234481812, + "step": 34620 + }, + { + "epoch": 4.915542938254081, + "grad_norm": 
11.717504501342773, + "learning_rate": 9.508672817601137e-05, + "loss": 0.0508009672164917, + "step": 34630 + }, + { + "epoch": 4.916962384669978, + "grad_norm": 5.38115930557251, + "learning_rate": 9.508530872959545e-05, + "loss": 0.04837482571601868, + "step": 34640 + }, + { + "epoch": 4.918381831085877, + "grad_norm": 7.246404647827148, + "learning_rate": 9.508388928317956e-05, + "loss": 0.05574921369552612, + "step": 34650 + }, + { + "epoch": 4.919801277501774, + "grad_norm": 1.3808271884918213, + "learning_rate": 9.508246983676366e-05, + "loss": 0.030999797582626342, + "step": 34660 + }, + { + "epoch": 4.921220723917672, + "grad_norm": 12.624173164367676, + "learning_rate": 9.508105039034777e-05, + "loss": 0.0817840576171875, + "step": 34670 + }, + { + "epoch": 4.92264017033357, + "grad_norm": 4.791901111602783, + "learning_rate": 9.507963094393187e-05, + "loss": 0.1045034408569336, + "step": 34680 + }, + { + "epoch": 4.924059616749467, + "grad_norm": 1.011415719985962, + "learning_rate": 9.507821149751598e-05, + "loss": 0.05733692049980164, + "step": 34690 + }, + { + "epoch": 4.9254790631653655, + "grad_norm": 3.5056240558624268, + "learning_rate": 9.507679205110008e-05, + "loss": 0.03916033804416656, + "step": 34700 + }, + { + "epoch": 4.926898509581263, + "grad_norm": 3.432095527648926, + "learning_rate": 9.507537260468417e-05, + "loss": 0.05065256357192993, + "step": 34710 + }, + { + "epoch": 4.928317955997161, + "grad_norm": 5.719847679138184, + "learning_rate": 9.507395315826829e-05, + "loss": 0.03222036361694336, + "step": 34720 + }, + { + "epoch": 4.929737402413059, + "grad_norm": 8.551188468933105, + "learning_rate": 9.507253371185238e-05, + "loss": 0.12375338077545166, + "step": 34730 + }, + { + "epoch": 4.931156848828957, + "grad_norm": 7.09042501449585, + "learning_rate": 9.50711142654365e-05, + "loss": 0.06138598918914795, + "step": 34740 + }, + { + "epoch": 4.932576295244854, + "grad_norm": 13.028380393981934, + "learning_rate": 
9.506969481902058e-05, + "loss": 0.09170422554016114, + "step": 34750 + }, + { + "epoch": 4.933995741660752, + "grad_norm": 0.827090859413147, + "learning_rate": 9.506827537260469e-05, + "loss": 0.07565316557884216, + "step": 34760 + }, + { + "epoch": 4.93541518807665, + "grad_norm": 8.198225021362305, + "learning_rate": 9.506685592618879e-05, + "loss": 0.06974472999572753, + "step": 34770 + }, + { + "epoch": 4.936834634492548, + "grad_norm": 4.147625923156738, + "learning_rate": 9.50654364797729e-05, + "loss": 0.04362180233001709, + "step": 34780 + }, + { + "epoch": 4.938254080908446, + "grad_norm": 0.3720574975013733, + "learning_rate": 9.5064017033357e-05, + "loss": 0.034629127383232115, + "step": 34790 + }, + { + "epoch": 4.939673527324343, + "grad_norm": 0.04120853543281555, + "learning_rate": 9.506259758694109e-05, + "loss": 0.08741816282272338, + "step": 34800 + }, + { + "epoch": 4.941092973740242, + "grad_norm": 2.3259832859039307, + "learning_rate": 9.50611781405252e-05, + "loss": 0.04545274972915649, + "step": 34810 + }, + { + "epoch": 4.942512420156139, + "grad_norm": 6.789836406707764, + "learning_rate": 9.50597586941093e-05, + "loss": 0.05636190176010132, + "step": 34820 + }, + { + "epoch": 4.943931866572036, + "grad_norm": 3.173494815826416, + "learning_rate": 9.505833924769341e-05, + "loss": 0.050359517335891724, + "step": 34830 + }, + { + "epoch": 4.945351312987935, + "grad_norm": 1.0671789646148682, + "learning_rate": 9.505691980127751e-05, + "loss": 0.06796733736991882, + "step": 34840 + }, + { + "epoch": 4.946770759403832, + "grad_norm": 0.4082445800304413, + "learning_rate": 9.50555003548616e-05, + "loss": 0.06941262483596802, + "step": 34850 + }, + { + "epoch": 4.9481902058197305, + "grad_norm": 12.27849292755127, + "learning_rate": 9.50540809084457e-05, + "loss": 0.10136985778808594, + "step": 34860 + }, + { + "epoch": 4.949609652235628, + "grad_norm": 9.443519592285156, + "learning_rate": 9.505266146202981e-05, + "loss": 0.07273834943771362, 
+ "step": 34870 + }, + { + "epoch": 4.951029098651526, + "grad_norm": 1.492857575416565, + "learning_rate": 9.505124201561391e-05, + "loss": 0.07884240746498108, + "step": 34880 + }, + { + "epoch": 4.952448545067424, + "grad_norm": 0.09069759398698807, + "learning_rate": 9.504982256919802e-05, + "loss": 0.03405750691890717, + "step": 34890 + }, + { + "epoch": 4.953867991483321, + "grad_norm": 3.4466402530670166, + "learning_rate": 9.504840312278212e-05, + "loss": 0.060506463050842285, + "step": 34900 + }, + { + "epoch": 4.955287437899219, + "grad_norm": 5.695678234100342, + "learning_rate": 9.504698367636622e-05, + "loss": 0.08132978677749633, + "step": 34910 + }, + { + "epoch": 4.956706884315117, + "grad_norm": 1.3141629695892334, + "learning_rate": 9.504556422995033e-05, + "loss": 0.08595213890075684, + "step": 34920 + }, + { + "epoch": 4.958126330731015, + "grad_norm": 1.775640606880188, + "learning_rate": 9.504414478353443e-05, + "loss": 0.08925971984863282, + "step": 34930 + }, + { + "epoch": 4.9595457771469125, + "grad_norm": 1.661397933959961, + "learning_rate": 9.504272533711854e-05, + "loss": 0.057273763418197635, + "step": 34940 + }, + { + "epoch": 4.960965223562811, + "grad_norm": 6.331634998321533, + "learning_rate": 9.504130589070262e-05, + "loss": 0.06684795022010803, + "step": 34950 + }, + { + "epoch": 4.962384669978708, + "grad_norm": 3.3791844844818115, + "learning_rate": 9.503988644428673e-05, + "loss": 0.02721550464630127, + "step": 34960 + }, + { + "epoch": 4.963804116394606, + "grad_norm": 3.0752956867218018, + "learning_rate": 9.503846699787083e-05, + "loss": 0.07388071417808532, + "step": 34970 + }, + { + "epoch": 4.965223562810504, + "grad_norm": 4.732667922973633, + "learning_rate": 9.503704755145494e-05, + "loss": 0.049740397930145265, + "step": 34980 + }, + { + "epoch": 4.966643009226401, + "grad_norm": 0.340250700712204, + "learning_rate": 9.503562810503904e-05, + "loss": 0.051161551475524904, + "step": 34990 + }, + { + "epoch": 
4.9680624556423, + "grad_norm": 2.337019443511963, + "learning_rate": 9.503420865862313e-05, + "loss": 0.08789908289909362, + "step": 35000 + }, + { + "epoch": 4.9680624556423, + "eval_accuracy": 0.96986074903033, + "eval_loss": 0.09208517521619797, + "eval_runtime": 35.1679, + "eval_samples_per_second": 447.198, + "eval_steps_per_second": 13.99, + "step": 35000 + }, + { + "epoch": 4.969481902058197, + "grad_norm": 1.2752114534378052, + "learning_rate": 9.503278921220724e-05, + "loss": 0.031662821769714355, + "step": 35010 + }, + { + "epoch": 4.970901348474095, + "grad_norm": 4.962484359741211, + "learning_rate": 9.503136976579134e-05, + "loss": 0.03634861707687378, + "step": 35020 + }, + { + "epoch": 4.972320794889993, + "grad_norm": 3.9700536727905273, + "learning_rate": 9.502995031937545e-05, + "loss": 0.09397113919258118, + "step": 35030 + }, + { + "epoch": 4.97374024130589, + "grad_norm": 1.4326708316802979, + "learning_rate": 9.502853087295955e-05, + "loss": 0.055845755338668826, + "step": 35040 + }, + { + "epoch": 4.9751596877217885, + "grad_norm": 0.2545025646686554, + "learning_rate": 9.502711142654366e-05, + "loss": 0.07820132970809937, + "step": 35050 + }, + { + "epoch": 4.976579134137686, + "grad_norm": 0.48985761404037476, + "learning_rate": 9.502569198012775e-05, + "loss": 0.050520259141922, + "step": 35060 + }, + { + "epoch": 4.977998580553584, + "grad_norm": 2.2398428916931152, + "learning_rate": 9.502427253371186e-05, + "loss": 0.061135333776473996, + "step": 35070 + }, + { + "epoch": 4.979418026969482, + "grad_norm": 1.8560504913330078, + "learning_rate": 9.502285308729595e-05, + "loss": 0.04997407197952271, + "step": 35080 + }, + { + "epoch": 4.98083747338538, + "grad_norm": 16.011417388916016, + "learning_rate": 9.502143364088006e-05, + "loss": 0.1147346019744873, + "step": 35090 + }, + { + "epoch": 4.982256919801277, + "grad_norm": 3.327463388442993, + "learning_rate": 9.502001419446418e-05, + "loss": 0.06455175876617432, + "step": 35100 + }, + 
{ + "epoch": 4.983676366217175, + "grad_norm": 2.0864782333374023, + "learning_rate": 9.501859474804826e-05, + "loss": 0.11143285036087036, + "step": 35110 + }, + { + "epoch": 4.985095812633073, + "grad_norm": 2.2053961753845215, + "learning_rate": 9.501717530163237e-05, + "loss": 0.05626400113105774, + "step": 35120 + }, + { + "epoch": 4.9865152590489705, + "grad_norm": 1.0315237045288086, + "learning_rate": 9.501575585521647e-05, + "loss": 0.04532181918621063, + "step": 35130 + }, + { + "epoch": 4.987934705464869, + "grad_norm": 5.396690845489502, + "learning_rate": 9.501433640880058e-05, + "loss": 0.10503163337707519, + "step": 35140 + }, + { + "epoch": 4.989354151880766, + "grad_norm": 10.326635360717773, + "learning_rate": 9.501291696238468e-05, + "loss": 0.09536985158920289, + "step": 35150 + }, + { + "epoch": 4.990773598296665, + "grad_norm": 1.1461143493652344, + "learning_rate": 9.501149751596877e-05, + "loss": 0.07849235534667968, + "step": 35160 + }, + { + "epoch": 4.992193044712562, + "grad_norm": 5.380537033081055, + "learning_rate": 9.501007806955287e-05, + "loss": 0.09231789708137512, + "step": 35170 + }, + { + "epoch": 4.99361249112846, + "grad_norm": 3.530773639678955, + "learning_rate": 9.500865862313698e-05, + "loss": 0.1057739019393921, + "step": 35180 + }, + { + "epoch": 4.995031937544358, + "grad_norm": 7.351833820343018, + "learning_rate": 9.500723917672109e-05, + "loss": 0.09145522713661194, + "step": 35190 + }, + { + "epoch": 4.996451383960255, + "grad_norm": 16.75145721435547, + "learning_rate": 9.500581973030519e-05, + "loss": 0.06758233308792114, + "step": 35200 + }, + { + "epoch": 4.997870830376153, + "grad_norm": 2.8542025089263916, + "learning_rate": 9.500440028388929e-05, + "loss": 0.0541307270526886, + "step": 35210 + }, + { + "epoch": 4.999290276792051, + "grad_norm": 0.6312423348426819, + "learning_rate": 9.500298083747338e-05, + "loss": 0.09540512561798095, + "step": 35220 + }, + { + "epoch": 5.000709723207949, + "grad_norm": 
2.2039694786071777, + "learning_rate": 9.50015613910575e-05, + "loss": 0.07570767402648926, + "step": 35230 + }, + { + "epoch": 5.002129169623847, + "grad_norm": 3.426652431488037, + "learning_rate": 9.500014194464159e-05, + "loss": 0.07772423624992371, + "step": 35240 + }, + { + "epoch": 5.003548616039745, + "grad_norm": 7.052762985229492, + "learning_rate": 9.49987224982257e-05, + "loss": 0.0919622004032135, + "step": 35250 + }, + { + "epoch": 5.004968062455642, + "grad_norm": 1.8353569507598877, + "learning_rate": 9.499730305180979e-05, + "loss": 0.07758974432945251, + "step": 35260 + }, + { + "epoch": 5.00638750887154, + "grad_norm": 0.22881099581718445, + "learning_rate": 9.49958836053939e-05, + "loss": 0.06562204360961914, + "step": 35270 + }, + { + "epoch": 5.007806955287438, + "grad_norm": 6.86403751373291, + "learning_rate": 9.499446415897801e-05, + "loss": 0.056407433748245236, + "step": 35280 + }, + { + "epoch": 5.009226401703335, + "grad_norm": 4.984923362731934, + "learning_rate": 9.49930447125621e-05, + "loss": 0.04461563229560852, + "step": 35290 + }, + { + "epoch": 5.010645848119234, + "grad_norm": 6.596005916595459, + "learning_rate": 9.499162526614622e-05, + "loss": 0.08564714789390564, + "step": 35300 + }, + { + "epoch": 5.012065294535131, + "grad_norm": 1.0412707328796387, + "learning_rate": 9.49902058197303e-05, + "loss": 0.05729523301124573, + "step": 35310 + }, + { + "epoch": 5.0134847409510295, + "grad_norm": 2.90087628364563, + "learning_rate": 9.498878637331441e-05, + "loss": 0.032120704650878906, + "step": 35320 + }, + { + "epoch": 5.014904187366927, + "grad_norm": 4.575965404510498, + "learning_rate": 9.498736692689851e-05, + "loss": 0.07489084005355835, + "step": 35330 + }, + { + "epoch": 5.016323633782824, + "grad_norm": 7.065843105316162, + "learning_rate": 9.498594748048262e-05, + "loss": 0.10670671463012696, + "step": 35340 + }, + { + "epoch": 5.017743080198723, + "grad_norm": 0.6793409585952759, + "learning_rate": 
9.498452803406672e-05, + "loss": 0.09802578687667847, + "step": 35350 + }, + { + "epoch": 5.01916252661462, + "grad_norm": 5.977087020874023, + "learning_rate": 9.498310858765082e-05, + "loss": 0.06022813320159912, + "step": 35360 + }, + { + "epoch": 5.020581973030518, + "grad_norm": 5.02726936340332, + "learning_rate": 9.498168914123493e-05, + "loss": 0.04116607308387756, + "step": 35370 + }, + { + "epoch": 5.022001419446416, + "grad_norm": 2.7055563926696777, + "learning_rate": 9.498026969481902e-05, + "loss": 0.02340538948774338, + "step": 35380 + }, + { + "epoch": 5.023420865862314, + "grad_norm": 8.319058418273926, + "learning_rate": 9.497885024840313e-05, + "loss": 0.03942314982414245, + "step": 35390 + }, + { + "epoch": 5.0248403122782115, + "grad_norm": 1.0293197631835938, + "learning_rate": 9.497743080198723e-05, + "loss": 0.03575259149074554, + "step": 35400 + }, + { + "epoch": 5.026259758694109, + "grad_norm": 4.049915790557861, + "learning_rate": 9.497601135557134e-05, + "loss": 0.06752086877822876, + "step": 35410 + }, + { + "epoch": 5.027679205110007, + "grad_norm": 0.5083687901496887, + "learning_rate": 9.497459190915543e-05, + "loss": 0.04853183031082153, + "step": 35420 + }, + { + "epoch": 5.029098651525905, + "grad_norm": 0.34340715408325195, + "learning_rate": 9.497317246273954e-05, + "loss": 0.04110357165336609, + "step": 35430 + }, + { + "epoch": 5.030518097941803, + "grad_norm": 6.67487096786499, + "learning_rate": 9.497175301632364e-05, + "loss": 0.0675375759601593, + "step": 35440 + }, + { + "epoch": 5.0319375443577, + "grad_norm": 2.9003336429595947, + "learning_rate": 9.497033356990775e-05, + "loss": 0.034189680218696596, + "step": 35450 + }, + { + "epoch": 5.033356990773599, + "grad_norm": 3.9349417686462402, + "learning_rate": 9.496891412349184e-05, + "loss": 0.021588873863220216, + "step": 35460 + }, + { + "epoch": 5.034776437189496, + "grad_norm": 0.9912833571434021, + "learning_rate": 9.496749467707594e-05, + "loss": 
0.05585495829582214, + "step": 35470 + }, + { + "epoch": 5.0361958836053935, + "grad_norm": 1.4849556684494019, + "learning_rate": 9.496607523066005e-05, + "loss": 0.05193337798118591, + "step": 35480 + }, + { + "epoch": 5.037615330021292, + "grad_norm": 1.7137938737869263, + "learning_rate": 9.496465578424415e-05, + "loss": 0.030359289050102232, + "step": 35490 + }, + { + "epoch": 5.039034776437189, + "grad_norm": 5.067256450653076, + "learning_rate": 9.496323633782826e-05, + "loss": 0.061655843257904054, + "step": 35500 + }, + { + "epoch": 5.039034776437189, + "eval_accuracy": 0.9721498060660011, + "eval_loss": 0.08636458963155746, + "eval_runtime": 34.3468, + "eval_samples_per_second": 457.888, + "eval_steps_per_second": 14.324, + "step": 35500 + }, + { + "epoch": 5.0404542228530875, + "grad_norm": 2.4241161346435547, + "learning_rate": 9.496181689141236e-05, + "loss": 0.046561521291732785, + "step": 35510 + }, + { + "epoch": 5.041873669268985, + "grad_norm": 0.9589247703552246, + "learning_rate": 9.496039744499645e-05, + "loss": 0.028034707903861998, + "step": 35520 + }, + { + "epoch": 5.043293115684883, + "grad_norm": 0.2947952449321747, + "learning_rate": 9.495897799858055e-05, + "loss": 0.08396986126899719, + "step": 35530 + }, + { + "epoch": 5.044712562100781, + "grad_norm": 3.244903564453125, + "learning_rate": 9.495755855216466e-05, + "loss": 0.04211449921131134, + "step": 35540 + }, + { + "epoch": 5.046132008516678, + "grad_norm": 8.584826469421387, + "learning_rate": 9.495613910574876e-05, + "loss": 0.06612651944160461, + "step": 35550 + }, + { + "epoch": 5.047551454932576, + "grad_norm": 4.459551811218262, + "learning_rate": 9.495471965933287e-05, + "loss": 0.040356886386871335, + "step": 35560 + }, + { + "epoch": 5.048970901348474, + "grad_norm": 9.547006607055664, + "learning_rate": 9.495330021291697e-05, + "loss": 0.04492848515510559, + "step": 35570 + }, + { + "epoch": 5.050390347764372, + "grad_norm": 4.3004374504089355, + "learning_rate": 
9.495188076650107e-05, + "loss": 0.05426920652389526, + "step": 35580 + }, + { + "epoch": 5.0518097941802695, + "grad_norm": 1.8567450046539307, + "learning_rate": 9.495046132008518e-05, + "loss": 0.028099411725997926, + "step": 35590 + }, + { + "epoch": 5.053229240596168, + "grad_norm": 2.021097183227539, + "learning_rate": 9.494904187366927e-05, + "loss": 0.04831460118293762, + "step": 35600 + }, + { + "epoch": 5.054648687012065, + "grad_norm": 6.291840076446533, + "learning_rate": 9.494762242725339e-05, + "loss": 0.04899407923221588, + "step": 35610 + }, + { + "epoch": 5.056068133427963, + "grad_norm": 2.291241407394409, + "learning_rate": 9.494620298083747e-05, + "loss": 0.052272289991378784, + "step": 35620 + }, + { + "epoch": 5.057487579843861, + "grad_norm": 9.583127975463867, + "learning_rate": 9.494478353442158e-05, + "loss": 0.05495100021362305, + "step": 35630 + }, + { + "epoch": 5.058907026259758, + "grad_norm": 7.2725830078125, + "learning_rate": 9.494336408800568e-05, + "loss": 0.07137655615806579, + "step": 35640 + }, + { + "epoch": 5.060326472675657, + "grad_norm": 9.047038078308105, + "learning_rate": 9.494194464158979e-05, + "loss": 0.0471313625574112, + "step": 35650 + }, + { + "epoch": 5.061745919091554, + "grad_norm": 3.2609193325042725, + "learning_rate": 9.494052519517389e-05, + "loss": 0.0600260317325592, + "step": 35660 + }, + { + "epoch": 5.063165365507452, + "grad_norm": 3.8911261558532715, + "learning_rate": 9.493910574875798e-05, + "loss": 0.05886413455009461, + "step": 35670 + }, + { + "epoch": 5.06458481192335, + "grad_norm": 3.445101737976074, + "learning_rate": 9.49376863023421e-05, + "loss": 0.0769159197807312, + "step": 35680 + }, + { + "epoch": 5.066004258339247, + "grad_norm": 4.021439552307129, + "learning_rate": 9.493626685592619e-05, + "loss": 0.08424595594406128, + "step": 35690 + }, + { + "epoch": 5.067423704755146, + "grad_norm": 1.7676706314086914, + "learning_rate": 9.49348474095103e-05, + "loss": 0.029396337270736695, + 
"step": 35700 + }, + { + "epoch": 5.068843151171043, + "grad_norm": 4.876907825469971, + "learning_rate": 9.49334279630944e-05, + "loss": 0.035354167222976685, + "step": 35710 + }, + { + "epoch": 5.070262597586941, + "grad_norm": 0.8973761796951294, + "learning_rate": 9.49320085166785e-05, + "loss": 0.04921911954879761, + "step": 35720 + }, + { + "epoch": 5.071682044002839, + "grad_norm": 10.738030433654785, + "learning_rate": 9.49305890702626e-05, + "loss": 0.10539579391479492, + "step": 35730 + }, + { + "epoch": 5.073101490418737, + "grad_norm": 0.2019427865743637, + "learning_rate": 9.49291696238467e-05, + "loss": 0.0512103259563446, + "step": 35740 + }, + { + "epoch": 5.0745209368346345, + "grad_norm": 5.051251411437988, + "learning_rate": 9.49277501774308e-05, + "loss": 0.06000564694404602, + "step": 35750 + }, + { + "epoch": 5.075940383250532, + "grad_norm": 2.901967763900757, + "learning_rate": 9.492633073101491e-05, + "loss": 0.07291017174720764, + "step": 35760 + }, + { + "epoch": 5.07735982966643, + "grad_norm": 1.4676152467727661, + "learning_rate": 9.492491128459901e-05, + "loss": 0.06764619946479797, + "step": 35770 + }, + { + "epoch": 5.078779276082328, + "grad_norm": 11.876858711242676, + "learning_rate": 9.492349183818311e-05, + "loss": 0.060235893726348876, + "step": 35780 + }, + { + "epoch": 5.080198722498226, + "grad_norm": 9.063863754272461, + "learning_rate": 9.492207239176722e-05, + "loss": 0.03448966443538666, + "step": 35790 + }, + { + "epoch": 5.081618168914123, + "grad_norm": 1.5545835494995117, + "learning_rate": 9.492065294535132e-05, + "loss": 0.06110445261001587, + "step": 35800 + }, + { + "epoch": 5.083037615330022, + "grad_norm": 3.2116994857788086, + "learning_rate": 9.491923349893543e-05, + "loss": 0.06111306548118591, + "step": 35810 + }, + { + "epoch": 5.084457061745919, + "grad_norm": 0.2644821107387543, + "learning_rate": 9.491781405251953e-05, + "loss": 0.05563850402832031, + "step": 35820 + }, + { + "epoch": 
5.0858765081618165, + "grad_norm": 2.6873834133148193, + "learning_rate": 9.491639460610362e-05, + "loss": 0.08348619937896729, + "step": 35830 + }, + { + "epoch": 5.087295954577715, + "grad_norm": 6.330774784088135, + "learning_rate": 9.491497515968772e-05, + "loss": 0.07018688321113586, + "step": 35840 + }, + { + "epoch": 5.088715400993612, + "grad_norm": 1.0025646686553955, + "learning_rate": 9.491355571327183e-05, + "loss": 0.050104659795761106, + "step": 35850 + }, + { + "epoch": 5.0901348474095105, + "grad_norm": 2.3368682861328125, + "learning_rate": 9.491213626685593e-05, + "loss": 0.04211297333240509, + "step": 35860 + }, + { + "epoch": 5.091554293825408, + "grad_norm": 0.6230148077011108, + "learning_rate": 9.491071682044004e-05, + "loss": 0.049572864174842836, + "step": 35870 + }, + { + "epoch": 5.092973740241306, + "grad_norm": 0.10699428617954254, + "learning_rate": 9.490929737402414e-05, + "loss": 0.00935342162847519, + "step": 35880 + }, + { + "epoch": 5.094393186657204, + "grad_norm": 6.48928689956665, + "learning_rate": 9.490787792760823e-05, + "loss": 0.041819396615028384, + "step": 35890 + }, + { + "epoch": 5.095812633073101, + "grad_norm": 4.529843807220459, + "learning_rate": 9.490645848119234e-05, + "loss": 0.03696204125881195, + "step": 35900 + }, + { + "epoch": 5.097232079488999, + "grad_norm": 6.805991172790527, + "learning_rate": 9.490503903477644e-05, + "loss": 0.055303680896759036, + "step": 35910 + }, + { + "epoch": 5.098651525904897, + "grad_norm": 1.9501713514328003, + "learning_rate": 9.490361958836055e-05, + "loss": 0.11662576198577881, + "step": 35920 + }, + { + "epoch": 5.100070972320795, + "grad_norm": 0.07518387585878372, + "learning_rate": 9.490220014194464e-05, + "loss": 0.029073578119277955, + "step": 35930 + }, + { + "epoch": 5.1014904187366925, + "grad_norm": 0.3094475567340851, + "learning_rate": 9.490078069552875e-05, + "loss": 0.01400664895772934, + "step": 35940 + }, + { + "epoch": 5.102909865152591, + "grad_norm": 
16.787120819091797, + "learning_rate": 9.489936124911285e-05, + "loss": 0.06732473373413086, + "step": 35950 + }, + { + "epoch": 5.104329311568488, + "grad_norm": 0.21593448519706726, + "learning_rate": 9.489794180269696e-05, + "loss": 0.042519426345825194, + "step": 35960 + }, + { + "epoch": 5.105748757984386, + "grad_norm": 5.305930137634277, + "learning_rate": 9.489652235628105e-05, + "loss": 0.05054143667221069, + "step": 35970 + }, + { + "epoch": 5.107168204400284, + "grad_norm": 8.940295219421387, + "learning_rate": 9.489510290986515e-05, + "loss": 0.06974593400955201, + "step": 35980 + }, + { + "epoch": 5.108587650816181, + "grad_norm": 0.44597089290618896, + "learning_rate": 9.489368346344926e-05, + "loss": 0.11333968639373779, + "step": 35990 + }, + { + "epoch": 5.11000709723208, + "grad_norm": 1.9616674184799194, + "learning_rate": 9.489226401703336e-05, + "loss": 0.03930683135986328, + "step": 36000 + }, + { + "epoch": 5.11000709723208, + "eval_accuracy": 0.9638201818528646, + "eval_loss": 0.12829390168190002, + "eval_runtime": 35.5371, + "eval_samples_per_second": 442.551, + "eval_steps_per_second": 13.845, + "step": 36000 + }, + { + "epoch": 5.111426543647977, + "grad_norm": 0.7007933855056763, + "learning_rate": 9.489084457061747e-05, + "loss": 0.07342724800109864, + "step": 36010 + }, + { + "epoch": 5.112845990063875, + "grad_norm": 8.156115531921387, + "learning_rate": 9.488942512420157e-05, + "loss": 0.05496933460235596, + "step": 36020 + }, + { + "epoch": 5.114265436479773, + "grad_norm": 8.341107368469238, + "learning_rate": 9.488800567778566e-05, + "loss": 0.07536699771881103, + "step": 36030 + }, + { + "epoch": 5.115684882895671, + "grad_norm": 2.163313627243042, + "learning_rate": 9.488658623136976e-05, + "loss": 0.06941324472427368, + "step": 36040 + }, + { + "epoch": 5.1171043293115686, + "grad_norm": 7.8382887840271, + "learning_rate": 9.488516678495387e-05, + "loss": 0.0580519437789917, + "step": 36050 + }, + { + "epoch": 
5.118523775727466, + "grad_norm": 0.9207919239997864, + "learning_rate": 9.488374733853797e-05, + "loss": 0.08848693370819091, + "step": 36060 + }, + { + "epoch": 5.119943222143364, + "grad_norm": 4.699718475341797, + "learning_rate": 9.488232789212208e-05, + "loss": 0.05136229991912842, + "step": 36070 + }, + { + "epoch": 5.121362668559262, + "grad_norm": 4.970333099365234, + "learning_rate": 9.488090844570618e-05, + "loss": 0.04451129138469696, + "step": 36080 + }, + { + "epoch": 5.12278211497516, + "grad_norm": 0.04377421736717224, + "learning_rate": 9.487948899929028e-05, + "loss": 0.04046821594238281, + "step": 36090 + }, + { + "epoch": 5.124201561391057, + "grad_norm": 0.19261276721954346, + "learning_rate": 9.487806955287439e-05, + "loss": 0.04373805820941925, + "step": 36100 + }, + { + "epoch": 5.125621007806956, + "grad_norm": 4.2963056564331055, + "learning_rate": 9.487665010645848e-05, + "loss": 0.021022433042526247, + "step": 36110 + }, + { + "epoch": 5.127040454222853, + "grad_norm": 7.420901775360107, + "learning_rate": 9.48752306600426e-05, + "loss": 0.0632928729057312, + "step": 36120 + }, + { + "epoch": 5.128459900638751, + "grad_norm": 5.440396785736084, + "learning_rate": 9.487381121362669e-05, + "loss": 0.09147984385490418, + "step": 36130 + }, + { + "epoch": 5.129879347054649, + "grad_norm": 7.403855800628662, + "learning_rate": 9.487239176721079e-05, + "loss": 0.05113822817802429, + "step": 36140 + }, + { + "epoch": 5.131298793470546, + "grad_norm": 1.6538256406784058, + "learning_rate": 9.487097232079489e-05, + "loss": 0.06598106026649475, + "step": 36150 + }, + { + "epoch": 5.132718239886445, + "grad_norm": 0.8752590417861938, + "learning_rate": 9.4869552874379e-05, + "loss": 0.03561938405036926, + "step": 36160 + }, + { + "epoch": 5.134137686302342, + "grad_norm": 2.622938632965088, + "learning_rate": 9.48681334279631e-05, + "loss": 0.08233972787857055, + "step": 36170 + }, + { + "epoch": 5.13555713271824, + "grad_norm": 
0.23157648742198944, + "learning_rate": 9.486671398154721e-05, + "loss": 0.06924783587455749, + "step": 36180 + }, + { + "epoch": 5.136976579134138, + "grad_norm": 2.9473605155944824, + "learning_rate": 9.48652945351313e-05, + "loss": 0.07875468730926513, + "step": 36190 + }, + { + "epoch": 5.138396025550035, + "grad_norm": 10.965639114379883, + "learning_rate": 9.48638750887154e-05, + "loss": 0.058244621753692626, + "step": 36200 + }, + { + "epoch": 5.1398154719659335, + "grad_norm": 0.24280737340450287, + "learning_rate": 9.486245564229951e-05, + "loss": 0.07738088965415954, + "step": 36210 + }, + { + "epoch": 5.141234918381831, + "grad_norm": 7.49735164642334, + "learning_rate": 9.486103619588361e-05, + "loss": 0.03476256728172302, + "step": 36220 + }, + { + "epoch": 5.142654364797729, + "grad_norm": 1.5520763397216797, + "learning_rate": 9.485961674946772e-05, + "loss": 0.04197014570236206, + "step": 36230 + }, + { + "epoch": 5.144073811213627, + "grad_norm": 4.975586414337158, + "learning_rate": 9.48581973030518e-05, + "loss": 0.06271924376487732, + "step": 36240 + }, + { + "epoch": 5.145493257629525, + "grad_norm": 7.479091644287109, + "learning_rate": 9.485677785663592e-05, + "loss": 0.08511911630630493, + "step": 36250 + }, + { + "epoch": 5.146912704045422, + "grad_norm": 0.14201731979846954, + "learning_rate": 9.485535841022001e-05, + "loss": 0.03889679312705994, + "step": 36260 + }, + { + "epoch": 5.14833215046132, + "grad_norm": 1.5078015327453613, + "learning_rate": 9.485393896380412e-05, + "loss": 0.07731766104698182, + "step": 36270 + }, + { + "epoch": 5.149751596877218, + "grad_norm": 0.5280294418334961, + "learning_rate": 9.485251951738822e-05, + "loss": 0.03280209302902222, + "step": 36280 + }, + { + "epoch": 5.1511710432931155, + "grad_norm": 1.5268537998199463, + "learning_rate": 9.485110007097232e-05, + "loss": 0.03509989082813263, + "step": 36290 + }, + { + "epoch": 5.152590489709014, + "grad_norm": 0.8695167303085327, + "learning_rate": 
9.484968062455643e-05, + "loss": 0.016241730749607088, + "step": 36300 + }, + { + "epoch": 5.154009936124911, + "grad_norm": 7.062769889831543, + "learning_rate": 9.484826117814053e-05, + "loss": 0.06393226981163025, + "step": 36310 + }, + { + "epoch": 5.1554293825408095, + "grad_norm": 8.141936302185059, + "learning_rate": 9.484684173172464e-05, + "loss": 0.037215083837509155, + "step": 36320 + }, + { + "epoch": 5.156848828956707, + "grad_norm": 1.9921444654464722, + "learning_rate": 9.484542228530874e-05, + "loss": 0.04678789079189301, + "step": 36330 + }, + { + "epoch": 5.158268275372604, + "grad_norm": 0.44700711965560913, + "learning_rate": 9.484400283889283e-05, + "loss": 0.0651922881603241, + "step": 36340 + }, + { + "epoch": 5.159687721788503, + "grad_norm": 1.856458067893982, + "learning_rate": 9.484258339247693e-05, + "loss": 0.04245249330997467, + "step": 36350 + }, + { + "epoch": 5.1611071682044, + "grad_norm": 10.253634452819824, + "learning_rate": 9.484116394606104e-05, + "loss": 0.07513232231140136, + "step": 36360 + }, + { + "epoch": 5.162526614620298, + "grad_norm": 0.31568190455436707, + "learning_rate": 9.483974449964514e-05, + "loss": 0.08399287462234498, + "step": 36370 + }, + { + "epoch": 5.163946061036196, + "grad_norm": 1.2396879196166992, + "learning_rate": 9.483832505322925e-05, + "loss": 0.11422721147537232, + "step": 36380 + }, + { + "epoch": 5.165365507452094, + "grad_norm": 4.058791160583496, + "learning_rate": 9.483690560681335e-05, + "loss": 0.07308083772659302, + "step": 36390 + }, + { + "epoch": 5.1667849538679915, + "grad_norm": 5.865930557250977, + "learning_rate": 9.483548616039744e-05, + "loss": 0.057479435205459596, + "step": 36400 + }, + { + "epoch": 5.168204400283889, + "grad_norm": 13.093545913696289, + "learning_rate": 9.483406671398155e-05, + "loss": 0.050871860980987546, + "step": 36410 + }, + { + "epoch": 5.169623846699787, + "grad_norm": 2.405416250228882, + "learning_rate": 9.483264726756565e-05, + "loss": 
0.02025536894798279, + "step": 36420 + }, + { + "epoch": 5.171043293115685, + "grad_norm": 0.4331030249595642, + "learning_rate": 9.483122782114976e-05, + "loss": 0.01977370083332062, + "step": 36430 + }, + { + "epoch": 5.172462739531583, + "grad_norm": 3.3781073093414307, + "learning_rate": 9.482980837473385e-05, + "loss": 0.041225132346153257, + "step": 36440 + }, + { + "epoch": 5.17388218594748, + "grad_norm": 7.368563175201416, + "learning_rate": 9.482838892831796e-05, + "loss": 0.0713624656200409, + "step": 36450 + }, + { + "epoch": 5.175301632363379, + "grad_norm": 4.2806878089904785, + "learning_rate": 9.482696948190206e-05, + "loss": 0.044384431838989255, + "step": 36460 + }, + { + "epoch": 5.176721078779276, + "grad_norm": 1.133421778678894, + "learning_rate": 9.482555003548617e-05, + "loss": 0.0395077645778656, + "step": 36470 + }, + { + "epoch": 5.1781405251951735, + "grad_norm": 5.6397929191589355, + "learning_rate": 9.482413058907026e-05, + "loss": 0.07517208456993103, + "step": 36480 + }, + { + "epoch": 5.179559971611072, + "grad_norm": 4.359251499176025, + "learning_rate": 9.482271114265437e-05, + "loss": 0.05270699858665466, + "step": 36490 + }, + { + "epoch": 5.180979418026969, + "grad_norm": 0.030488723888993263, + "learning_rate": 9.482129169623847e-05, + "loss": 0.0488809198141098, + "step": 36500 + }, + { + "epoch": 5.180979418026969, + "eval_accuracy": 0.9735486742544669, + "eval_loss": 0.07985691726207733, + "eval_runtime": 34.4709, + "eval_samples_per_second": 456.24, + "eval_steps_per_second": 14.273, + "step": 36500 + }, + { + "epoch": 5.182398864442868, + "grad_norm": 4.528557777404785, + "learning_rate": 9.481987224982257e-05, + "loss": 0.050593554973602295, + "step": 36510 + }, + { + "epoch": 5.183818310858765, + "grad_norm": 11.558876991271973, + "learning_rate": 9.481845280340668e-05, + "loss": 0.07353735566139222, + "step": 36520 + }, + { + "epoch": 5.185237757274663, + "grad_norm": 5.571136474609375, + "learning_rate": 
9.481703335699078e-05, + "loss": 0.046426203846931455, + "step": 36530 + }, + { + "epoch": 5.186657203690561, + "grad_norm": 5.435025215148926, + "learning_rate": 9.481561391057489e-05, + "loss": 0.0408222883939743, + "step": 36540 + }, + { + "epoch": 5.188076650106458, + "grad_norm": 5.731179714202881, + "learning_rate": 9.481419446415897e-05, + "loss": 0.07564018964767456, + "step": 36550 + }, + { + "epoch": 5.189496096522356, + "grad_norm": 2.5727474689483643, + "learning_rate": 9.481277501774308e-05, + "loss": 0.05871484279632568, + "step": 36560 + }, + { + "epoch": 5.190915542938254, + "grad_norm": 12.591144561767578, + "learning_rate": 9.481135557132718e-05, + "loss": 0.08494226336479187, + "step": 36570 + }, + { + "epoch": 5.192334989354152, + "grad_norm": 4.414670944213867, + "learning_rate": 9.480993612491129e-05, + "loss": 0.05334811806678772, + "step": 36580 + }, + { + "epoch": 5.19375443577005, + "grad_norm": 0.2967151403427124, + "learning_rate": 9.48085166784954e-05, + "loss": 0.020326825976371764, + "step": 36590 + }, + { + "epoch": 5.195173882185948, + "grad_norm": 2.503615617752075, + "learning_rate": 9.480709723207949e-05, + "loss": 0.025067511200904845, + "step": 36600 + }, + { + "epoch": 5.196593328601845, + "grad_norm": 0.2529163062572479, + "learning_rate": 9.48056777856636e-05, + "loss": 0.018686428666114807, + "step": 36610 + }, + { + "epoch": 5.198012775017743, + "grad_norm": 2.891233444213867, + "learning_rate": 9.48042583392477e-05, + "loss": 0.06542560458183289, + "step": 36620 + }, + { + "epoch": 5.199432221433641, + "grad_norm": 6.781946182250977, + "learning_rate": 9.48028388928318e-05, + "loss": 0.03696680366992951, + "step": 36630 + }, + { + "epoch": 5.2008516678495385, + "grad_norm": 0.25263258814811707, + "learning_rate": 9.48014194464159e-05, + "loss": 0.05601266026496887, + "step": 36640 + }, + { + "epoch": 5.202271114265437, + "grad_norm": 3.6889445781707764, + "learning_rate": 9.48e-05, + "loss": 0.04601848125457764, + "step": 
36650 + }, + { + "epoch": 5.203690560681334, + "grad_norm": 6.032188415527344, + "learning_rate": 9.47985805535841e-05, + "loss": 0.03191192746162415, + "step": 36660 + }, + { + "epoch": 5.2051100070972325, + "grad_norm": 2.5597052574157715, + "learning_rate": 9.479716110716821e-05, + "loss": 0.08680691719055175, + "step": 36670 + }, + { + "epoch": 5.20652945351313, + "grad_norm": 1.3042011260986328, + "learning_rate": 9.479574166075232e-05, + "loss": 0.07259045839309693, + "step": 36680 + }, + { + "epoch": 5.207948899929027, + "grad_norm": 6.466866970062256, + "learning_rate": 9.479432221433642e-05, + "loss": 0.1063350796699524, + "step": 36690 + }, + { + "epoch": 5.209368346344926, + "grad_norm": 3.7391083240509033, + "learning_rate": 9.479290276792051e-05, + "loss": 0.06057397127151489, + "step": 36700 + }, + { + "epoch": 5.210787792760823, + "grad_norm": 5.033336162567139, + "learning_rate": 9.479148332150461e-05, + "loss": 0.03979503214359283, + "step": 36710 + }, + { + "epoch": 5.212207239176721, + "grad_norm": 4.238401889801025, + "learning_rate": 9.479006387508872e-05, + "loss": 0.04253645837306976, + "step": 36720 + }, + { + "epoch": 5.213626685592619, + "grad_norm": 6.895720958709717, + "learning_rate": 9.478864442867282e-05, + "loss": 0.0989041805267334, + "step": 36730 + }, + { + "epoch": 5.215046132008517, + "grad_norm": 0.8805387616157532, + "learning_rate": 9.478722498225693e-05, + "loss": 0.06594863533973694, + "step": 36740 + }, + { + "epoch": 5.2164655784244145, + "grad_norm": 0.778193473815918, + "learning_rate": 9.478580553584101e-05, + "loss": 0.04552145004272461, + "step": 36750 + }, + { + "epoch": 5.217885024840312, + "grad_norm": 1.0025285482406616, + "learning_rate": 9.478438608942513e-05, + "loss": 0.09854941368103028, + "step": 36760 + }, + { + "epoch": 5.21930447125621, + "grad_norm": 2.777564764022827, + "learning_rate": 9.478296664300924e-05, + "loss": 0.07063305974006653, + "step": 36770 + }, + { + "epoch": 5.220723917672108, + 
"grad_norm": 6.000669479370117, + "learning_rate": 9.478154719659333e-05, + "loss": 0.04376820921897888, + "step": 36780 + }, + { + "epoch": 5.222143364088006, + "grad_norm": 9.674980163574219, + "learning_rate": 9.478012775017744e-05, + "loss": 0.039281606674194336, + "step": 36790 + }, + { + "epoch": 5.223562810503903, + "grad_norm": 1.4637360572814941, + "learning_rate": 9.477870830376154e-05, + "loss": 0.04983239769935608, + "step": 36800 + }, + { + "epoch": 5.224982256919802, + "grad_norm": 0.23426099121570587, + "learning_rate": 9.477728885734564e-05, + "loss": 0.07621067762374878, + "step": 36810 + }, + { + "epoch": 5.226401703335699, + "grad_norm": 0.1280343383550644, + "learning_rate": 9.477586941092974e-05, + "loss": 0.07941646575927734, + "step": 36820 + }, + { + "epoch": 5.2278211497515965, + "grad_norm": 6.921216011047363, + "learning_rate": 9.477444996451385e-05, + "loss": 0.08770122528076171, + "step": 36830 + }, + { + "epoch": 5.229240596167495, + "grad_norm": 0.5032868981361389, + "learning_rate": 9.477303051809795e-05, + "loss": 0.04545081257820129, + "step": 36840 + }, + { + "epoch": 5.230660042583392, + "grad_norm": 0.8603189587593079, + "learning_rate": 9.477161107168206e-05, + "loss": 0.09249699711799622, + "step": 36850 + }, + { + "epoch": 5.2320794889992905, + "grad_norm": 3.6071572303771973, + "learning_rate": 9.477019162526615e-05, + "loss": 0.08706681132316589, + "step": 36860 + }, + { + "epoch": 5.233498935415188, + "grad_norm": 0.3033102750778198, + "learning_rate": 9.476877217885025e-05, + "loss": 0.05687382221221924, + "step": 36870 + }, + { + "epoch": 5.234918381831086, + "grad_norm": 1.9104326963424683, + "learning_rate": 9.476735273243436e-05, + "loss": 0.05650158524513245, + "step": 36880 + }, + { + "epoch": 5.236337828246984, + "grad_norm": 0.33066046237945557, + "learning_rate": 9.476593328601846e-05, + "loss": 0.03637203574180603, + "step": 36890 + }, + { + "epoch": 5.237757274662881, + "grad_norm": 9.79216194152832, + 
"learning_rate": 9.476451383960257e-05, + "loss": 0.10526165962219239, + "step": 36900 + }, + { + "epoch": 5.239176721078779, + "grad_norm": 0.6268110871315002, + "learning_rate": 9.476309439318665e-05, + "loss": 0.05048244595527649, + "step": 36910 + }, + { + "epoch": 5.240596167494677, + "grad_norm": 3.2916431427001953, + "learning_rate": 9.476167494677077e-05, + "loss": 0.10257794857025146, + "step": 36920 + }, + { + "epoch": 5.242015613910575, + "grad_norm": 0.32357263565063477, + "learning_rate": 9.476025550035486e-05, + "loss": 0.034938329458236696, + "step": 36930 + }, + { + "epoch": 5.2434350603264726, + "grad_norm": 9.606283187866211, + "learning_rate": 9.475883605393897e-05, + "loss": 0.08114267587661743, + "step": 36940 + }, + { + "epoch": 5.244854506742371, + "grad_norm": 5.870957851409912, + "learning_rate": 9.475741660752307e-05, + "loss": 0.06058700084686279, + "step": 36950 + }, + { + "epoch": 5.246273953158268, + "grad_norm": 4.593484401702881, + "learning_rate": 9.475599716110717e-05, + "loss": 0.05402443408966064, + "step": 36960 + }, + { + "epoch": 5.247693399574166, + "grad_norm": 1.3845820426940918, + "learning_rate": 9.475457771469128e-05, + "loss": 0.047877585887908934, + "step": 36970 + }, + { + "epoch": 5.249112845990064, + "grad_norm": 2.0945143699645996, + "learning_rate": 9.475315826827538e-05, + "loss": 0.05654643177986145, + "step": 36980 + }, + { + "epoch": 5.250532292405961, + "grad_norm": 5.1305131912231445, + "learning_rate": 9.475173882185949e-05, + "loss": 0.09039227962493897, + "step": 36990 + }, + { + "epoch": 5.25195173882186, + "grad_norm": 0.9498898983001709, + "learning_rate": 9.475031937544358e-05, + "loss": 0.05969501733779907, + "step": 37000 + }, + { + "epoch": 5.25195173882186, + "eval_accuracy": 0.9731671647485216, + "eval_loss": 0.08746004849672318, + "eval_runtime": 33.9754, + "eval_samples_per_second": 462.894, + "eval_steps_per_second": 14.481, + "step": 37000 + }, + { + "epoch": 5.253371185237757, + "grad_norm": 
0.6349175572395325, + "learning_rate": 9.474889992902768e-05, + "loss": 0.05443019866943359, + "step": 37010 + }, + { + "epoch": 5.2547906316536555, + "grad_norm": 5.677085876464844, + "learning_rate": 9.474748048261178e-05, + "loss": 0.0985885202884674, + "step": 37020 + }, + { + "epoch": 5.256210078069553, + "grad_norm": 8.88677978515625, + "learning_rate": 9.474606103619589e-05, + "loss": 0.13014729022979737, + "step": 37030 + }, + { + "epoch": 5.25762952448545, + "grad_norm": 5.704558372497559, + "learning_rate": 9.474464158977999e-05, + "loss": 0.043677717447280884, + "step": 37040 + }, + { + "epoch": 5.259048970901349, + "grad_norm": 4.478132724761963, + "learning_rate": 9.47432221433641e-05, + "loss": 0.0692388117313385, + "step": 37050 + }, + { + "epoch": 5.260468417317246, + "grad_norm": 0.5155969262123108, + "learning_rate": 9.47418026969482e-05, + "loss": 0.028971996903419495, + "step": 37060 + }, + { + "epoch": 5.261887863733144, + "grad_norm": 0.47446149587631226, + "learning_rate": 9.47403832505323e-05, + "loss": 0.0572409987449646, + "step": 37070 + }, + { + "epoch": 5.263307310149042, + "grad_norm": 1.3588443994522095, + "learning_rate": 9.47389638041164e-05, + "loss": 0.08811056017875671, + "step": 37080 + }, + { + "epoch": 5.26472675656494, + "grad_norm": 4.575846195220947, + "learning_rate": 9.47375443577005e-05, + "loss": 0.0533088743686676, + "step": 37090 + }, + { + "epoch": 5.2661462029808375, + "grad_norm": 3.1430861949920654, + "learning_rate": 9.473612491128461e-05, + "loss": 0.08389832377433777, + "step": 37100 + }, + { + "epoch": 5.267565649396735, + "grad_norm": 0.3461446166038513, + "learning_rate": 9.47347054648687e-05, + "loss": 0.07585886120796204, + "step": 37110 + }, + { + "epoch": 5.268985095812633, + "grad_norm": 7.318383693695068, + "learning_rate": 9.473328601845281e-05, + "loss": 0.05129183530807495, + "step": 37120 + }, + { + "epoch": 5.270404542228531, + "grad_norm": 3.846140146255493, + "learning_rate": 
9.47318665720369e-05, + "loss": 0.041881787776947024, + "step": 37130 + }, + { + "epoch": 5.271823988644429, + "grad_norm": 6.777071952819824, + "learning_rate": 9.473044712562102e-05, + "loss": 0.0678529143333435, + "step": 37140 + }, + { + "epoch": 5.273243435060326, + "grad_norm": 4.96095085144043, + "learning_rate": 9.472902767920511e-05, + "loss": 0.04452368021011353, + "step": 37150 + }, + { + "epoch": 5.274662881476225, + "grad_norm": 3.6459364891052246, + "learning_rate": 9.472760823278922e-05, + "loss": 0.061465442180633545, + "step": 37160 + }, + { + "epoch": 5.276082327892122, + "grad_norm": 2.436566114425659, + "learning_rate": 9.472618878637332e-05, + "loss": 0.05536556839942932, + "step": 37170 + }, + { + "epoch": 5.2775017743080195, + "grad_norm": 7.050469398498535, + "learning_rate": 9.472476933995742e-05, + "loss": 0.04715914726257324, + "step": 37180 + }, + { + "epoch": 5.278921220723918, + "grad_norm": 1.642188549041748, + "learning_rate": 9.472334989354153e-05, + "loss": 0.03356336355209351, + "step": 37190 + }, + { + "epoch": 5.280340667139815, + "grad_norm": 0.2856753468513489, + "learning_rate": 9.472193044712563e-05, + "loss": 0.03416549563407898, + "step": 37200 + }, + { + "epoch": 5.2817601135557135, + "grad_norm": 3.587663173675537, + "learning_rate": 9.472051100070974e-05, + "loss": 0.05653611421585083, + "step": 37210 + }, + { + "epoch": 5.283179559971611, + "grad_norm": 6.947723865509033, + "learning_rate": 9.471909155429382e-05, + "loss": 0.091396963596344, + "step": 37220 + }, + { + "epoch": 5.284599006387509, + "grad_norm": 7.335968971252441, + "learning_rate": 9.471767210787793e-05, + "loss": 0.03362211585044861, + "step": 37230 + }, + { + "epoch": 5.286018452803407, + "grad_norm": 0.9858488440513611, + "learning_rate": 9.471625266146203e-05, + "loss": 0.04085931479930878, + "step": 37240 + }, + { + "epoch": 5.287437899219304, + "grad_norm": 5.122075080871582, + "learning_rate": 9.471483321504614e-05, + "loss": 0.05024126172065735, 
+ "step": 37250 + }, + { + "epoch": 5.288857345635202, + "grad_norm": Infinity, + "learning_rate": 9.471341376863024e-05, + "loss": 0.06409624218940735, + "step": 37260 + }, + { + "epoch": 5.2902767920511, + "grad_norm": 7.42424201965332, + "learning_rate": 9.471213626685594e-05, + "loss": 0.06915678977966308, + "step": 37270 + }, + { + "epoch": 5.291696238466998, + "grad_norm": 0.18197417259216309, + "learning_rate": 9.471071682044003e-05, + "loss": 0.12947027683258056, + "step": 37280 + }, + { + "epoch": 5.2931156848828955, + "grad_norm": 1.1386280059814453, + "learning_rate": 9.470929737402413e-05, + "loss": 0.0524638831615448, + "step": 37290 + }, + { + "epoch": 5.294535131298794, + "grad_norm": 2.983981132507324, + "learning_rate": 9.470787792760823e-05, + "loss": 0.051947909593582156, + "step": 37300 + }, + { + "epoch": 5.295954577714691, + "grad_norm": 0.2438533753156662, + "learning_rate": 9.470645848119234e-05, + "loss": 0.03344468176364899, + "step": 37310 + }, + { + "epoch": 5.297374024130589, + "grad_norm": 1.9252828359603882, + "learning_rate": 9.470503903477644e-05, + "loss": 0.04618232250213623, + "step": 37320 + }, + { + "epoch": 5.298793470546487, + "grad_norm": 0.39696675539016724, + "learning_rate": 9.470361958836055e-05, + "loss": 0.08025044798851014, + "step": 37330 + }, + { + "epoch": 5.300212916962384, + "grad_norm": 2.89485239982605, + "learning_rate": 9.470220014194465e-05, + "loss": 0.08881823420524597, + "step": 37340 + }, + { + "epoch": 5.301632363378283, + "grad_norm": 8.957134246826172, + "learning_rate": 9.470078069552874e-05, + "loss": 0.15155066251754762, + "step": 37350 + }, + { + "epoch": 5.30305180979418, + "grad_norm": 1.5332341194152832, + "learning_rate": 9.469936124911285e-05, + "loss": 0.0627815842628479, + "step": 37360 + }, + { + "epoch": 5.304471256210078, + "grad_norm": 8.660079956054688, + "learning_rate": 9.469794180269695e-05, + "loss": 0.08449615240097046, + "step": 37370 + }, + { + "epoch": 5.305890702625976, + 
"grad_norm": 7.650805473327637, + "learning_rate": 9.469652235628106e-05, + "loss": 0.058890581130981445, + "step": 37380 + }, + { + "epoch": 5.307310149041873, + "grad_norm": 0.46144038438796997, + "learning_rate": 9.469510290986515e-05, + "loss": 0.035400664806365965, + "step": 37390 + }, + { + "epoch": 5.308729595457772, + "grad_norm": 6.973151206970215, + "learning_rate": 9.469368346344926e-05, + "loss": 0.05486550331115723, + "step": 37400 + }, + { + "epoch": 5.310149041873669, + "grad_norm": 5.088735580444336, + "learning_rate": 9.469226401703335e-05, + "loss": 0.0546966552734375, + "step": 37410 + }, + { + "epoch": 5.311568488289567, + "grad_norm": 2.2214155197143555, + "learning_rate": 9.469084457061747e-05, + "loss": 0.08320272564888001, + "step": 37420 + }, + { + "epoch": 5.312987934705465, + "grad_norm": 8.335865020751953, + "learning_rate": 9.468942512420158e-05, + "loss": 0.09053044319152832, + "step": 37430 + }, + { + "epoch": 5.314407381121363, + "grad_norm": 2.290205717086792, + "learning_rate": 9.468800567778566e-05, + "loss": 0.016748277842998503, + "step": 37440 + }, + { + "epoch": 5.31582682753726, + "grad_norm": 6.0070109367370605, + "learning_rate": 9.468658623136977e-05, + "loss": 0.037306949496269226, + "step": 37450 + }, + { + "epoch": 5.317246273953158, + "grad_norm": 0.7360553741455078, + "learning_rate": 9.468516678495387e-05, + "loss": 0.04275312125682831, + "step": 37460 + }, + { + "epoch": 5.318665720369056, + "grad_norm": 1.0693514347076416, + "learning_rate": 9.468374733853798e-05, + "loss": 0.053118348121643066, + "step": 37470 + }, + { + "epoch": 5.320085166784954, + "grad_norm": 0.3035983443260193, + "learning_rate": 9.468232789212208e-05, + "loss": 0.06972123384475708, + "step": 37480 + }, + { + "epoch": 5.321504613200852, + "grad_norm": 5.642818450927734, + "learning_rate": 9.468090844570619e-05, + "loss": 0.05097317695617676, + "step": 37490 + }, + { + "epoch": 5.322924059616749, + "grad_norm": 2.78389835357666, + 
"learning_rate": 9.467948899929027e-05, + "loss": 0.02805333733558655, + "step": 37500 + }, + { + "epoch": 5.322924059616749, + "eval_accuracy": 0.9726584854072614, + "eval_loss": 0.08883775025606155, + "eval_runtime": 33.5333, + "eval_samples_per_second": 468.997, + "eval_steps_per_second": 14.672, + "step": 37500 + }, + { + "epoch": 5.324343506032648, + "grad_norm": 5.956612586975098, + "learning_rate": 9.467806955287438e-05, + "loss": 0.03515351712703705, + "step": 37510 + }, + { + "epoch": 5.325762952448545, + "grad_norm": 11.028221130371094, + "learning_rate": 9.46766501064585e-05, + "loss": 0.07912625670433045, + "step": 37520 + }, + { + "epoch": 5.3271823988644424, + "grad_norm": 10.001873016357422, + "learning_rate": 9.467523066004259e-05, + "loss": 0.06761111617088318, + "step": 37530 + }, + { + "epoch": 5.328601845280341, + "grad_norm": 1.0504896640777588, + "learning_rate": 9.46738112136267e-05, + "loss": 0.06048610210418701, + "step": 37540 + }, + { + "epoch": 5.330021291696238, + "grad_norm": 3.470850944519043, + "learning_rate": 9.467239176721079e-05, + "loss": 0.05224236249923706, + "step": 37550 + }, + { + "epoch": 5.3314407381121365, + "grad_norm": 0.3818908929824829, + "learning_rate": 9.46709723207949e-05, + "loss": 0.03469651639461517, + "step": 37560 + }, + { + "epoch": 5.332860184528034, + "grad_norm": 7.192812919616699, + "learning_rate": 9.4669552874379e-05, + "loss": 0.10160307884216309, + "step": 37570 + }, + { + "epoch": 5.334279630943932, + "grad_norm": 6.561454772949219, + "learning_rate": 9.46681334279631e-05, + "loss": 0.0493013322353363, + "step": 37580 + }, + { + "epoch": 5.33569907735983, + "grad_norm": 4.2305755615234375, + "learning_rate": 9.46667139815472e-05, + "loss": 0.053803282976150515, + "step": 37590 + }, + { + "epoch": 5.337118523775727, + "grad_norm": 3.255946636199951, + "learning_rate": 9.46652945351313e-05, + "loss": 0.05600963830947876, + "step": 37600 + }, + { + "epoch": 5.338537970191625, + "grad_norm": 
7.0095109939575195, + "learning_rate": 9.466387508871541e-05, + "loss": 0.032292142510414124, + "step": 37610 + }, + { + "epoch": 5.339957416607523, + "grad_norm": 3.136387825012207, + "learning_rate": 9.466245564229951e-05, + "loss": 0.0346368670463562, + "step": 37620 + }, + { + "epoch": 5.341376863023421, + "grad_norm": 6.414516448974609, + "learning_rate": 9.466103619588362e-05, + "loss": 0.04215942919254303, + "step": 37630 + }, + { + "epoch": 5.3427963094393185, + "grad_norm": 6.037017345428467, + "learning_rate": 9.465961674946772e-05, + "loss": 0.06676658987998962, + "step": 37640 + }, + { + "epoch": 5.344215755855217, + "grad_norm": 0.9441502690315247, + "learning_rate": 9.465819730305181e-05, + "loss": 0.03622086644172669, + "step": 37650 + }, + { + "epoch": 5.345635202271114, + "grad_norm": 4.125903606414795, + "learning_rate": 9.465677785663591e-05, + "loss": 0.06060633063316345, + "step": 37660 + }, + { + "epoch": 5.347054648687012, + "grad_norm": 4.286660671234131, + "learning_rate": 9.465535841022002e-05, + "loss": 0.044310915470123294, + "step": 37670 + }, + { + "epoch": 5.34847409510291, + "grad_norm": 0.5203285813331604, + "learning_rate": 9.465393896380412e-05, + "loss": 0.07237310409545898, + "step": 37680 + }, + { + "epoch": 5.349893541518807, + "grad_norm": 11.040092468261719, + "learning_rate": 9.465251951738823e-05, + "loss": 0.07519057989120484, + "step": 37690 + }, + { + "epoch": 5.351312987934706, + "grad_norm": 3.501375913619995, + "learning_rate": 9.465110007097231e-05, + "loss": 0.057271170616149905, + "step": 37700 + }, + { + "epoch": 5.352732434350603, + "grad_norm": 2.5016627311706543, + "learning_rate": 9.464968062455642e-05, + "loss": 0.042032480239868164, + "step": 37710 + }, + { + "epoch": 5.354151880766501, + "grad_norm": 5.547698497772217, + "learning_rate": 9.464826117814054e-05, + "loss": 0.06820942163467407, + "step": 37720 + }, + { + "epoch": 5.355571327182399, + "grad_norm": 3.99342679977417, + "learning_rate": 
9.464684173172463e-05, + "loss": 0.06758497953414917, + "step": 37730 + }, + { + "epoch": 5.356990773598296, + "grad_norm": 1.67915678024292, + "learning_rate": 9.464542228530874e-05, + "loss": 0.05731663703918457, + "step": 37740 + }, + { + "epoch": 5.3584102200141945, + "grad_norm": 6.3496174812316895, + "learning_rate": 9.464400283889283e-05, + "loss": 0.06755791902542115, + "step": 37750 + }, + { + "epoch": 5.359829666430092, + "grad_norm": 7.569107532501221, + "learning_rate": 9.464272533711853e-05, + "loss": 0.13933613300323486, + "step": 37760 + }, + { + "epoch": 5.36124911284599, + "grad_norm": 7.283881664276123, + "learning_rate": 9.464130589070262e-05, + "loss": 0.07494657635688781, + "step": 37770 + }, + { + "epoch": 5.362668559261888, + "grad_norm": 6.227843284606934, + "learning_rate": 9.463988644428673e-05, + "loss": 0.04237803816795349, + "step": 37780 + }, + { + "epoch": 5.364088005677786, + "grad_norm": 7.2157392501831055, + "learning_rate": 9.463846699787083e-05, + "loss": 0.06301524639129638, + "step": 37790 + }, + { + "epoch": 5.365507452093683, + "grad_norm": 5.091018199920654, + "learning_rate": 9.463704755145494e-05, + "loss": 0.07857232093811035, + "step": 37800 + }, + { + "epoch": 5.366926898509581, + "grad_norm": 0.757509171962738, + "learning_rate": 9.463562810503904e-05, + "loss": 0.0769877016544342, + "step": 37810 + }, + { + "epoch": 5.368346344925479, + "grad_norm": 3.653813362121582, + "learning_rate": 9.463420865862315e-05, + "loss": 0.08140221238136292, + "step": 37820 + }, + { + "epoch": 5.3697657913413765, + "grad_norm": 6.798269271850586, + "learning_rate": 9.463278921220724e-05, + "loss": 0.04424488544464111, + "step": 37830 + }, + { + "epoch": 5.371185237757275, + "grad_norm": 9.487317085266113, + "learning_rate": 9.463136976579135e-05, + "loss": 0.05726785659790039, + "step": 37840 + }, + { + "epoch": 5.372604684173172, + "grad_norm": 0.3824310898780823, + "learning_rate": 9.462995031937544e-05, + "loss": 0.03568733036518097, 
+ "step": 37850 + }, + { + "epoch": 5.374024130589071, + "grad_norm": 0.5051906108856201, + "learning_rate": 9.462853087295955e-05, + "loss": 0.05948272943496704, + "step": 37860 + }, + { + "epoch": 5.375443577004968, + "grad_norm": 1.7530025243759155, + "learning_rate": 9.462711142654365e-05, + "loss": 0.03755594789981842, + "step": 37870 + }, + { + "epoch": 5.376863023420865, + "grad_norm": 1.4626020193099976, + "learning_rate": 9.462569198012775e-05, + "loss": 0.04836176633834839, + "step": 37880 + }, + { + "epoch": 5.378282469836764, + "grad_norm": 0.16351209580898285, + "learning_rate": 9.462427253371186e-05, + "loss": 0.023823167383670806, + "step": 37890 + }, + { + "epoch": 5.379701916252661, + "grad_norm": 5.490500450134277, + "learning_rate": 9.462285308729596e-05, + "loss": 0.04158731102943421, + "step": 37900 + }, + { + "epoch": 5.3811213626685594, + "grad_norm": 1.351608157157898, + "learning_rate": 9.462143364088007e-05, + "loss": 0.0956190824508667, + "step": 37910 + }, + { + "epoch": 5.382540809084457, + "grad_norm": 0.13233773410320282, + "learning_rate": 9.462001419446417e-05, + "loss": 0.07234857082366944, + "step": 37920 + }, + { + "epoch": 5.383960255500355, + "grad_norm": 3.7168662548065186, + "learning_rate": 9.461859474804826e-05, + "loss": 0.040309539437294005, + "step": 37930 + }, + { + "epoch": 5.385379701916253, + "grad_norm": 8.766815185546875, + "learning_rate": 9.461717530163236e-05, + "loss": 0.08582027554512024, + "step": 37940 + }, + { + "epoch": 5.38679914833215, + "grad_norm": 1.3776038885116577, + "learning_rate": 9.461575585521647e-05, + "loss": 0.11603788137435914, + "step": 37950 + }, + { + "epoch": 5.388218594748048, + "grad_norm": 2.4015753269195557, + "learning_rate": 9.461433640880057e-05, + "loss": 0.060989999771118165, + "step": 37960 + }, + { + "epoch": 5.389638041163946, + "grad_norm": 0.7234444618225098, + "learning_rate": 9.461291696238468e-05, + "loss": 0.0282410591840744, + "step": 37970 + }, + { + "epoch": 
5.391057487579844, + "grad_norm": 1.9186277389526367, + "learning_rate": 9.461149751596878e-05, + "loss": 0.05424323081970215, + "step": 37980 + }, + { + "epoch": 5.3924769339957415, + "grad_norm": 2.9660613536834717, + "learning_rate": 9.461007806955287e-05, + "loss": 0.04484150111675263, + "step": 37990 + }, + { + "epoch": 5.39389638041164, + "grad_norm": 4.6889967918396, + "learning_rate": 9.460865862313699e-05, + "loss": 0.05380064845085144, + "step": 38000 + }, + { + "epoch": 5.39389638041164, + "eval_accuracy": 0.970814522795193, + "eval_loss": 0.09178540110588074, + "eval_runtime": 33.7285, + "eval_samples_per_second": 466.282, + "eval_steps_per_second": 14.587, + "step": 38000 + }, + { + "epoch": 5.395315826827537, + "grad_norm": 9.106490135192871, + "learning_rate": 9.460723917672108e-05, + "loss": 0.05757981538772583, + "step": 38010 + }, + { + "epoch": 5.396735273243435, + "grad_norm": 3.3271431922912598, + "learning_rate": 9.46058197303052e-05, + "loss": 0.049441322684288025, + "step": 38020 + }, + { + "epoch": 5.398154719659333, + "grad_norm": 1.027500033378601, + "learning_rate": 9.460440028388928e-05, + "loss": 0.07134292721748352, + "step": 38030 + }, + { + "epoch": 5.39957416607523, + "grad_norm": 8.88465404510498, + "learning_rate": 9.460298083747339e-05, + "loss": 0.02503353953361511, + "step": 38040 + }, + { + "epoch": 5.400993612491129, + "grad_norm": 6.1659626960754395, + "learning_rate": 9.460156139105749e-05, + "loss": 0.07239366173744202, + "step": 38050 + }, + { + "epoch": 5.402413058907026, + "grad_norm": 8.75742244720459, + "learning_rate": 9.46001419446416e-05, + "loss": 0.052278178930282596, + "step": 38060 + }, + { + "epoch": 5.403832505322924, + "grad_norm": 0.5453545451164246, + "learning_rate": 9.45987224982257e-05, + "loss": 0.03611307740211487, + "step": 38070 + }, + { + "epoch": 5.405251951738822, + "grad_norm": 0.5815269351005554, + "learning_rate": 9.459730305180979e-05, + "loss": 0.02320513129234314, + "step": 38080 + }, + { 
+ "epoch": 5.406671398154719, + "grad_norm": 7.123127460479736, + "learning_rate": 9.45958836053939e-05, + "loss": 0.08660604953765869, + "step": 38090 + }, + { + "epoch": 5.4080908445706175, + "grad_norm": 3.231856346130371, + "learning_rate": 9.4594464158978e-05, + "loss": 0.09372188448905945, + "step": 38100 + }, + { + "epoch": 5.409510290986515, + "grad_norm": 0.1635567545890808, + "learning_rate": 9.459304471256211e-05, + "loss": 0.039569467306137085, + "step": 38110 + }, + { + "epoch": 5.410929737402413, + "grad_norm": 5.7268829345703125, + "learning_rate": 9.459162526614621e-05, + "loss": 0.048163232207298276, + "step": 38120 + }, + { + "epoch": 5.412349183818311, + "grad_norm": 4.400755405426025, + "learning_rate": 9.45902058197303e-05, + "loss": 0.055844247341156006, + "step": 38130 + }, + { + "epoch": 5.413768630234209, + "grad_norm": 0.08212675154209137, + "learning_rate": 9.45887863733144e-05, + "loss": 0.015825629234313965, + "step": 38140 + }, + { + "epoch": 5.415188076650106, + "grad_norm": 2.162956953048706, + "learning_rate": 9.458736692689851e-05, + "loss": 0.03781618475914002, + "step": 38150 + }, + { + "epoch": 5.416607523066004, + "grad_norm": 4.835059642791748, + "learning_rate": 9.458594748048261e-05, + "loss": 0.02739281952381134, + "step": 38160 + }, + { + "epoch": 5.418026969481902, + "grad_norm": 0.8784237504005432, + "learning_rate": 9.458452803406672e-05, + "loss": 0.05759773254394531, + "step": 38170 + }, + { + "epoch": 5.4194464158977995, + "grad_norm": 4.168929576873779, + "learning_rate": 9.458310858765082e-05, + "loss": 0.040823107957839964, + "step": 38180 + }, + { + "epoch": 5.420865862313698, + "grad_norm": 2.2118477821350098, + "learning_rate": 9.458168914123492e-05, + "loss": 0.051774638891220096, + "step": 38190 + }, + { + "epoch": 5.422285308729595, + "grad_norm": 1.5984776020050049, + "learning_rate": 9.458026969481903e-05, + "loss": 0.03739486932754517, + "step": 38200 + }, + { + "epoch": 5.4237047551454936, + "grad_norm": 
1.8358681201934814, + "learning_rate": 9.457885024840313e-05, + "loss": 0.05115988254547119, + "step": 38210 + }, + { + "epoch": 5.425124201561391, + "grad_norm": 2.7211074829101562, + "learning_rate": 9.457743080198724e-05, + "loss": 0.02570842206478119, + "step": 38220 + }, + { + "epoch": 5.426543647977288, + "grad_norm": 0.14551569521427155, + "learning_rate": 9.457601135557133e-05, + "loss": 0.07109014987945557, + "step": 38230 + }, + { + "epoch": 5.427963094393187, + "grad_norm": 2.1508007049560547, + "learning_rate": 9.457459190915543e-05, + "loss": 0.06561747789382935, + "step": 38240 + }, + { + "epoch": 5.429382540809084, + "grad_norm": 1.7346519231796265, + "learning_rate": 9.457317246273953e-05, + "loss": 0.03357102572917938, + "step": 38250 + }, + { + "epoch": 5.430801987224982, + "grad_norm": 2.5490376949310303, + "learning_rate": 9.457175301632364e-05, + "loss": 0.04644973874092102, + "step": 38260 + }, + { + "epoch": 5.43222143364088, + "grad_norm": 6.230116844177246, + "learning_rate": 9.457033356990774e-05, + "loss": 0.08044976592063904, + "step": 38270 + }, + { + "epoch": 5.433640880056778, + "grad_norm": 8.601234436035156, + "learning_rate": 9.456891412349185e-05, + "loss": 0.07310090065002442, + "step": 38280 + }, + { + "epoch": 5.435060326472676, + "grad_norm": 10.968649864196777, + "learning_rate": 9.456749467707594e-05, + "loss": 0.06937228441238404, + "step": 38290 + }, + { + "epoch": 5.436479772888574, + "grad_norm": 0.40604132413864136, + "learning_rate": 9.456607523066004e-05, + "loss": 0.0455585777759552, + "step": 38300 + }, + { + "epoch": 5.437899219304471, + "grad_norm": 6.999582767486572, + "learning_rate": 9.456465578424415e-05, + "loss": 0.057856935262680056, + "step": 38310 + }, + { + "epoch": 5.439318665720369, + "grad_norm": 9.098925590515137, + "learning_rate": 9.456323633782825e-05, + "loss": 0.10670442581176758, + "step": 38320 + }, + { + "epoch": 5.440738112136267, + "grad_norm": 5.516963481903076, + "learning_rate": 
9.456181689141236e-05, + "loss": 0.07467976808547974, + "step": 38330 + }, + { + "epoch": 5.442157558552164, + "grad_norm": 0.20835815370082855, + "learning_rate": 9.456039744499645e-05, + "loss": 0.08520526289939881, + "step": 38340 + }, + { + "epoch": 5.443577004968063, + "grad_norm": 6.751248359680176, + "learning_rate": 9.455897799858056e-05, + "loss": 0.060392063856124875, + "step": 38350 + }, + { + "epoch": 5.44499645138396, + "grad_norm": 3.8347504138946533, + "learning_rate": 9.455755855216465e-05, + "loss": 0.059272587299346924, + "step": 38360 + }, + { + "epoch": 5.4464158977998585, + "grad_norm": 5.555981636047363, + "learning_rate": 9.455613910574876e-05, + "loss": 0.05174638628959656, + "step": 38370 + }, + { + "epoch": 5.447835344215756, + "grad_norm": 15.028047561645508, + "learning_rate": 9.455471965933288e-05, + "loss": 0.06913697719573975, + "step": 38380 + }, + { + "epoch": 5.449254790631653, + "grad_norm": 0.3131970167160034, + "learning_rate": 9.455330021291696e-05, + "loss": 0.06802209615707397, + "step": 38390 + }, + { + "epoch": 5.450674237047552, + "grad_norm": 5.729186534881592, + "learning_rate": 9.455188076650107e-05, + "loss": 0.04004753828048706, + "step": 38400 + }, + { + "epoch": 5.452093683463449, + "grad_norm": 0.8130164742469788, + "learning_rate": 9.455046132008517e-05, + "loss": 0.038001006841659545, + "step": 38410 + }, + { + "epoch": 5.453513129879347, + "grad_norm": 4.351025581359863, + "learning_rate": 9.454904187366928e-05, + "loss": 0.03257591426372528, + "step": 38420 + }, + { + "epoch": 5.454932576295245, + "grad_norm": 0.4877346158027649, + "learning_rate": 9.454762242725338e-05, + "loss": 0.08598875999450684, + "step": 38430 + }, + { + "epoch": 5.456352022711143, + "grad_norm": 2.889662981033325, + "learning_rate": 9.454620298083747e-05, + "loss": 0.05721787214279175, + "step": 38440 + }, + { + "epoch": 5.4577714691270405, + "grad_norm": 1.7848842144012451, + "learning_rate": 9.454478353442157e-05, + "loss": 
0.07306339144706726, + "step": 38450 + }, + { + "epoch": 5.459190915542938, + "grad_norm": 5.2257914543151855, + "learning_rate": 9.454336408800568e-05, + "loss": 0.020994843542575838, + "step": 38460 + }, + { + "epoch": 5.460610361958836, + "grad_norm": 0.30310195684432983, + "learning_rate": 9.454194464158979e-05, + "loss": 0.044001060724258426, + "step": 38470 + }, + { + "epoch": 5.462029808374734, + "grad_norm": 10.677133560180664, + "learning_rate": 9.454052519517389e-05, + "loss": 0.10154651403427124, + "step": 38480 + }, + { + "epoch": 5.463449254790632, + "grad_norm": 2.1424472332000732, + "learning_rate": 9.453910574875799e-05, + "loss": 0.014285300672054291, + "step": 38490 + }, + { + "epoch": 5.464868701206529, + "grad_norm": 4.392820835113525, + "learning_rate": 9.453768630234208e-05, + "loss": 0.07753425240516662, + "step": 38500 + }, + { + "epoch": 5.464868701206529, + "eval_accuracy": 0.9799707509378776, + "eval_loss": 0.05647183209657669, + "eval_runtime": 35.3162, + "eval_samples_per_second": 445.32, + "eval_steps_per_second": 13.931, + "step": 38500 + }, + { + "epoch": 5.466288147622428, + "grad_norm": 2.9922590255737305, + "learning_rate": 9.45362668559262e-05, + "loss": 0.07568216323852539, + "step": 38510 + }, + { + "epoch": 5.467707594038325, + "grad_norm": 0.7704054713249207, + "learning_rate": 9.453484740951029e-05, + "loss": 0.04570820927619934, + "step": 38520 + }, + { + "epoch": 5.4691270404542225, + "grad_norm": 0.6614516377449036, + "learning_rate": 9.45334279630944e-05, + "loss": 0.048297053575515746, + "step": 38530 + }, + { + "epoch": 5.470546486870121, + "grad_norm": 0.7620261907577515, + "learning_rate": 9.45320085166785e-05, + "loss": 0.04648005664348602, + "step": 38540 + }, + { + "epoch": 5.471965933286018, + "grad_norm": 0.19767679274082184, + "learning_rate": 9.45305890702626e-05, + "loss": 0.03164273500442505, + "step": 38550 + }, + { + "epoch": 5.4733853797019165, + "grad_norm": 16.225950241088867, + "learning_rate": 
9.452916962384671e-05, + "loss": 0.06350845098495483, + "step": 38560 + }, + { + "epoch": 5.474804826117814, + "grad_norm": 0.54612135887146, + "learning_rate": 9.45277501774308e-05, + "loss": 0.05994898080825806, + "step": 38570 + }, + { + "epoch": 5.476224272533712, + "grad_norm": 10.0214204788208, + "learning_rate": 9.452633073101492e-05, + "loss": 0.07067713737487794, + "step": 38580 + }, + { + "epoch": 5.47764371894961, + "grad_norm": 1.5006757974624634, + "learning_rate": 9.452491128459902e-05, + "loss": 0.04153343141078949, + "step": 38590 + }, + { + "epoch": 5.479063165365507, + "grad_norm": 9.297212600708008, + "learning_rate": 9.452349183818311e-05, + "loss": 0.049632930755615236, + "step": 38600 + }, + { + "epoch": 5.480482611781405, + "grad_norm": 0.7019015550613403, + "learning_rate": 9.452207239176721e-05, + "loss": 0.05248759984970093, + "step": 38610 + }, + { + "epoch": 5.481902058197303, + "grad_norm": 0.5330097675323486, + "learning_rate": 9.452065294535132e-05, + "loss": 0.06436276435852051, + "step": 38620 + }, + { + "epoch": 5.483321504613201, + "grad_norm": 10.498361587524414, + "learning_rate": 9.451923349893542e-05, + "loss": 0.06726992130279541, + "step": 38630 + }, + { + "epoch": 5.4847409510290985, + "grad_norm": 1.389711618423462, + "learning_rate": 9.451781405251953e-05, + "loss": 0.04922493100166321, + "step": 38640 + }, + { + "epoch": 5.486160397444997, + "grad_norm": 6.544168472290039, + "learning_rate": 9.451639460610363e-05, + "loss": 0.043633729219436646, + "step": 38650 + }, + { + "epoch": 5.487579843860894, + "grad_norm": 2.614717960357666, + "learning_rate": 9.451497515968772e-05, + "loss": 0.05251113176345825, + "step": 38660 + }, + { + "epoch": 5.488999290276792, + "grad_norm": 3.5543386936187744, + "learning_rate": 9.451355571327183e-05, + "loss": 0.12290339469909668, + "step": 38670 + }, + { + "epoch": 5.49041873669269, + "grad_norm": 3.551682472229004, + "learning_rate": 9.451213626685593e-05, + "loss": 
0.020476463437080383, + "step": 38680 + }, + { + "epoch": 5.491838183108587, + "grad_norm": 2.683623790740967, + "learning_rate": 9.451071682044004e-05, + "loss": 0.09093580842018127, + "step": 38690 + }, + { + "epoch": 5.493257629524486, + "grad_norm": 0.2011883705854416, + "learning_rate": 9.450929737402413e-05, + "loss": 0.05041297674179077, + "step": 38700 + }, + { + "epoch": 5.494677075940383, + "grad_norm": 6.980527877807617, + "learning_rate": 9.450787792760824e-05, + "loss": 0.06176745891571045, + "step": 38710 + }, + { + "epoch": 5.496096522356281, + "grad_norm": 1.4676660299301147, + "learning_rate": 9.450645848119234e-05, + "loss": 0.07806309461593627, + "step": 38720 + }, + { + "epoch": 5.497515968772179, + "grad_norm": 0.9213213324546814, + "learning_rate": 9.450503903477645e-05, + "loss": 0.08935214877128601, + "step": 38730 + }, + { + "epoch": 5.498935415188076, + "grad_norm": 0.4523489773273468, + "learning_rate": 9.450361958836054e-05, + "loss": 0.03591077327728272, + "step": 38740 + }, + { + "epoch": 5.500354861603975, + "grad_norm": 10.078042984008789, + "learning_rate": 9.450220014194464e-05, + "loss": 0.06557263135910034, + "step": 38750 + }, + { + "epoch": 5.501774308019872, + "grad_norm": 1.226947546005249, + "learning_rate": 9.450078069552875e-05, + "loss": 0.04932633936405182, + "step": 38760 + }, + { + "epoch": 5.50319375443577, + "grad_norm": 11.178776741027832, + "learning_rate": 9.449936124911285e-05, + "loss": 0.05480325222015381, + "step": 38770 + }, + { + "epoch": 5.504613200851668, + "grad_norm": 0.29021137952804565, + "learning_rate": 9.449794180269696e-05, + "loss": 0.02817882001399994, + "step": 38780 + }, + { + "epoch": 5.506032647267566, + "grad_norm": 0.24028928577899933, + "learning_rate": 9.449652235628106e-05, + "loss": 0.053332853317260745, + "step": 38790 + }, + { + "epoch": 5.5074520936834634, + "grad_norm": 3.2386868000030518, + "learning_rate": 9.449510290986515e-05, + "loss": 0.054778027534484866, + "step": 38800 + }, 
+ { + "epoch": 5.508871540099361, + "grad_norm": 8.147454261779785, + "learning_rate": 9.449368346344925e-05, + "loss": 0.07028309106826783, + "step": 38810 + }, + { + "epoch": 5.510290986515259, + "grad_norm": 0.2091905176639557, + "learning_rate": 9.449226401703336e-05, + "loss": 0.0604537308216095, + "step": 38820 + }, + { + "epoch": 5.511710432931157, + "grad_norm": 2.969684600830078, + "learning_rate": 9.449084457061746e-05, + "loss": 0.04221682250499725, + "step": 38830 + }, + { + "epoch": 5.513129879347055, + "grad_norm": 0.9687553644180298, + "learning_rate": 9.448942512420157e-05, + "loss": 0.024466480314731597, + "step": 38840 + }, + { + "epoch": 5.514549325762952, + "grad_norm": 2.4636096954345703, + "learning_rate": 9.448800567778567e-05, + "loss": 0.10217208862304687, + "step": 38850 + }, + { + "epoch": 5.515968772178851, + "grad_norm": 0.06745173037052155, + "learning_rate": 9.448658623136977e-05, + "loss": 0.05621076226234436, + "step": 38860 + }, + { + "epoch": 5.517388218594748, + "grad_norm": 5.0668535232543945, + "learning_rate": 9.448516678495388e-05, + "loss": 0.07827832102775574, + "step": 38870 + }, + { + "epoch": 5.518807665010646, + "grad_norm": 0.08209887892007828, + "learning_rate": 9.448374733853797e-05, + "loss": 0.055400484800338747, + "step": 38880 + }, + { + "epoch": 5.520227111426544, + "grad_norm": 13.148024559020996, + "learning_rate": 9.448232789212209e-05, + "loss": 0.05364589095115661, + "step": 38890 + }, + { + "epoch": 5.521646557842441, + "grad_norm": 2.172600507736206, + "learning_rate": 9.448090844570618e-05, + "loss": 0.0865916907787323, + "step": 38900 + }, + { + "epoch": 5.5230660042583395, + "grad_norm": 3.1451680660247803, + "learning_rate": 9.447948899929028e-05, + "loss": 0.05537939667701721, + "step": 38910 + }, + { + "epoch": 5.524485450674237, + "grad_norm": 6.0753960609436035, + "learning_rate": 9.447806955287438e-05, + "loss": 0.06018974781036377, + "step": 38920 + }, + { + "epoch": 5.525904897090135, + 
"grad_norm": 10.356642723083496, + "learning_rate": 9.447665010645849e-05, + "loss": 0.07976688146591186, + "step": 38930 + }, + { + "epoch": 5.527324343506033, + "grad_norm": 1.3878400325775146, + "learning_rate": 9.447523066004259e-05, + "loss": 0.03053358793258667, + "step": 38940 + }, + { + "epoch": 5.528743789921931, + "grad_norm": 1.5194265842437744, + "learning_rate": 9.44738112136267e-05, + "loss": 0.054735350608825686, + "step": 38950 + }, + { + "epoch": 5.530163236337828, + "grad_norm": 5.2882304191589355, + "learning_rate": 9.44723917672108e-05, + "loss": 0.04336060583591461, + "step": 38960 + }, + { + "epoch": 5.531582682753726, + "grad_norm": 1.1984394788742065, + "learning_rate": 9.447097232079489e-05, + "loss": 0.06638429760932922, + "step": 38970 + }, + { + "epoch": 5.533002129169624, + "grad_norm": 5.800374984741211, + "learning_rate": 9.4469552874379e-05, + "loss": 0.0930544376373291, + "step": 38980 + }, + { + "epoch": 5.5344215755855215, + "grad_norm": 2.947801113128662, + "learning_rate": 9.44681334279631e-05, + "loss": 0.047572529315948485, + "step": 38990 + }, + { + "epoch": 5.53584102200142, + "grad_norm": 0.7058837413787842, + "learning_rate": 9.446671398154721e-05, + "loss": 0.0859229564666748, + "step": 39000 + }, + { + "epoch": 5.53584102200142, + "eval_accuracy": 0.9780632034081516, + "eval_loss": 0.06798505038022995, + "eval_runtime": 34.2002, + "eval_samples_per_second": 459.852, + "eval_steps_per_second": 14.386, + "step": 39000 + }, + { + "epoch": 5.537260468417317, + "grad_norm": 0.11149133741855621, + "learning_rate": 9.44652945351313e-05, + "loss": 0.027533328533172606, + "step": 39010 + }, + { + "epoch": 5.5386799148332155, + "grad_norm": 4.605347156524658, + "learning_rate": 9.44638750887154e-05, + "loss": 0.029569646716117857, + "step": 39020 + }, + { + "epoch": 5.540099361249113, + "grad_norm": 1.489418864250183, + "learning_rate": 9.44624556422995e-05, + "loss": 0.05133401155471802, + "step": 39030 + }, + { + "epoch": 
5.54151880766501, + "grad_norm": 2.5126025676727295, + "learning_rate": 9.446103619588361e-05, + "loss": 0.03453618586063385, + "step": 39040 + }, + { + "epoch": 5.542938254080909, + "grad_norm": 2.8823859691619873, + "learning_rate": 9.445961674946771e-05, + "loss": 0.03072114586830139, + "step": 39050 + }, + { + "epoch": 5.544357700496806, + "grad_norm": 2.666760206222534, + "learning_rate": 9.445819730305181e-05, + "loss": 0.11776949167251587, + "step": 39060 + }, + { + "epoch": 5.545777146912704, + "grad_norm": 0.38773733377456665, + "learning_rate": 9.445677785663592e-05, + "loss": 0.020077161490917206, + "step": 39070 + }, + { + "epoch": 5.547196593328602, + "grad_norm": 4.955237865447998, + "learning_rate": 9.445535841022002e-05, + "loss": 0.05261182188987732, + "step": 39080 + }, + { + "epoch": 5.5486160397445, + "grad_norm": 2.406719207763672, + "learning_rate": 9.445393896380413e-05, + "loss": 0.04864847362041473, + "step": 39090 + }, + { + "epoch": 5.5500354861603975, + "grad_norm": 7.484744548797607, + "learning_rate": 9.445251951738823e-05, + "loss": 0.09151811003684998, + "step": 39100 + }, + { + "epoch": 5.551454932576295, + "grad_norm": 5.400770664215088, + "learning_rate": 9.445110007097232e-05, + "loss": 0.06537050008773804, + "step": 39110 + }, + { + "epoch": 5.552874378992193, + "grad_norm": 4.668606281280518, + "learning_rate": 9.444968062455642e-05, + "loss": 0.07697397470474243, + "step": 39120 + }, + { + "epoch": 5.554293825408091, + "grad_norm": 4.4636006355285645, + "learning_rate": 9.444826117814053e-05, + "loss": 0.05506072640419006, + "step": 39130 + }, + { + "epoch": 5.555713271823989, + "grad_norm": 13.243671417236328, + "learning_rate": 9.444684173172463e-05, + "loss": 0.061540770530700686, + "step": 39140 + }, + { + "epoch": 5.557132718239886, + "grad_norm": 3.05822491645813, + "learning_rate": 9.444542228530874e-05, + "loss": 0.09180397987365722, + "step": 39150 + }, + { + "epoch": 5.558552164655785, + "grad_norm": 
2.3664355278015137, + "learning_rate": 9.444400283889284e-05, + "loss": 0.026255601644515993, + "step": 39160 + }, + { + "epoch": 5.559971611071682, + "grad_norm": 2.924701452255249, + "learning_rate": 9.444258339247693e-05, + "loss": 0.0476565808057785, + "step": 39170 + }, + { + "epoch": 5.56139105748758, + "grad_norm": 4.52599573135376, + "learning_rate": 9.444116394606104e-05, + "loss": 0.05514841079711914, + "step": 39180 + }, + { + "epoch": 5.562810503903478, + "grad_norm": 9.638506889343262, + "learning_rate": 9.443974449964514e-05, + "loss": 0.04046670794486999, + "step": 39190 + }, + { + "epoch": 5.564229950319375, + "grad_norm": 4.365987777709961, + "learning_rate": 9.443832505322925e-05, + "loss": 0.07949233055114746, + "step": 39200 + }, + { + "epoch": 5.565649396735274, + "grad_norm": 4.490222454071045, + "learning_rate": 9.443690560681335e-05, + "loss": 0.1337314486503601, + "step": 39210 + }, + { + "epoch": 5.567068843151171, + "grad_norm": 4.05878210067749, + "learning_rate": 9.443548616039745e-05, + "loss": 0.04588429927825928, + "step": 39220 + }, + { + "epoch": 5.568488289567069, + "grad_norm": 0.48254233598709106, + "learning_rate": 9.443406671398155e-05, + "loss": 0.028879329562187195, + "step": 39230 + }, + { + "epoch": 5.569907735982967, + "grad_norm": 2.425044298171997, + "learning_rate": 9.443264726756566e-05, + "loss": 0.07074435949325561, + "step": 39240 + }, + { + "epoch": 5.571327182398864, + "grad_norm": 4.822774410247803, + "learning_rate": 9.443122782114975e-05, + "loss": 0.049528279900550844, + "step": 39250 + }, + { + "epoch": 5.5727466288147625, + "grad_norm": 6.030632495880127, + "learning_rate": 9.442980837473386e-05, + "loss": 0.07618424892425538, + "step": 39260 + }, + { + "epoch": 5.57416607523066, + "grad_norm": 2.610471725463867, + "learning_rate": 9.442838892831796e-05, + "loss": 0.037158846855163574, + "step": 39270 + }, + { + "epoch": 5.575585521646558, + "grad_norm": 1.8794517517089844, + "learning_rate": 
9.442696948190206e-05, + "loss": 0.04584873616695404, + "step": 39280 + }, + { + "epoch": 5.577004968062456, + "grad_norm": 6.433529376983643, + "learning_rate": 9.442555003548617e-05, + "loss": 0.0613658607006073, + "step": 39290 + }, + { + "epoch": 5.578424414478354, + "grad_norm": 3.764742374420166, + "learning_rate": 9.442413058907027e-05, + "loss": 0.04258022904396057, + "step": 39300 + }, + { + "epoch": 5.579843860894251, + "grad_norm": 7.699685573577881, + "learning_rate": 9.442271114265438e-05, + "loss": 0.05763416886329651, + "step": 39310 + }, + { + "epoch": 5.581263307310149, + "grad_norm": 1.2105742692947388, + "learning_rate": 9.442129169623846e-05, + "loss": 0.029989880323410035, + "step": 39320 + }, + { + "epoch": 5.582682753726047, + "grad_norm": 3.355437994003296, + "learning_rate": 9.441987224982257e-05, + "loss": 0.08556990623474121, + "step": 39330 + }, + { + "epoch": 5.5841022001419445, + "grad_norm": 0.5614784955978394, + "learning_rate": 9.441845280340667e-05, + "loss": 0.04003655612468719, + "step": 39340 + }, + { + "epoch": 5.585521646557843, + "grad_norm": 1.7917828559875488, + "learning_rate": 9.441703335699078e-05, + "loss": 0.03968890905380249, + "step": 39350 + }, + { + "epoch": 5.58694109297374, + "grad_norm": 0.16788877546787262, + "learning_rate": 9.441561391057488e-05, + "loss": 0.03502267599105835, + "step": 39360 + }, + { + "epoch": 5.5883605393896385, + "grad_norm": 0.6021848917007446, + "learning_rate": 9.441419446415898e-05, + "loss": 0.06506335139274597, + "step": 39370 + }, + { + "epoch": 5.589779985805536, + "grad_norm": 0.4147074222564697, + "learning_rate": 9.441277501774309e-05, + "loss": 0.04189459681510925, + "step": 39380 + }, + { + "epoch": 5.591199432221433, + "grad_norm": 8.627924919128418, + "learning_rate": 9.441135557132718e-05, + "loss": 0.044555434584617616, + "step": 39390 + }, + { + "epoch": 5.592618878637332, + "grad_norm": 2.4682135581970215, + "learning_rate": 9.44099361249113e-05, + "loss": 
0.13713971376419068, + "step": 39400 + }, + { + "epoch": 5.594038325053229, + "grad_norm": 2.6810388565063477, + "learning_rate": 9.440851667849539e-05, + "loss": 0.020219671726226806, + "step": 39410 + }, + { + "epoch": 5.595457771469127, + "grad_norm": 9.803950309753418, + "learning_rate": 9.440709723207949e-05, + "loss": 0.07011809349060058, + "step": 39420 + }, + { + "epoch": 5.596877217885025, + "grad_norm": 0.29333794116973877, + "learning_rate": 9.440567778566359e-05, + "loss": 0.020935848355293274, + "step": 39430 + }, + { + "epoch": 5.598296664300923, + "grad_norm": 1.5691699981689453, + "learning_rate": 9.44042583392477e-05, + "loss": 0.04376091659069061, + "step": 39440 + }, + { + "epoch": 5.5997161107168205, + "grad_norm": 0.5065397024154663, + "learning_rate": 9.44028388928318e-05, + "loss": 0.04781602025032043, + "step": 39450 + }, + { + "epoch": 5.601135557132718, + "grad_norm": 4.79033899307251, + "learning_rate": 9.440141944641591e-05, + "loss": 0.10913141965866088, + "step": 39460 + }, + { + "epoch": 5.602555003548616, + "grad_norm": 3.315653085708618, + "learning_rate": 9.44e-05, + "loss": 0.0696462333202362, + "step": 39470 + }, + { + "epoch": 5.603974449964514, + "grad_norm": 0.12675778567790985, + "learning_rate": 9.43985805535841e-05, + "loss": 0.017715385556221007, + "step": 39480 + }, + { + "epoch": 5.605393896380412, + "grad_norm": 7.688170433044434, + "learning_rate": 9.439716110716821e-05, + "loss": 0.061501210927963255, + "step": 39490 + }, + { + "epoch": 5.606813342796309, + "grad_norm": 7.34026575088501, + "learning_rate": 9.439574166075231e-05, + "loss": 0.057365798950195314, + "step": 39500 + }, + { + "epoch": 5.606813342796309, + "eval_accuracy": 0.9736758440897819, + "eval_loss": 0.07783501595258713, + "eval_runtime": 34.6522, + "eval_samples_per_second": 453.853, + "eval_steps_per_second": 14.198, + "step": 39500 + }, + { + "epoch": 5.608232789212208, + "grad_norm": 6.776620864868164, + "learning_rate": 9.439432221433642e-05, + 
"loss": 0.08461334109306336, + "step": 39510 + }, + { + "epoch": 5.609652235628105, + "grad_norm": 7.816592693328857, + "learning_rate": 9.43929027679205e-05, + "loss": 0.05237630009651184, + "step": 39520 + }, + { + "epoch": 5.6110716820440025, + "grad_norm": 2.657180070877075, + "learning_rate": 9.439148332150462e-05, + "loss": 0.04638761878013611, + "step": 39530 + }, + { + "epoch": 5.612491128459901, + "grad_norm": 3.7408230304718018, + "learning_rate": 9.439006387508871e-05, + "loss": 0.05367375612258911, + "step": 39540 + }, + { + "epoch": 5.613910574875798, + "grad_norm": 2.930955648422241, + "learning_rate": 9.438864442867282e-05, + "loss": 0.06972458958625793, + "step": 39550 + }, + { + "epoch": 5.615330021291697, + "grad_norm": 16.335681915283203, + "learning_rate": 9.438722498225692e-05, + "loss": 0.11342012882232666, + "step": 39560 + }, + { + "epoch": 5.616749467707594, + "grad_norm": 0.9312990307807922, + "learning_rate": 9.438580553584103e-05, + "loss": 0.09171445965766907, + "step": 39570 + }, + { + "epoch": 5.618168914123492, + "grad_norm": 0.5279941558837891, + "learning_rate": 9.438438608942513e-05, + "loss": 0.0824202299118042, + "step": 39580 + }, + { + "epoch": 5.61958836053939, + "grad_norm": 5.812867164611816, + "learning_rate": 9.438296664300923e-05, + "loss": 0.03917689919471741, + "step": 39590 + }, + { + "epoch": 5.621007806955287, + "grad_norm": 4.642354965209961, + "learning_rate": 9.438154719659334e-05, + "loss": 0.05176680088043213, + "step": 39600 + }, + { + "epoch": 5.622427253371185, + "grad_norm": 4.812618255615234, + "learning_rate": 9.438012775017744e-05, + "loss": 0.05726593136787415, + "step": 39610 + }, + { + "epoch": 5.623846699787083, + "grad_norm": 6.474045276641846, + "learning_rate": 9.437870830376155e-05, + "loss": 0.06338353157043457, + "step": 39620 + }, + { + "epoch": 5.625266146202981, + "grad_norm": 3.1899099349975586, + "learning_rate": 9.437728885734563e-05, + "loss": 0.03615102469921112, + "step": 39630 + }, + 
{ + "epoch": 5.626685592618879, + "grad_norm": 1.5344411134719849, + "learning_rate": 9.437586941092974e-05, + "loss": 0.052175390720367434, + "step": 39640 + }, + { + "epoch": 5.628105039034777, + "grad_norm": 4.2242255210876465, + "learning_rate": 9.437444996451384e-05, + "loss": 0.02579593062400818, + "step": 39650 + }, + { + "epoch": 5.629524485450674, + "grad_norm": 2.9231820106506348, + "learning_rate": 9.437303051809795e-05, + "loss": 0.058068424463272095, + "step": 39660 + }, + { + "epoch": 5.630943931866572, + "grad_norm": 2.0201687812805176, + "learning_rate": 9.437161107168206e-05, + "loss": 0.038857880234718326, + "step": 39670 + }, + { + "epoch": 5.63236337828247, + "grad_norm": 7.435208797454834, + "learning_rate": 9.437019162526614e-05, + "loss": 0.05277242660522461, + "step": 39680 + }, + { + "epoch": 5.633782824698367, + "grad_norm": 0.2021070271730423, + "learning_rate": 9.436877217885025e-05, + "loss": 0.07399315237998963, + "step": 39690 + }, + { + "epoch": 5.635202271114266, + "grad_norm": 2.765399932861328, + "learning_rate": 9.436735273243435e-05, + "loss": 0.030564799904823303, + "step": 39700 + }, + { + "epoch": 5.636621717530163, + "grad_norm": 6.345180511474609, + "learning_rate": 9.436593328601846e-05, + "loss": 0.03482165336608887, + "step": 39710 + }, + { + "epoch": 5.6380411639460615, + "grad_norm": 12.207220077514648, + "learning_rate": 9.436451383960256e-05, + "loss": 0.07414867281913758, + "step": 39720 + }, + { + "epoch": 5.639460610361959, + "grad_norm": 1.6320226192474365, + "learning_rate": 9.436309439318666e-05, + "loss": 0.03807423412799835, + "step": 39730 + }, + { + "epoch": 5.640880056777856, + "grad_norm": 10.701844215393066, + "learning_rate": 9.436167494677076e-05, + "loss": 0.1101304531097412, + "step": 39740 + }, + { + "epoch": 5.642299503193755, + "grad_norm": 8.42212963104248, + "learning_rate": 9.436025550035487e-05, + "loss": 0.06819977760314941, + "step": 39750 + }, + { + "epoch": 5.643718949609652, + 
"grad_norm": 3.634274959564209, + "learning_rate": 9.435883605393898e-05, + "loss": 0.032803896069526675, + "step": 39760 + }, + { + "epoch": 5.64513839602555, + "grad_norm": 6.771927356719971, + "learning_rate": 9.435741660752307e-05, + "loss": 0.07954181432723999, + "step": 39770 + }, + { + "epoch": 5.646557842441448, + "grad_norm": 2.6635067462921143, + "learning_rate": 9.435599716110717e-05, + "loss": 0.04880297780036926, + "step": 39780 + }, + { + "epoch": 5.647977288857346, + "grad_norm": 5.2400922775268555, + "learning_rate": 9.435457771469127e-05, + "loss": 0.09463647603988648, + "step": 39790 + }, + { + "epoch": 5.6493967352732435, + "grad_norm": 0.019930781796574593, + "learning_rate": 9.435315826827538e-05, + "loss": 0.03061905801296234, + "step": 39800 + }, + { + "epoch": 5.650816181689141, + "grad_norm": 0.7077277302742004, + "learning_rate": 9.435173882185948e-05, + "loss": 0.060614013671875, + "step": 39810 + }, + { + "epoch": 5.652235628105039, + "grad_norm": 6.0713348388671875, + "learning_rate": 9.435031937544359e-05, + "loss": 0.025630703568458556, + "step": 39820 + }, + { + "epoch": 5.653655074520937, + "grad_norm": 4.086087226867676, + "learning_rate": 9.434889992902767e-05, + "loss": 0.018952296674251558, + "step": 39830 + }, + { + "epoch": 5.655074520936835, + "grad_norm": 1.135500431060791, + "learning_rate": 9.434748048261178e-05, + "loss": 0.03191319704055786, + "step": 39840 + }, + { + "epoch": 5.656493967352732, + "grad_norm": 0.2785182595252991, + "learning_rate": 9.43460610361959e-05, + "loss": 0.02202431410551071, + "step": 39850 + }, + { + "epoch": 5.657913413768631, + "grad_norm": 0.11208788305521011, + "learning_rate": 9.434464158977999e-05, + "loss": 0.03365016877651215, + "step": 39860 + }, + { + "epoch": 5.659332860184528, + "grad_norm": 0.42555415630340576, + "learning_rate": 9.43432221433641e-05, + "loss": 0.04779731035232544, + "step": 39870 + }, + { + "epoch": 5.6607523066004255, + "grad_norm": 0.16418029367923737, + 
"learning_rate": 9.434180269694819e-05, + "loss": 0.03143316805362702, + "step": 39880 + }, + { + "epoch": 5.662171753016324, + "grad_norm": 8.50043773651123, + "learning_rate": 9.43403832505323e-05, + "loss": 0.09453284740447998, + "step": 39890 + }, + { + "epoch": 5.663591199432221, + "grad_norm": 11.144037246704102, + "learning_rate": 9.43389638041164e-05, + "loss": 0.015583939850330353, + "step": 39900 + }, + { + "epoch": 5.6650106458481195, + "grad_norm": 3.8421754837036133, + "learning_rate": 9.43375443577005e-05, + "loss": 0.04787985980510712, + "step": 39910 + }, + { + "epoch": 5.666430092264017, + "grad_norm": 2.839329481124878, + "learning_rate": 9.43361249112846e-05, + "loss": 0.05071015357971191, + "step": 39920 + }, + { + "epoch": 5.667849538679915, + "grad_norm": 1.3259540796279907, + "learning_rate": 9.433470546486871e-05, + "loss": 0.06693935990333558, + "step": 39930 + }, + { + "epoch": 5.669268985095813, + "grad_norm": 1.9949698448181152, + "learning_rate": 9.43332860184528e-05, + "loss": 0.09938514232635498, + "step": 39940 + }, + { + "epoch": 5.67068843151171, + "grad_norm": 5.349208831787109, + "learning_rate": 9.433186657203691e-05, + "loss": 0.07142342925071717, + "step": 39950 + }, + { + "epoch": 5.672107877927608, + "grad_norm": 6.508642673492432, + "learning_rate": 9.433044712562102e-05, + "loss": 0.07783754467964173, + "step": 39960 + }, + { + "epoch": 5.673527324343506, + "grad_norm": 6.658568859100342, + "learning_rate": 9.432902767920512e-05, + "loss": 0.08317623138427735, + "step": 39970 + }, + { + "epoch": 5.674946770759404, + "grad_norm": 0.1687992364168167, + "learning_rate": 9.432760823278923e-05, + "loss": 0.0768187940120697, + "step": 39980 + }, + { + "epoch": 5.6763662171753015, + "grad_norm": 2.4580390453338623, + "learning_rate": 9.432618878637331e-05, + "loss": 0.08780956864356995, + "step": 39990 + }, + { + "epoch": 5.6777856635912, + "grad_norm": 0.24730490148067474, + "learning_rate": 9.432476933995742e-05, + "loss": 
0.06389739513397216, + "step": 40000 + }, + { + "epoch": 5.6777856635912, + "eval_accuracy": 0.9646467857824124, + "eval_loss": 0.1065235510468483, + "eval_runtime": 35.9304, + "eval_samples_per_second": 437.708, + "eval_steps_per_second": 13.693, + "step": 40000 + }, + { + "epoch": 5.679205110007097, + "grad_norm": 1.2596027851104736, + "learning_rate": 9.432334989354152e-05, + "loss": 0.025333791971206665, + "step": 40010 + }, + { + "epoch": 5.680624556422995, + "grad_norm": 4.28132963180542, + "learning_rate": 9.432193044712563e-05, + "loss": 0.07308688759803772, + "step": 40020 + }, + { + "epoch": 5.682044002838893, + "grad_norm": 2.235490322113037, + "learning_rate": 9.432051100070973e-05, + "loss": 0.04388102889060974, + "step": 40030 + }, + { + "epoch": 5.68346344925479, + "grad_norm": 0.5646092891693115, + "learning_rate": 9.431909155429383e-05, + "loss": 0.07124125361442565, + "step": 40040 + }, + { + "epoch": 5.684882895670689, + "grad_norm": 0.07894527912139893, + "learning_rate": 9.431767210787794e-05, + "loss": 0.06131964325904846, + "step": 40050 + }, + { + "epoch": 5.686302342086586, + "grad_norm": 1.2791597843170166, + "learning_rate": 9.431625266146203e-05, + "loss": 0.05140694975852966, + "step": 40060 + }, + { + "epoch": 5.687721788502484, + "grad_norm": 0.9377229809761047, + "learning_rate": 9.431483321504614e-05, + "loss": 0.07500240802764893, + "step": 40070 + }, + { + "epoch": 5.689141234918382, + "grad_norm": 5.653379440307617, + "learning_rate": 9.431341376863024e-05, + "loss": 0.05739631056785584, + "step": 40080 + }, + { + "epoch": 5.690560681334279, + "grad_norm": 5.261422634124756, + "learning_rate": 9.431199432221434e-05, + "loss": 0.06874457597732545, + "step": 40090 + }, + { + "epoch": 5.691980127750178, + "grad_norm": 9.214577674865723, + "learning_rate": 9.431057487579844e-05, + "loss": 0.044308590888977054, + "step": 40100 + }, + { + "epoch": 5.693399574166075, + "grad_norm": 1.361165165901184, + "learning_rate": 
9.430915542938255e-05, + "loss": 0.022246035933494567, + "step": 40110 + }, + { + "epoch": 5.694819020581973, + "grad_norm": 8.207006454467773, + "learning_rate": 9.430773598296665e-05, + "loss": 0.047158649563789366, + "step": 40120 + }, + { + "epoch": 5.696238466997871, + "grad_norm": 4.108800411224365, + "learning_rate": 9.430631653655076e-05, + "loss": 0.07500581741333008, + "step": 40130 + }, + { + "epoch": 5.697657913413769, + "grad_norm": 8.759567260742188, + "learning_rate": 9.430489709013485e-05, + "loss": 0.038922271132469176, + "step": 40140 + }, + { + "epoch": 5.6990773598296665, + "grad_norm": 4.76708459854126, + "learning_rate": 9.430347764371895e-05, + "loss": 0.030303937196731568, + "step": 40150 + }, + { + "epoch": 5.700496806245564, + "grad_norm": 0.7679837942123413, + "learning_rate": 9.430205819730306e-05, + "loss": 0.04279916882514954, + "step": 40160 + }, + { + "epoch": 5.701916252661462, + "grad_norm": 1.9951245784759521, + "learning_rate": 9.430063875088716e-05, + "loss": 0.04763171076774597, + "step": 40170 + }, + { + "epoch": 5.70333569907736, + "grad_norm": 1.4225386381149292, + "learning_rate": 9.429921930447127e-05, + "loss": 0.059112942218780516, + "step": 40180 + }, + { + "epoch": 5.704755145493258, + "grad_norm": 5.624825477600098, + "learning_rate": 9.429779985805535e-05, + "loss": 0.08224546909332275, + "step": 40190 + }, + { + "epoch": 5.706174591909155, + "grad_norm": 4.557068824768066, + "learning_rate": 9.429638041163946e-05, + "loss": 0.07699850797653199, + "step": 40200 + }, + { + "epoch": 5.707594038325054, + "grad_norm": 2.6185569763183594, + "learning_rate": 9.429496096522356e-05, + "loss": 0.09195753931999207, + "step": 40210 + }, + { + "epoch": 5.709013484740951, + "grad_norm": 9.599045753479004, + "learning_rate": 9.429354151880767e-05, + "loss": 0.07318518757820129, + "step": 40220 + }, + { + "epoch": 5.7104329311568485, + "grad_norm": 8.182303428649902, + "learning_rate": 9.429212207239177e-05, + "loss": 
0.06409326791763306, + "step": 40230 + }, + { + "epoch": 5.711852377572747, + "grad_norm": 1.4199198484420776, + "learning_rate": 9.429070262597587e-05, + "loss": 0.05758379697799683, + "step": 40240 + }, + { + "epoch": 5.713271823988644, + "grad_norm": 0.9231958389282227, + "learning_rate": 9.428928317955998e-05, + "loss": 0.04129588007926941, + "step": 40250 + }, + { + "epoch": 5.7146912704045425, + "grad_norm": 4.508418560028076, + "learning_rate": 9.428786373314408e-05, + "loss": 0.06449969410896302, + "step": 40260 + }, + { + "epoch": 5.71611071682044, + "grad_norm": 7.092264652252197, + "learning_rate": 9.428644428672819e-05, + "loss": 0.045121192932128906, + "step": 40270 + }, + { + "epoch": 5.717530163236338, + "grad_norm": 0.6932799816131592, + "learning_rate": 9.428502484031228e-05, + "loss": 0.03739106059074402, + "step": 40280 + }, + { + "epoch": 5.718949609652236, + "grad_norm": 0.5188174247741699, + "learning_rate": 9.42836053938964e-05, + "loss": 0.042255252599716187, + "step": 40290 + }, + { + "epoch": 5.720369056068133, + "grad_norm": 1.1262272596359253, + "learning_rate": 9.428218594748048e-05, + "loss": 0.03085188865661621, + "step": 40300 + }, + { + "epoch": 5.721788502484031, + "grad_norm": 1.6696752309799194, + "learning_rate": 9.428076650106459e-05, + "loss": 0.02104969471693039, + "step": 40310 + }, + { + "epoch": 5.723207948899929, + "grad_norm": 1.4816834926605225, + "learning_rate": 9.427934705464869e-05, + "loss": 0.03856719136238098, + "step": 40320 + }, + { + "epoch": 5.724627395315827, + "grad_norm": 2.560551643371582, + "learning_rate": 9.42779276082328e-05, + "loss": 0.06555436849594116, + "step": 40330 + }, + { + "epoch": 5.7260468417317245, + "grad_norm": 6.567645072937012, + "learning_rate": 9.42765081618169e-05, + "loss": 0.05767791271209717, + "step": 40340 + }, + { + "epoch": 5.727466288147623, + "grad_norm": 7.098581790924072, + "learning_rate": 9.4275088715401e-05, + "loss": 0.08214937448501587, + "step": 40350 + }, + { + 
"epoch": 5.72888573456352, + "grad_norm": 0.1502898931503296, + "learning_rate": 9.42736692689851e-05, + "loss": 0.07196863293647766, + "step": 40360 + }, + { + "epoch": 5.730305180979418, + "grad_norm": 7.313935279846191, + "learning_rate": 9.42722498225692e-05, + "loss": 0.04690050184726715, + "step": 40370 + }, + { + "epoch": 5.731724627395316, + "grad_norm": 4.765439510345459, + "learning_rate": 9.427083037615331e-05, + "loss": 0.11581621170043946, + "step": 40380 + }, + { + "epoch": 5.733144073811213, + "grad_norm": 0.062072690576314926, + "learning_rate": 9.426941092973741e-05, + "loss": 0.04716021716594696, + "step": 40390 + }, + { + "epoch": 5.734563520227112, + "grad_norm": 6.344238758087158, + "learning_rate": 9.426799148332151e-05, + "loss": 0.07715264558792115, + "step": 40400 + }, + { + "epoch": 5.735982966643009, + "grad_norm": 4.1125593185424805, + "learning_rate": 9.42665720369056e-05, + "loss": 0.1007968544960022, + "step": 40410 + }, + { + "epoch": 5.737402413058907, + "grad_norm": 0.2826620936393738, + "learning_rate": 9.426515259048972e-05, + "loss": 0.05582694411277771, + "step": 40420 + }, + { + "epoch": 5.738821859474805, + "grad_norm": 8.204134941101074, + "learning_rate": 9.426373314407381e-05, + "loss": 0.03588265776634216, + "step": 40430 + }, + { + "epoch": 5.740241305890702, + "grad_norm": 3.2922136783599854, + "learning_rate": 9.426231369765792e-05, + "loss": 0.06283365488052368, + "step": 40440 + }, + { + "epoch": 5.741660752306601, + "grad_norm": 5.972446441650391, + "learning_rate": 9.426089425124202e-05, + "loss": 0.059499341249465945, + "step": 40450 + }, + { + "epoch": 5.743080198722498, + "grad_norm": 3.592957019805908, + "learning_rate": 9.425947480482612e-05, + "loss": 0.07895704507827758, + "step": 40460 + }, + { + "epoch": 5.744499645138396, + "grad_norm": 2.2135820388793945, + "learning_rate": 9.425805535841023e-05, + "loss": 0.045309117436408995, + "step": 40470 + }, + { + "epoch": 5.745919091554294, + "grad_norm": 
7.3867340087890625, + "learning_rate": 9.425663591199433e-05, + "loss": 0.04623596966266632, + "step": 40480 + }, + { + "epoch": 5.747338537970192, + "grad_norm": 0.20550206303596497, + "learning_rate": 9.425521646557844e-05, + "loss": 0.05645106434822082, + "step": 40490 + }, + { + "epoch": 5.748757984386089, + "grad_norm": 0.5938138365745544, + "learning_rate": 9.425379701916252e-05, + "loss": 0.03962576985359192, + "step": 40500 + }, + { + "epoch": 5.748757984386089, + "eval_accuracy": 0.9722133909836587, + "eval_loss": 0.08429045230150223, + "eval_runtime": 33.4191, + "eval_samples_per_second": 470.6, + "eval_steps_per_second": 14.722, + "step": 40500 + }, + { + "epoch": 5.750177430801987, + "grad_norm": 2.9440619945526123, + "learning_rate": 9.425237757274663e-05, + "loss": 0.06737836599349975, + "step": 40510 + }, + { + "epoch": 5.751596877217885, + "grad_norm": 13.42500114440918, + "learning_rate": 9.425095812633073e-05, + "loss": 0.14449608325958252, + "step": 40520 + }, + { + "epoch": 5.753016323633783, + "grad_norm": 0.19540852308273315, + "learning_rate": 9.424953867991484e-05, + "loss": 0.11976728439331055, + "step": 40530 + }, + { + "epoch": 5.754435770049681, + "grad_norm": 5.771694660186768, + "learning_rate": 9.424811923349894e-05, + "loss": 0.0465570330619812, + "step": 40540 + }, + { + "epoch": 5.755855216465578, + "grad_norm": 0.7781568169593811, + "learning_rate": 9.424669978708304e-05, + "loss": 0.06043409109115601, + "step": 40550 + }, + { + "epoch": 5.757274662881477, + "grad_norm": 4.123456954956055, + "learning_rate": 9.424528034066715e-05, + "loss": 0.02993989586830139, + "step": 40560 + }, + { + "epoch": 5.758694109297374, + "grad_norm": 5.118284225463867, + "learning_rate": 9.424386089425124e-05, + "loss": 0.0467894971370697, + "step": 40570 + }, + { + "epoch": 5.760113555713271, + "grad_norm": 2.537217378616333, + "learning_rate": 9.424244144783535e-05, + "loss": 0.028548663854598998, + "step": 40580 + }, + { + "epoch": 
5.76153300212917, + "grad_norm": 7.4742112159729, + "learning_rate": 9.424102200141945e-05, + "loss": 0.0495083749294281, + "step": 40590 + }, + { + "epoch": 5.762952448545067, + "grad_norm": 6.256778240203857, + "learning_rate": 9.423960255500355e-05, + "loss": 0.05922043919563293, + "step": 40600 + }, + { + "epoch": 5.7643718949609655, + "grad_norm": 5.0860490798950195, + "learning_rate": 9.423818310858765e-05, + "loss": 0.05987945795059204, + "step": 40610 + }, + { + "epoch": 5.765791341376863, + "grad_norm": 12.065763473510742, + "learning_rate": 9.423676366217176e-05, + "loss": 0.05129314064979553, + "step": 40620 + }, + { + "epoch": 5.767210787792761, + "grad_norm": 2.2333362102508545, + "learning_rate": 9.423534421575586e-05, + "loss": 0.025675442814826966, + "step": 40630 + }, + { + "epoch": 5.768630234208659, + "grad_norm": 0.7619284987449646, + "learning_rate": 9.423392476933997e-05, + "loss": 0.03333350419998169, + "step": 40640 + }, + { + "epoch": 5.770049680624556, + "grad_norm": 3.0292410850524902, + "learning_rate": 9.423250532292406e-05, + "loss": 0.032198408246040346, + "step": 40650 + }, + { + "epoch": 5.771469127040454, + "grad_norm": 1.155750036239624, + "learning_rate": 9.423108587650816e-05, + "loss": 0.03320641815662384, + "step": 40660 + }, + { + "epoch": 5.772888573456352, + "grad_norm": 2.6847808361053467, + "learning_rate": 9.422966643009227e-05, + "loss": 0.05150899887084961, + "step": 40670 + }, + { + "epoch": 5.77430801987225, + "grad_norm": 3.288931369781494, + "learning_rate": 9.422824698367637e-05, + "loss": 0.06830101013183594, + "step": 40680 + }, + { + "epoch": 5.7757274662881475, + "grad_norm": 6.511233806610107, + "learning_rate": 9.422682753726048e-05, + "loss": 0.059491848945617674, + "step": 40690 + }, + { + "epoch": 5.777146912704046, + "grad_norm": 0.7103070616722107, + "learning_rate": 9.422540809084458e-05, + "loss": 0.06547984480857849, + "step": 40700 + }, + { + "epoch": 5.778566359119943, + "grad_norm": 
0.3059501647949219, + "learning_rate": 9.422398864442868e-05, + "loss": 0.05114136934280396, + "step": 40710 + }, + { + "epoch": 5.779985805535841, + "grad_norm": 5.036358833312988, + "learning_rate": 9.422256919801277e-05, + "loss": 0.04435153007507324, + "step": 40720 + }, + { + "epoch": 5.781405251951739, + "grad_norm": 1.1257925033569336, + "learning_rate": 9.422114975159688e-05, + "loss": 0.07170879244804382, + "step": 40730 + }, + { + "epoch": 5.782824698367636, + "grad_norm": 2.28365421295166, + "learning_rate": 9.421973030518098e-05, + "loss": 0.08844174146652221, + "step": 40740 + }, + { + "epoch": 5.784244144783535, + "grad_norm": 6.227928638458252, + "learning_rate": 9.421845280340668e-05, + "loss": 0.03891546130180359, + "step": 40750 + }, + { + "epoch": 5.785663591199432, + "grad_norm": 7.569578170776367, + "learning_rate": 9.421703335699078e-05, + "loss": 0.07715065479278564, + "step": 40760 + }, + { + "epoch": 5.78708303761533, + "grad_norm": 3.209096670150757, + "learning_rate": 9.421561391057489e-05, + "loss": 0.05780371427536011, + "step": 40770 + }, + { + "epoch": 5.788502484031228, + "grad_norm": 5.905817031860352, + "learning_rate": 9.421419446415897e-05, + "loss": 0.06400578618049621, + "step": 40780 + }, + { + "epoch": 5.789921930447125, + "grad_norm": 1.9129750728607178, + "learning_rate": 9.421277501774308e-05, + "loss": 0.07512367963790893, + "step": 40790 + }, + { + "epoch": 5.7913413768630235, + "grad_norm": 1.1263295412063599, + "learning_rate": 9.42113555713272e-05, + "loss": 0.038415607810020444, + "step": 40800 + }, + { + "epoch": 5.792760823278921, + "grad_norm": 0.9057685732841492, + "learning_rate": 9.420993612491129e-05, + "loss": 0.025888726115226746, + "step": 40810 + }, + { + "epoch": 5.794180269694819, + "grad_norm": 5.407680988311768, + "learning_rate": 9.42085166784954e-05, + "loss": 0.08174926638603211, + "step": 40820 + }, + { + "epoch": 5.795599716110717, + "grad_norm": 2.7758450508117676, + "learning_rate": 
9.420709723207949e-05, + "loss": 0.04903877377510071, + "step": 40830 + }, + { + "epoch": 5.797019162526615, + "grad_norm": 3.875502109527588, + "learning_rate": 9.42056777856636e-05, + "loss": 0.0529244601726532, + "step": 40840 + }, + { + "epoch": 5.798438608942512, + "grad_norm": 2.0841777324676514, + "learning_rate": 9.42042583392477e-05, + "loss": 0.03719224333763123, + "step": 40850 + }, + { + "epoch": 5.79985805535841, + "grad_norm": 0.20633159577846527, + "learning_rate": 9.42028388928318e-05, + "loss": 0.0415228933095932, + "step": 40860 + }, + { + "epoch": 5.801277501774308, + "grad_norm": 1.6225624084472656, + "learning_rate": 9.42014194464159e-05, + "loss": 0.07819415926933289, + "step": 40870 + }, + { + "epoch": 5.8026969481902055, + "grad_norm": 3.071683883666992, + "learning_rate": 9.42e-05, + "loss": 0.037554305791854856, + "step": 40880 + }, + { + "epoch": 5.804116394606104, + "grad_norm": 0.5660277009010315, + "learning_rate": 9.419858055358411e-05, + "loss": 0.0829436719417572, + "step": 40890 + }, + { + "epoch": 5.805535841022001, + "grad_norm": 0.9265013933181763, + "learning_rate": 9.419716110716821e-05, + "loss": 0.035265910625457766, + "step": 40900 + }, + { + "epoch": 5.8069552874379, + "grad_norm": 0.10838378220796585, + "learning_rate": 9.419574166075232e-05, + "loss": 0.03172871470451355, + "step": 40910 + }, + { + "epoch": 5.808374733853797, + "grad_norm": 11.150486946105957, + "learning_rate": 9.419432221433642e-05, + "loss": 0.05358134508132935, + "step": 40920 + }, + { + "epoch": 5.809794180269694, + "grad_norm": 5.653853416442871, + "learning_rate": 9.419290276792051e-05, + "loss": 0.053334379196166994, + "step": 40930 + }, + { + "epoch": 5.811213626685593, + "grad_norm": 0.3218175172805786, + "learning_rate": 9.419148332150461e-05, + "loss": 0.04683685302734375, + "step": 40940 + }, + { + "epoch": 5.81263307310149, + "grad_norm": 2.429058313369751, + "learning_rate": 9.419006387508872e-05, + "loss": 0.034245806932449344, + "step": 
40950 + }, + { + "epoch": 5.814052519517388, + "grad_norm": 5.519045829772949, + "learning_rate": 9.418864442867282e-05, + "loss": 0.09080212116241455, + "step": 40960 + }, + { + "epoch": 5.815471965933286, + "grad_norm": 2.262676477432251, + "learning_rate": 9.418722498225693e-05, + "loss": 0.04305407404899597, + "step": 40970 + }, + { + "epoch": 5.816891412349184, + "grad_norm": 6.300297260284424, + "learning_rate": 9.418580553584103e-05, + "loss": 0.08451374769210815, + "step": 40980 + }, + { + "epoch": 5.818310858765082, + "grad_norm": 6.759078502655029, + "learning_rate": 9.418438608942512e-05, + "loss": 0.040090644359588624, + "step": 40990 + }, + { + "epoch": 5.819730305180979, + "grad_norm": 0.04662134125828743, + "learning_rate": 9.418296664300924e-05, + "loss": 0.0369631290435791, + "step": 41000 + }, + { + "epoch": 5.819730305180979, + "eval_accuracy": 0.9738665988427545, + "eval_loss": 0.07505597919225693, + "eval_runtime": 33.4209, + "eval_samples_per_second": 470.574, + "eval_steps_per_second": 14.721, + "step": 41000 + }, + { + "epoch": 5.821149751596877, + "grad_norm": 1.048794150352478, + "learning_rate": 9.418154719659333e-05, + "loss": 0.03393426835536957, + "step": 41010 + }, + { + "epoch": 5.822569198012775, + "grad_norm": 1.4327489137649536, + "learning_rate": 9.418012775017744e-05, + "loss": 0.03338783085346222, + "step": 41020 + }, + { + "epoch": 5.823988644428673, + "grad_norm": 0.037851158529520035, + "learning_rate": 9.417870830376154e-05, + "loss": 0.021252821385860442, + "step": 41030 + }, + { + "epoch": 5.8254080908445705, + "grad_norm": 2.149754285812378, + "learning_rate": 9.417728885734564e-05, + "loss": 0.06314542889595032, + "step": 41040 + }, + { + "epoch": 5.826827537260469, + "grad_norm": 0.4033157229423523, + "learning_rate": 9.417586941092974e-05, + "loss": 0.02093043476343155, + "step": 41050 + }, + { + "epoch": 5.828246983676366, + "grad_norm": 2.183093547821045, + "learning_rate": 9.417444996451385e-05, + "loss": 
0.02726356089115143, + "step": 41060 + }, + { + "epoch": 5.829666430092264, + "grad_norm": 4.68568229675293, + "learning_rate": 9.417303051809794e-05, + "loss": 0.05667162537574768, + "step": 41070 + }, + { + "epoch": 5.831085876508162, + "grad_norm": 0.24591878056526184, + "learning_rate": 9.417161107168206e-05, + "loss": 0.046596580743789674, + "step": 41080 + }, + { + "epoch": 5.832505322924059, + "grad_norm": 2.6592085361480713, + "learning_rate": 9.417019162526615e-05, + "loss": 0.05997686982154846, + "step": 41090 + }, + { + "epoch": 5.833924769339958, + "grad_norm": 15.260762214660645, + "learning_rate": 9.416877217885025e-05, + "loss": 0.0799644410610199, + "step": 41100 + }, + { + "epoch": 5.835344215755855, + "grad_norm": 6.797810077667236, + "learning_rate": 9.416735273243436e-05, + "loss": 0.04322269558906555, + "step": 41110 + }, + { + "epoch": 5.836763662171753, + "grad_norm": 0.5926470160484314, + "learning_rate": 9.416593328601846e-05, + "loss": 0.029363343119621278, + "step": 41120 + }, + { + "epoch": 5.838183108587651, + "grad_norm": 0.012577028945088387, + "learning_rate": 9.416451383960257e-05, + "loss": 0.038662773370742795, + "step": 41130 + }, + { + "epoch": 5.839602555003548, + "grad_norm": 11.325472831726074, + "learning_rate": 9.416309439318665e-05, + "loss": 0.11654891967773437, + "step": 41140 + }, + { + "epoch": 5.8410220014194465, + "grad_norm": 3.3514904975891113, + "learning_rate": 9.416167494677076e-05, + "loss": 0.03298133611679077, + "step": 41150 + }, + { + "epoch": 5.842441447835344, + "grad_norm": 4.035382270812988, + "learning_rate": 9.416025550035486e-05, + "loss": 0.0738753318786621, + "step": 41160 + }, + { + "epoch": 5.843860894251242, + "grad_norm": 0.33365610241889954, + "learning_rate": 9.415883605393897e-05, + "loss": 0.06353077292442322, + "step": 41170 + }, + { + "epoch": 5.84528034066714, + "grad_norm": 0.7018740773200989, + "learning_rate": 9.415741660752307e-05, + "loss": 0.08425546288490296, + "step": 41180 + }, 
+ { + "epoch": 5.846699787083038, + "grad_norm": 6.3598151206970215, + "learning_rate": 9.415599716110717e-05, + "loss": 0.06130687594413757, + "step": 41190 + }, + { + "epoch": 5.848119233498935, + "grad_norm": 7.904303550720215, + "learning_rate": 9.415457771469128e-05, + "loss": 0.0328709602355957, + "step": 41200 + }, + { + "epoch": 5.849538679914833, + "grad_norm": 4.881137371063232, + "learning_rate": 9.415315826827538e-05, + "loss": 0.03672412037849426, + "step": 41210 + }, + { + "epoch": 5.850958126330731, + "grad_norm": 5.240328311920166, + "learning_rate": 9.415173882185949e-05, + "loss": 0.04333122968673706, + "step": 41220 + }, + { + "epoch": 5.8523775727466285, + "grad_norm": 6.439934253692627, + "learning_rate": 9.415031937544358e-05, + "loss": 0.06000009775161743, + "step": 41230 + }, + { + "epoch": 5.853797019162527, + "grad_norm": 2.4920706748962402, + "learning_rate": 9.414889992902768e-05, + "loss": 0.05984145402908325, + "step": 41240 + }, + { + "epoch": 5.855216465578424, + "grad_norm": 4.731122970581055, + "learning_rate": 9.414748048261178e-05, + "loss": 0.059812134504318236, + "step": 41250 + }, + { + "epoch": 5.8566359119943225, + "grad_norm": 0.4531116187572479, + "learning_rate": 9.414606103619589e-05, + "loss": 0.07219678163528442, + "step": 41260 + }, + { + "epoch": 5.85805535841022, + "grad_norm": 1.3103761672973633, + "learning_rate": 9.414464158977999e-05, + "loss": 0.022967004776000978, + "step": 41270 + }, + { + "epoch": 5.859474804826117, + "grad_norm": 3.049083709716797, + "learning_rate": 9.41432221433641e-05, + "loss": 0.04617577791213989, + "step": 41280 + }, + { + "epoch": 5.860894251242016, + "grad_norm": 6.9003472328186035, + "learning_rate": 9.41418026969482e-05, + "loss": 0.05837984681129456, + "step": 41290 + }, + { + "epoch": 5.862313697657913, + "grad_norm": 4.321074485778809, + "learning_rate": 9.414038325053229e-05, + "loss": 0.05662268400192261, + "step": 41300 + }, + { + "epoch": 5.863733144073811, + "grad_norm": 
1.8019779920578003, + "learning_rate": 9.41389638041164e-05, + "loss": 0.04414930045604706, + "step": 41310 + }, + { + "epoch": 5.865152590489709, + "grad_norm": 4.754904270172119, + "learning_rate": 9.41375443577005e-05, + "loss": 0.0796053946018219, + "step": 41320 + }, + { + "epoch": 5.866572036905607, + "grad_norm": 0.6924619078636169, + "learning_rate": 9.413612491128461e-05, + "loss": 0.04946174323558807, + "step": 41330 + }, + { + "epoch": 5.8679914833215046, + "grad_norm": 0.18079739809036255, + "learning_rate": 9.413470546486871e-05, + "loss": 0.10200127363204955, + "step": 41340 + }, + { + "epoch": 5.869410929737402, + "grad_norm": 9.690866470336914, + "learning_rate": 9.41332860184528e-05, + "loss": 0.060866284370422366, + "step": 41350 + }, + { + "epoch": 5.8708303761533, + "grad_norm": 3.259993076324463, + "learning_rate": 9.41318665720369e-05, + "loss": 0.05056840181350708, + "step": 41360 + }, + { + "epoch": 5.872249822569198, + "grad_norm": 0.3155812919139862, + "learning_rate": 9.413044712562101e-05, + "loss": 0.058551359176635745, + "step": 41370 + }, + { + "epoch": 5.873669268985096, + "grad_norm": 6.132554531097412, + "learning_rate": 9.412902767920511e-05, + "loss": 0.059990334510803225, + "step": 41380 + }, + { + "epoch": 5.875088715400993, + "grad_norm": 0.24582448601722717, + "learning_rate": 9.412760823278922e-05, + "loss": 0.06940883994102479, + "step": 41390 + }, + { + "epoch": 5.876508161816892, + "grad_norm": 2.3477296829223633, + "learning_rate": 9.412618878637332e-05, + "loss": 0.041541433334350585, + "step": 41400 + }, + { + "epoch": 5.877927608232789, + "grad_norm": 1.636675477027893, + "learning_rate": 9.412476933995742e-05, + "loss": 0.0951632797718048, + "step": 41410 + }, + { + "epoch": 5.879347054648687, + "grad_norm": 1.0382994413375854, + "learning_rate": 9.412334989354153e-05, + "loss": 0.04100165367126465, + "step": 41420 + }, + { + "epoch": 5.880766501064585, + "grad_norm": 3.1466052532196045, + "learning_rate": 
9.412193044712563e-05, + "loss": 0.024977406859397887, + "step": 41430 + }, + { + "epoch": 5.882185947480482, + "grad_norm": 3.585829019546509, + "learning_rate": 9.412051100070974e-05, + "loss": 0.06898729801177979, + "step": 41440 + }, + { + "epoch": 5.883605393896381, + "grad_norm": 8.466485977172852, + "learning_rate": 9.411909155429382e-05, + "loss": 0.10010728836059571, + "step": 41450 + }, + { + "epoch": 5.885024840312278, + "grad_norm": 0.8343414068222046, + "learning_rate": 9.411767210787793e-05, + "loss": 0.05715740919113159, + "step": 41460 + }, + { + "epoch": 5.886444286728176, + "grad_norm": 9.334309577941895, + "learning_rate": 9.411625266146203e-05, + "loss": 0.12129504680633545, + "step": 41470 + }, + { + "epoch": 5.887863733144074, + "grad_norm": 4.06323766708374, + "learning_rate": 9.411483321504614e-05, + "loss": 0.030147609114646912, + "step": 41480 + }, + { + "epoch": 5.889283179559971, + "grad_norm": 2.0144970417022705, + "learning_rate": 9.411341376863024e-05, + "loss": 0.06269176602363587, + "step": 41490 + }, + { + "epoch": 5.8907026259758695, + "grad_norm": 1.7438490390777588, + "learning_rate": 9.411199432221433e-05, + "loss": 0.0560301661491394, + "step": 41500 + }, + { + "epoch": 5.8907026259758695, + "eval_accuracy": 0.9686526355948369, + "eval_loss": 0.09519174695014954, + "eval_runtime": 35.0451, + "eval_samples_per_second": 448.765, + "eval_steps_per_second": 14.039, + "step": 41500 + }, + { + "epoch": 5.892122072391767, + "grad_norm": 15.525542259216309, + "learning_rate": 9.411057487579845e-05, + "loss": 0.13895368576049805, + "step": 41510 + }, + { + "epoch": 5.893541518807665, + "grad_norm": 8.046239852905273, + "learning_rate": 9.410915542938254e-05, + "loss": 0.1090732455253601, + "step": 41520 + }, + { + "epoch": 5.894960965223563, + "grad_norm": 0.7156932353973389, + "learning_rate": 9.410773598296665e-05, + "loss": 0.07319698333740235, + "step": 41530 + }, + { + "epoch": 5.896380411639461, + "grad_norm": 4.401820659637451, 
+ "learning_rate": 9.410631653655075e-05, + "loss": 0.034980499744415285, + "step": 41540 + }, + { + "epoch": 5.897799858055358, + "grad_norm": 0.6378658413887024, + "learning_rate": 9.410489709013485e-05, + "loss": 0.03048483729362488, + "step": 41550 + }, + { + "epoch": 5.899219304471256, + "grad_norm": 6.762858867645264, + "learning_rate": 9.410347764371895e-05, + "loss": 0.11323305368423461, + "step": 41560 + }, + { + "epoch": 5.900638750887154, + "grad_norm": 2.8695895671844482, + "learning_rate": 9.410205819730306e-05, + "loss": 0.07545459866523743, + "step": 41570 + }, + { + "epoch": 5.9020581973030515, + "grad_norm": 3.977581024169922, + "learning_rate": 9.410063875088715e-05, + "loss": 0.031920188665390016, + "step": 41580 + }, + { + "epoch": 5.90347764371895, + "grad_norm": 0.39675480127334595, + "learning_rate": 9.409921930447127e-05, + "loss": 0.022741490602493288, + "step": 41590 + }, + { + "epoch": 5.904897090134847, + "grad_norm": 6.030241012573242, + "learning_rate": 9.409779985805536e-05, + "loss": 0.044648933410644534, + "step": 41600 + }, + { + "epoch": 5.9063165365507455, + "grad_norm": 2.218195676803589, + "learning_rate": 9.409638041163946e-05, + "loss": 0.02160325050354004, + "step": 41610 + }, + { + "epoch": 5.907735982966643, + "grad_norm": 1.204745888710022, + "learning_rate": 9.409496096522357e-05, + "loss": 0.06839647889137268, + "step": 41620 + }, + { + "epoch": 5.90915542938254, + "grad_norm": 0.15682987868785858, + "learning_rate": 9.409354151880767e-05, + "loss": 0.033197081089019774, + "step": 41630 + }, + { + "epoch": 5.910574875798439, + "grad_norm": 3.0369253158569336, + "learning_rate": 9.409212207239178e-05, + "loss": 0.06591796278953552, + "step": 41640 + }, + { + "epoch": 5.911994322214336, + "grad_norm": 5.717238426208496, + "learning_rate": 9.409070262597586e-05, + "loss": 0.06664312481880189, + "step": 41650 + }, + { + "epoch": 5.913413768630234, + "grad_norm": 0.033254224807024, + "learning_rate": 9.408928317955997e-05, + 
"loss": 0.0270003616809845, + "step": 41660 + }, + { + "epoch": 5.914833215046132, + "grad_norm": 0.2468717098236084, + "learning_rate": 9.408786373314407e-05, + "loss": 0.034814268350601196, + "step": 41670 + }, + { + "epoch": 5.91625266146203, + "grad_norm": 9.007941246032715, + "learning_rate": 9.408644428672818e-05, + "loss": 0.07450242042541504, + "step": 41680 + }, + { + "epoch": 5.9176721078779275, + "grad_norm": 11.96431827545166, + "learning_rate": 9.408502484031228e-05, + "loss": 0.047266215085983276, + "step": 41690 + }, + { + "epoch": 5.919091554293825, + "grad_norm": 4.716080188751221, + "learning_rate": 9.408360539389639e-05, + "loss": 0.04762662053108215, + "step": 41700 + }, + { + "epoch": 5.920511000709723, + "grad_norm": 7.223570823669434, + "learning_rate": 9.408218594748049e-05, + "loss": 0.06301666498184204, + "step": 41710 + }, + { + "epoch": 5.921930447125621, + "grad_norm": 9.651657104492188, + "learning_rate": 9.408076650106459e-05, + "loss": 0.052734434604644775, + "step": 41720 + }, + { + "epoch": 5.923349893541519, + "grad_norm": 0.2429005652666092, + "learning_rate": 9.40793470546487e-05, + "loss": 0.02907339930534363, + "step": 41730 + }, + { + "epoch": 5.924769339957416, + "grad_norm": 6.333399772644043, + "learning_rate": 9.40779276082328e-05, + "loss": 0.11444522142410278, + "step": 41740 + }, + { + "epoch": 5.926188786373315, + "grad_norm": 5.317239761352539, + "learning_rate": 9.40765081618169e-05, + "loss": 0.06607084274291992, + "step": 41750 + }, + { + "epoch": 5.927608232789212, + "grad_norm": 0.490640252828598, + "learning_rate": 9.407508871540099e-05, + "loss": 0.03365239799022675, + "step": 41760 + }, + { + "epoch": 5.9290276792051095, + "grad_norm": 8.204052925109863, + "learning_rate": 9.40736692689851e-05, + "loss": 0.0700852930545807, + "step": 41770 + }, + { + "epoch": 5.930447125621008, + "grad_norm": 0.6442670822143555, + "learning_rate": 9.40722498225692e-05, + "loss": 0.045039495825767516, + "step": 41780 + }, + { 
+ "epoch": 5.931866572036905, + "grad_norm": 7.721465587615967, + "learning_rate": 9.407083037615331e-05, + "loss": 0.06508231163024902, + "step": 41790 + }, + { + "epoch": 5.933286018452804, + "grad_norm": 1.8151298761367798, + "learning_rate": 9.40694109297374e-05, + "loss": 0.04082694351673126, + "step": 41800 + }, + { + "epoch": 5.934705464868701, + "grad_norm": 0.27777761220932007, + "learning_rate": 9.40679914833215e-05, + "loss": 0.03647947609424591, + "step": 41810 + }, + { + "epoch": 5.936124911284599, + "grad_norm": 2.4215736389160156, + "learning_rate": 9.406657203690561e-05, + "loss": 0.06282040476799011, + "step": 41820 + }, + { + "epoch": 5.937544357700497, + "grad_norm": 5.381174564361572, + "learning_rate": 9.406515259048971e-05, + "loss": 0.07924935817718506, + "step": 41830 + }, + { + "epoch": 5.938963804116394, + "grad_norm": 0.8996217250823975, + "learning_rate": 9.406373314407382e-05, + "loss": 0.0540692925453186, + "step": 41840 + }, + { + "epoch": 5.940383250532292, + "grad_norm": 0.28388017416000366, + "learning_rate": 9.406231369765792e-05, + "loss": 0.0712361752986908, + "step": 41850 + }, + { + "epoch": 5.94180269694819, + "grad_norm": 0.06892205774784088, + "learning_rate": 9.406089425124202e-05, + "loss": 0.08142430782318115, + "step": 41860 + }, + { + "epoch": 5.943222143364088, + "grad_norm": 4.6068434715271, + "learning_rate": 9.405947480482611e-05, + "loss": 0.07882195711135864, + "step": 41870 + }, + { + "epoch": 5.944641589779986, + "grad_norm": 7.996845722198486, + "learning_rate": 9.405805535841022e-05, + "loss": 0.06462631225585938, + "step": 41880 + }, + { + "epoch": 5.946061036195884, + "grad_norm": 5.027513027191162, + "learning_rate": 9.405663591199432e-05, + "loss": 0.03442648947238922, + "step": 41890 + }, + { + "epoch": 5.947480482611781, + "grad_norm": 10.417757034301758, + "learning_rate": 9.405521646557843e-05, + "loss": 0.05618232488632202, + "step": 41900 + }, + { + "epoch": 5.948899929027679, + "grad_norm": 
5.782607078552246, + "learning_rate": 9.405379701916253e-05, + "loss": 0.06721282005310059, + "step": 41910 + }, + { + "epoch": 5.950319375443577, + "grad_norm": 0.7739474773406982, + "learning_rate": 9.405237757274663e-05, + "loss": 0.03563763499259949, + "step": 41920 + }, + { + "epoch": 5.9517388218594744, + "grad_norm": 9.562399864196777, + "learning_rate": 9.405095812633074e-05, + "loss": 0.07549918293952942, + "step": 41930 + }, + { + "epoch": 5.953158268275373, + "grad_norm": 3.9365859031677246, + "learning_rate": 9.404953867991484e-05, + "loss": 0.052819907665252686, + "step": 41940 + }, + { + "epoch": 5.95457771469127, + "grad_norm": 0.7143135070800781, + "learning_rate": 9.404811923349895e-05, + "loss": 0.07506571412086487, + "step": 41950 + }, + { + "epoch": 5.9559971611071685, + "grad_norm": 1.248931646347046, + "learning_rate": 9.404669978708303e-05, + "loss": 0.06863842606544494, + "step": 41960 + }, + { + "epoch": 5.957416607523066, + "grad_norm": 0.1544966995716095, + "learning_rate": 9.404528034066714e-05, + "loss": 0.04283129572868347, + "step": 41970 + }, + { + "epoch": 5.958836053938963, + "grad_norm": 3.303541660308838, + "learning_rate": 9.404386089425124e-05, + "loss": 0.03427127003669739, + "step": 41980 + }, + { + "epoch": 5.960255500354862, + "grad_norm": 9.369416236877441, + "learning_rate": 9.404244144783535e-05, + "loss": 0.12133831977844238, + "step": 41990 + }, + { + "epoch": 5.961674946770759, + "grad_norm": 1.9785155057907104, + "learning_rate": 9.404102200141946e-05, + "loss": 0.06636718511581421, + "step": 42000 + }, + { + "epoch": 5.961674946770759, + "eval_accuracy": 0.9723405608189737, + "eval_loss": 0.08659365773200989, + "eval_runtime": 32.6749, + "eval_samples_per_second": 481.317, + "eval_steps_per_second": 15.057, + "step": 42000 + }, + { + "epoch": 5.963094393186657, + "grad_norm": 4.923141002655029, + "learning_rate": 9.403960255500356e-05, + "loss": 0.06877887845039368, + "step": 42010 + }, + { + "epoch": 
5.964513839602555, + "grad_norm": 2.2776553630828857, + "learning_rate": 9.403818310858766e-05, + "loss": 0.0712451994419098, + "step": 42020 + }, + { + "epoch": 5.965933286018453, + "grad_norm": 1.6378326416015625, + "learning_rate": 9.403676366217175e-05, + "loss": 0.03723881542682648, + "step": 42030 + }, + { + "epoch": 5.9673527324343505, + "grad_norm": 2.101365089416504, + "learning_rate": 9.403534421575586e-05, + "loss": 0.056721025705337526, + "step": 42040 + }, + { + "epoch": 5.968772178850248, + "grad_norm": 0.6855022311210632, + "learning_rate": 9.403392476933996e-05, + "loss": 0.08039049506187439, + "step": 42050 + }, + { + "epoch": 5.970191625266146, + "grad_norm": 4.207023620605469, + "learning_rate": 9.403250532292407e-05, + "loss": 0.10304387807846069, + "step": 42060 + }, + { + "epoch": 5.971611071682044, + "grad_norm": 9.549059867858887, + "learning_rate": 9.403108587650816e-05, + "loss": 0.05132551789283753, + "step": 42070 + }, + { + "epoch": 5.973030518097942, + "grad_norm": 3.993290662765503, + "learning_rate": 9.402966643009227e-05, + "loss": 0.0616031289100647, + "step": 42080 + }, + { + "epoch": 5.974449964513839, + "grad_norm": 7.922762393951416, + "learning_rate": 9.402824698367636e-05, + "loss": 0.06184162497520447, + "step": 42090 + }, + { + "epoch": 5.975869410929738, + "grad_norm": 0.07029788196086884, + "learning_rate": 9.402682753726048e-05, + "loss": 0.03704347312450409, + "step": 42100 + }, + { + "epoch": 5.977288857345635, + "grad_norm": 0.35138562321662903, + "learning_rate": 9.402540809084459e-05, + "loss": 0.016092486679553986, + "step": 42110 + }, + { + "epoch": 5.9787083037615325, + "grad_norm": 0.23122110962867737, + "learning_rate": 9.402398864442867e-05, + "loss": 0.030700623989105225, + "step": 42120 + }, + { + "epoch": 5.980127750177431, + "grad_norm": 9.871964454650879, + "learning_rate": 9.402256919801278e-05, + "loss": 0.05316352844238281, + "step": 42130 + }, + { + "epoch": 5.981547196593328, + "grad_norm": 
8.557596206665039, + "learning_rate": 9.402114975159688e-05, + "loss": 0.08352534174919128, + "step": 42140 + }, + { + "epoch": 5.9829666430092265, + "grad_norm": 0.055390872061252594, + "learning_rate": 9.401973030518099e-05, + "loss": 0.021974930167198183, + "step": 42150 + }, + { + "epoch": 5.984386089425124, + "grad_norm": 0.12106958776712418, + "learning_rate": 9.401831085876509e-05, + "loss": 0.09234058260917663, + "step": 42160 + }, + { + "epoch": 5.985805535841022, + "grad_norm": 1.0915229320526123, + "learning_rate": 9.401689141234918e-05, + "loss": 0.039049354195594785, + "step": 42170 + }, + { + "epoch": 5.98722498225692, + "grad_norm": 1.639233112335205, + "learning_rate": 9.401547196593328e-05, + "loss": 0.07456170916557311, + "step": 42180 + }, + { + "epoch": 5.988644428672818, + "grad_norm": 0.9587175846099854, + "learning_rate": 9.401405251951739e-05, + "loss": 0.06777811646461487, + "step": 42190 + }, + { + "epoch": 5.990063875088715, + "grad_norm": 4.586569309234619, + "learning_rate": 9.40126330731015e-05, + "loss": 0.0436800479888916, + "step": 42200 + }, + { + "epoch": 5.991483321504613, + "grad_norm": 0.17483319342136383, + "learning_rate": 9.40112136266856e-05, + "loss": 0.10744675397872924, + "step": 42210 + }, + { + "epoch": 5.992902767920511, + "grad_norm": 0.6297892332077026, + "learning_rate": 9.40097941802697e-05, + "loss": 0.016437722742557524, + "step": 42220 + }, + { + "epoch": 5.9943222143364085, + "grad_norm": 10.479232788085938, + "learning_rate": 9.40083747338538e-05, + "loss": 0.04590970277786255, + "step": 42230 + }, + { + "epoch": 5.995741660752307, + "grad_norm": 0.8099613189697266, + "learning_rate": 9.40069552874379e-05, + "loss": 0.04706493616104126, + "step": 42240 + }, + { + "epoch": 5.997161107168204, + "grad_norm": 0.6547835469245911, + "learning_rate": 9.4005535841022e-05, + "loss": 0.04878454804420471, + "step": 42250 + }, + { + "epoch": 5.998580553584103, + "grad_norm": 8.429951667785645, + "learning_rate": 
9.400411639460611e-05, + "loss": 0.03529610633850098, + "step": 42260 + }, + { + "epoch": 6.0, + "grad_norm": 1.0936342477798462, + "learning_rate": 9.40026969481902e-05, + "loss": 0.058073770999908444, + "step": 42270 + }, + { + "epoch": 6.001419446415897, + "grad_norm": 10.419304847717285, + "learning_rate": 9.400127750177431e-05, + "loss": 0.04838542342185974, + "step": 42280 + }, + { + "epoch": 6.002838892831796, + "grad_norm": 1.3970303535461426, + "learning_rate": 9.399985805535842e-05, + "loss": 0.027141714096069337, + "step": 42290 + }, + { + "epoch": 6.004258339247693, + "grad_norm": 0.9260740280151367, + "learning_rate": 9.399843860894252e-05, + "loss": 0.05527122020721435, + "step": 42300 + }, + { + "epoch": 6.0056777856635915, + "grad_norm": 3.39192533493042, + "learning_rate": 9.399701916252663e-05, + "loss": 0.08901798725128174, + "step": 42310 + }, + { + "epoch": 6.007097232079489, + "grad_norm": 2.0796215534210205, + "learning_rate": 9.399559971611071e-05, + "loss": 0.0270698219537735, + "step": 42320 + }, + { + "epoch": 6.008516678495387, + "grad_norm": 2.704911708831787, + "learning_rate": 9.399418026969482e-05, + "loss": 0.021138927340507506, + "step": 42330 + }, + { + "epoch": 6.009936124911285, + "grad_norm": 0.3152889609336853, + "learning_rate": 9.399276082327892e-05, + "loss": 0.02813498079776764, + "step": 42340 + }, + { + "epoch": 6.011355571327182, + "grad_norm": 0.4241684675216675, + "learning_rate": 9.399134137686303e-05, + "loss": 0.030684193968772887, + "step": 42350 + }, + { + "epoch": 6.01277501774308, + "grad_norm": 5.618993759155273, + "learning_rate": 9.398992193044713e-05, + "loss": 0.035936284065246585, + "step": 42360 + }, + { + "epoch": 6.014194464158978, + "grad_norm": 2.3576912879943848, + "learning_rate": 9.398850248403124e-05, + "loss": 0.021441501379013062, + "step": 42370 + }, + { + "epoch": 6.015613910574876, + "grad_norm": 1.1104201078414917, + "learning_rate": 9.398708303761534e-05, + "loss": 0.03485158383846283, + 
"step": 42380 + }, + { + "epoch": 6.0170333569907735, + "grad_norm": 0.14437326788902283, + "learning_rate": 9.398566359119943e-05, + "loss": 0.0569730281829834, + "step": 42390 + }, + { + "epoch": 6.018452803406672, + "grad_norm": 3.9557926654815674, + "learning_rate": 9.398424414478355e-05, + "loss": 0.04000087082386017, + "step": 42400 + }, + { + "epoch": 6.019872249822569, + "grad_norm": 0.8551321029663086, + "learning_rate": 9.398282469836764e-05, + "loss": 0.02508898377418518, + "step": 42410 + }, + { + "epoch": 6.021291696238467, + "grad_norm": 0.5500660538673401, + "learning_rate": 9.398140525195175e-05, + "loss": 0.040706342458724974, + "step": 42420 + }, + { + "epoch": 6.022711142654365, + "grad_norm": 1.740173101425171, + "learning_rate": 9.397998580553584e-05, + "loss": 0.05620036721229553, + "step": 42430 + }, + { + "epoch": 6.024130589070262, + "grad_norm": 0.16710884869098663, + "learning_rate": 9.397856635911995e-05, + "loss": 0.00892709344625473, + "step": 42440 + }, + { + "epoch": 6.025550035486161, + "grad_norm": 0.6740273237228394, + "learning_rate": 9.397714691270405e-05, + "loss": 0.06913689970970154, + "step": 42450 + }, + { + "epoch": 6.026969481902058, + "grad_norm": 0.17985455691814423, + "learning_rate": 9.397572746628816e-05, + "loss": 0.03412851691246033, + "step": 42460 + }, + { + "epoch": 6.028388928317956, + "grad_norm": 8.147322654724121, + "learning_rate": 9.397430801987225e-05, + "loss": 0.03205571174621582, + "step": 42470 + }, + { + "epoch": 6.029808374733854, + "grad_norm": 0.2391805201768875, + "learning_rate": 9.397288857345635e-05, + "loss": 0.03885909616947174, + "step": 42480 + }, + { + "epoch": 6.031227821149751, + "grad_norm": 2.3706560134887695, + "learning_rate": 9.397146912704046e-05, + "loss": 0.059897488355636595, + "step": 42490 + }, + { + "epoch": 6.0326472675656495, + "grad_norm": 1.3437626361846924, + "learning_rate": 9.397004968062456e-05, + "loss": 0.026122891902923585, + "step": 42500 + }, + { + "epoch": 
6.0326472675656495, + "eval_accuracy": 0.9801615056908501, + "eval_loss": 0.06414638459682465, + "eval_runtime": 32.6309, + "eval_samples_per_second": 481.966, + "eval_steps_per_second": 15.078, + "step": 42500 + }, + { + "epoch": 6.034066713981547, + "grad_norm": 4.8015360832214355, + "learning_rate": 9.396863023420867e-05, + "loss": 0.032011619210243224, + "step": 42510 + }, + { + "epoch": 6.035486160397445, + "grad_norm": 4.332185745239258, + "learning_rate": 9.396721078779277e-05, + "loss": 0.056959223747253415, + "step": 42520 + }, + { + "epoch": 6.036905606813343, + "grad_norm": 0.11899875849485397, + "learning_rate": 9.396579134137687e-05, + "loss": 0.02158723771572113, + "step": 42530 + }, + { + "epoch": 6.038325053229241, + "grad_norm": 6.497653961181641, + "learning_rate": 9.396437189496096e-05, + "loss": 0.025063958764076234, + "step": 42540 + }, + { + "epoch": 6.039744499645138, + "grad_norm": 10.640429496765137, + "learning_rate": 9.396295244854507e-05, + "loss": 0.07477115392684937, + "step": 42550 + }, + { + "epoch": 6.041163946061036, + "grad_norm": 3.3736305236816406, + "learning_rate": 9.396153300212917e-05, + "loss": 0.028403592109680176, + "step": 42560 + }, + { + "epoch": 6.042583392476934, + "grad_norm": 0.13938064873218536, + "learning_rate": 9.396011355571328e-05, + "loss": 0.0490555077791214, + "step": 42570 + }, + { + "epoch": 6.0440028388928315, + "grad_norm": 3.9250857830047607, + "learning_rate": 9.395869410929738e-05, + "loss": 0.02079556733369827, + "step": 42580 + }, + { + "epoch": 6.04542228530873, + "grad_norm": 8.848977088928223, + "learning_rate": 9.395727466288148e-05, + "loss": 0.04536510109901428, + "step": 42590 + }, + { + "epoch": 6.046841731724627, + "grad_norm": 2.9003498554229736, + "learning_rate": 9.395585521646559e-05, + "loss": 0.06284236311912536, + "step": 42600 + }, + { + "epoch": 6.0482611781405256, + "grad_norm": 2.06571626663208, + "learning_rate": 9.395443577004969e-05, + "loss": 0.10079717636108398, + "step": 
42610 + }, + { + "epoch": 6.049680624556423, + "grad_norm": 1.5646941661834717, + "learning_rate": 9.39530163236338e-05, + "loss": 0.0747799277305603, + "step": 42620 + }, + { + "epoch": 6.05110007097232, + "grad_norm": 4.782958030700684, + "learning_rate": 9.395159687721788e-05, + "loss": 0.10376496315002441, + "step": 42630 + }, + { + "epoch": 6.052519517388219, + "grad_norm": 0.5791422128677368, + "learning_rate": 9.395017743080199e-05, + "loss": 0.059558308124542235, + "step": 42640 + }, + { + "epoch": 6.053938963804116, + "grad_norm": 6.751038551330566, + "learning_rate": 9.394875798438609e-05, + "loss": 0.05839126706123352, + "step": 42650 + }, + { + "epoch": 6.055358410220014, + "grad_norm": 7.07871675491333, + "learning_rate": 9.39473385379702e-05, + "loss": 0.059094560146331784, + "step": 42660 + }, + { + "epoch": 6.056777856635912, + "grad_norm": 1.6905990839004517, + "learning_rate": 9.39459190915543e-05, + "loss": 0.015537199378013612, + "step": 42670 + }, + { + "epoch": 6.05819730305181, + "grad_norm": 6.017456531524658, + "learning_rate": 9.39444996451384e-05, + "loss": 0.08799818754196168, + "step": 42680 + }, + { + "epoch": 6.059616749467708, + "grad_norm": 4.168159008026123, + "learning_rate": 9.39430801987225e-05, + "loss": 0.035238003730773924, + "step": 42690 + }, + { + "epoch": 6.061036195883605, + "grad_norm": 3.4934043884277344, + "learning_rate": 9.39416607523066e-05, + "loss": 0.04012168049812317, + "step": 42700 + }, + { + "epoch": 6.062455642299503, + "grad_norm": 0.6561540365219116, + "learning_rate": 9.394024130589071e-05, + "loss": 0.03983815610408783, + "step": 42710 + }, + { + "epoch": 6.063875088715401, + "grad_norm": 5.77907133102417, + "learning_rate": 9.393882185947481e-05, + "loss": 0.0537925660610199, + "step": 42720 + }, + { + "epoch": 6.065294535131299, + "grad_norm": 0.7920040488243103, + "learning_rate": 9.393740241305892e-05, + "loss": 0.040977182984352115, + "step": 42730 + }, + { + "epoch": 6.066713981547196, + 
"grad_norm": 0.8078078031539917, + "learning_rate": 9.3935982966643e-05, + "loss": 0.02283947616815567, + "step": 42740 + }, + { + "epoch": 6.068133427963095, + "grad_norm": 6.577816963195801, + "learning_rate": 9.393456352022712e-05, + "loss": 0.048980104923248294, + "step": 42750 + }, + { + "epoch": 6.069552874378992, + "grad_norm": 2.022977113723755, + "learning_rate": 9.393314407381121e-05, + "loss": 0.01717734932899475, + "step": 42760 + }, + { + "epoch": 6.07097232079489, + "grad_norm": 3.7534005641937256, + "learning_rate": 9.393172462739532e-05, + "loss": 0.034478670358657836, + "step": 42770 + }, + { + "epoch": 6.072391767210788, + "grad_norm": 0.1850811243057251, + "learning_rate": 9.393030518097942e-05, + "loss": 0.0779535412788391, + "step": 42780 + }, + { + "epoch": 6.073811213626685, + "grad_norm": 2.3731038570404053, + "learning_rate": 9.392888573456352e-05, + "loss": 0.054844236373901366, + "step": 42790 + }, + { + "epoch": 6.075230660042584, + "grad_norm": 2.8397276401519775, + "learning_rate": 9.392746628814763e-05, + "loss": 0.03636707365512848, + "step": 42800 + }, + { + "epoch": 6.076650106458481, + "grad_norm": 1.0015398263931274, + "learning_rate": 9.392604684173173e-05, + "loss": 0.02905575931072235, + "step": 42810 + }, + { + "epoch": 6.078069552874379, + "grad_norm": 4.600225448608398, + "learning_rate": 9.392462739531584e-05, + "loss": 0.044324475526809695, + "step": 42820 + }, + { + "epoch": 6.079488999290277, + "grad_norm": 0.05996633321046829, + "learning_rate": 9.392320794889994e-05, + "loss": 0.022943411767482758, + "step": 42830 + }, + { + "epoch": 6.080908445706174, + "grad_norm": 5.939465522766113, + "learning_rate": 9.392178850248403e-05, + "loss": 0.032868221402168274, + "step": 42840 + }, + { + "epoch": 6.0823278921220725, + "grad_norm": 1.8107235431671143, + "learning_rate": 9.392036905606813e-05, + "loss": 0.04718858897686005, + "step": 42850 + }, + { + "epoch": 6.08374733853797, + "grad_norm": 7.665246486663818, + 
"learning_rate": 9.391894960965224e-05, + "loss": 0.056220120191574095, + "step": 42860 + }, + { + "epoch": 6.085166784953868, + "grad_norm": 2.0348501205444336, + "learning_rate": 9.391753016323634e-05, + "loss": 0.06356436610221863, + "step": 42870 + }, + { + "epoch": 6.086586231369766, + "grad_norm": 6.74163293838501, + "learning_rate": 9.391611071682045e-05, + "loss": 0.027032476663589478, + "step": 42880 + }, + { + "epoch": 6.088005677785664, + "grad_norm": 9.711258888244629, + "learning_rate": 9.391469127040455e-05, + "loss": 0.02918843924999237, + "step": 42890 + }, + { + "epoch": 6.089425124201561, + "grad_norm": 1.9372355937957764, + "learning_rate": 9.391327182398864e-05, + "loss": 0.027305248379707336, + "step": 42900 + }, + { + "epoch": 6.090844570617459, + "grad_norm": 3.3903517723083496, + "learning_rate": 9.391185237757276e-05, + "loss": 0.035347151756286624, + "step": 42910 + }, + { + "epoch": 6.092264017033357, + "grad_norm": 2.7463488578796387, + "learning_rate": 9.391043293115685e-05, + "loss": 0.06535101532936097, + "step": 42920 + }, + { + "epoch": 6.0936834634492545, + "grad_norm": 6.837551593780518, + "learning_rate": 9.390901348474096e-05, + "loss": 0.08181880712509156, + "step": 42930 + }, + { + "epoch": 6.095102909865153, + "grad_norm": 0.08821488916873932, + "learning_rate": 9.390759403832505e-05, + "loss": 0.07695122361183167, + "step": 42940 + }, + { + "epoch": 6.09652235628105, + "grad_norm": 2.108302593231201, + "learning_rate": 9.390617459190916e-05, + "loss": 0.053022068738937375, + "step": 42950 + }, + { + "epoch": 6.0979418026969485, + "grad_norm": 1.4591866731643677, + "learning_rate": 9.390475514549326e-05, + "loss": 0.029264354705810548, + "step": 42960 + }, + { + "epoch": 6.099361249112846, + "grad_norm": 0.7254082560539246, + "learning_rate": 9.390333569907737e-05, + "loss": 0.023519128561019897, + "step": 42970 + }, + { + "epoch": 6.100780695528743, + "grad_norm": 3.487905740737915, + "learning_rate": 9.390191625266146e-05, 
+ "loss": 0.025741568207740782, + "step": 42980 + }, + { + "epoch": 6.102200141944642, + "grad_norm": 2.3520405292510986, + "learning_rate": 9.390049680624556e-05, + "loss": 0.03384661078453064, + "step": 42990 + }, + { + "epoch": 6.103619588360539, + "grad_norm": 12.140244483947754, + "learning_rate": 9.389907735982967e-05, + "loss": 0.047564372420310974, + "step": 43000 + }, + { + "epoch": 6.103619588360539, + "eval_accuracy": 0.9731671647485216, + "eval_loss": 0.10344529151916504, + "eval_runtime": 32.822, + "eval_samples_per_second": 479.161, + "eval_steps_per_second": 14.99, + "step": 43000 + }, + { + "epoch": 6.105039034776437, + "grad_norm": 9.412208557128906, + "learning_rate": 9.389779985805536e-05, + "loss": 0.08669753670692444, + "step": 43010 + }, + { + "epoch": 6.106458481192335, + "grad_norm": 7.122009754180908, + "learning_rate": 9.389638041163946e-05, + "loss": 0.0934341549873352, + "step": 43020 + }, + { + "epoch": 6.107877927608233, + "grad_norm": 1.6225032806396484, + "learning_rate": 9.389496096522357e-05, + "loss": 0.05215970277786255, + "step": 43030 + }, + { + "epoch": 6.1092973740241305, + "grad_norm": 6.039062023162842, + "learning_rate": 9.389354151880768e-05, + "loss": 0.08043327331542968, + "step": 43040 + }, + { + "epoch": 6.110716820440028, + "grad_norm": 0.20308256149291992, + "learning_rate": 9.389212207239177e-05, + "loss": 0.04331388473510742, + "step": 43050 + }, + { + "epoch": 6.112136266855926, + "grad_norm": 4.1182990074157715, + "learning_rate": 9.389070262597589e-05, + "loss": 0.09296801090240478, + "step": 43060 + }, + { + "epoch": 6.113555713271824, + "grad_norm": 2.675135374069214, + "learning_rate": 9.388928317955997e-05, + "loss": 0.04550227820873261, + "step": 43070 + }, + { + "epoch": 6.114975159687722, + "grad_norm": 6.068324565887451, + "learning_rate": 9.388786373314408e-05, + "loss": 0.06599665880203247, + "step": 43080 + }, + { + "epoch": 6.116394606103619, + "grad_norm": 1.9616451263427734, + "learning_rate": 
9.388644428672818e-05, + "loss": 0.07046244740486145, + "step": 43090 + }, + { + "epoch": 6.117814052519518, + "grad_norm": 4.773113250732422, + "learning_rate": 9.388502484031229e-05, + "loss": 0.05360671281814575, + "step": 43100 + }, + { + "epoch": 6.119233498935415, + "grad_norm": 0.3193398714065552, + "learning_rate": 9.388360539389639e-05, + "loss": 0.08010044693946838, + "step": 43110 + }, + { + "epoch": 6.120652945351313, + "grad_norm": 9.674460411071777, + "learning_rate": 9.388218594748048e-05, + "loss": 0.04749388694763183, + "step": 43120 + }, + { + "epoch": 6.122072391767211, + "grad_norm": 1.1620126962661743, + "learning_rate": 9.38807665010646e-05, + "loss": 0.033527106046676636, + "step": 43130 + }, + { + "epoch": 6.123491838183108, + "grad_norm": 1.2030847072601318, + "learning_rate": 9.387934705464869e-05, + "loss": 0.03352363109588623, + "step": 43140 + }, + { + "epoch": 6.124911284599007, + "grad_norm": 1.329487681388855, + "learning_rate": 9.38779276082328e-05, + "loss": 0.03268220722675323, + "step": 43150 + }, + { + "epoch": 6.126330731014904, + "grad_norm": 0.743346631526947, + "learning_rate": 9.38765081618169e-05, + "loss": 0.02008904367685318, + "step": 43160 + }, + { + "epoch": 6.127750177430802, + "grad_norm": 7.962668418884277, + "learning_rate": 9.3875088715401e-05, + "loss": 0.09379298686981201, + "step": 43170 + }, + { + "epoch": 6.1291696238467, + "grad_norm": 0.15104877948760986, + "learning_rate": 9.38736692689851e-05, + "loss": 0.06707976460456848, + "step": 43180 + }, + { + "epoch": 6.130589070262598, + "grad_norm": 1.7654162645339966, + "learning_rate": 9.38722498225692e-05, + "loss": 0.04772307276725769, + "step": 43190 + }, + { + "epoch": 6.1320085166784954, + "grad_norm": 4.709306240081787, + "learning_rate": 9.38708303761533e-05, + "loss": 0.03134104907512665, + "step": 43200 + }, + { + "epoch": 6.133427963094393, + "grad_norm": 7.34123420715332, + "learning_rate": 9.386941092973741e-05, + "loss": 0.0683401346206665, + 
"step": 43210 + }, + { + "epoch": 6.134847409510291, + "grad_norm": 2.9818601608276367, + "learning_rate": 9.386799148332151e-05, + "loss": 0.037702041864395144, + "step": 43220 + }, + { + "epoch": 6.136266855926189, + "grad_norm": 2.6485135555267334, + "learning_rate": 9.386657203690561e-05, + "loss": 0.03553232550621033, + "step": 43230 + }, + { + "epoch": 6.137686302342087, + "grad_norm": 0.658879280090332, + "learning_rate": 9.386515259048972e-05, + "loss": 0.05370528697967529, + "step": 43240 + }, + { + "epoch": 6.139105748757984, + "grad_norm": 1.2858973741531372, + "learning_rate": 9.386373314407382e-05, + "loss": 0.08532284498214722, + "step": 43250 + }, + { + "epoch": 6.140525195173883, + "grad_norm": 1.1662542819976807, + "learning_rate": 9.386231369765793e-05, + "loss": 0.035893863439559935, + "step": 43260 + }, + { + "epoch": 6.14194464158978, + "grad_norm": 0.8113408088684082, + "learning_rate": 9.386089425124201e-05, + "loss": 0.05194441676139831, + "step": 43270 + }, + { + "epoch": 6.1433640880056775, + "grad_norm": 1.8177664279937744, + "learning_rate": 9.385947480482612e-05, + "loss": 0.06852318644523621, + "step": 43280 + }, + { + "epoch": 6.144783534421576, + "grad_norm": 10.182692527770996, + "learning_rate": 9.385805535841022e-05, + "loss": 0.06881612539291382, + "step": 43290 + }, + { + "epoch": 6.146202980837473, + "grad_norm": 0.47980985045433044, + "learning_rate": 9.385663591199433e-05, + "loss": 0.044000831246376035, + "step": 43300 + }, + { + "epoch": 6.1476224272533715, + "grad_norm": 0.38547614216804504, + "learning_rate": 9.385521646557843e-05, + "loss": 0.030921560525894166, + "step": 43310 + }, + { + "epoch": 6.149041873669269, + "grad_norm": 4.417194843292236, + "learning_rate": 9.385379701916253e-05, + "loss": 0.02149178385734558, + "step": 43320 + }, + { + "epoch": 6.150461320085167, + "grad_norm": 3.865617513656616, + "learning_rate": 9.385237757274664e-05, + "loss": 0.01396312564611435, + "step": 43330 + }, + { + "epoch": 
6.151880766501065, + "grad_norm": 0.28896984457969666, + "learning_rate": 9.385095812633073e-05, + "loss": 0.059472233057022095, + "step": 43340 + }, + { + "epoch": 6.153300212916962, + "grad_norm": 0.8453249931335449, + "learning_rate": 9.384953867991484e-05, + "loss": 0.059427005052566526, + "step": 43350 + }, + { + "epoch": 6.15471965933286, + "grad_norm": 0.9897168278694153, + "learning_rate": 9.384811923349894e-05, + "loss": 0.047054699063301085, + "step": 43360 + }, + { + "epoch": 6.156139105748758, + "grad_norm": 10.11900806427002, + "learning_rate": 9.384669978708304e-05, + "loss": 0.06199625730514526, + "step": 43370 + }, + { + "epoch": 6.157558552164656, + "grad_norm": 6.863821029663086, + "learning_rate": 9.384528034066714e-05, + "loss": 0.03126347362995148, + "step": 43380 + }, + { + "epoch": 6.1589779985805535, + "grad_norm": 0.7479419112205505, + "learning_rate": 9.384386089425125e-05, + "loss": 0.022752903401851654, + "step": 43390 + }, + { + "epoch": 6.160397444996452, + "grad_norm": 7.677117347717285, + "learning_rate": 9.384244144783535e-05, + "loss": 0.06328274011611938, + "step": 43400 + }, + { + "epoch": 6.161816891412349, + "grad_norm": 0.43821343779563904, + "learning_rate": 9.384102200141946e-05, + "loss": 0.011660891026258469, + "step": 43410 + }, + { + "epoch": 6.163236337828247, + "grad_norm": 0.12804394960403442, + "learning_rate": 9.383960255500355e-05, + "loss": 0.11909148693084717, + "step": 43420 + }, + { + "epoch": 6.164655784244145, + "grad_norm": 2.8437774181365967, + "learning_rate": 9.383818310858765e-05, + "loss": 0.0267734169960022, + "step": 43430 + }, + { + "epoch": 6.166075230660042, + "grad_norm": 0.17364199459552765, + "learning_rate": 9.383676366217176e-05, + "loss": 0.014441606402397156, + "step": 43440 + }, + { + "epoch": 6.167494677075941, + "grad_norm": 5.291993618011475, + "learning_rate": 9.383534421575586e-05, + "loss": 0.04625110328197479, + "step": 43450 + }, + { + "epoch": 6.168914123491838, + "grad_norm": 
1.9384088516235352, + "learning_rate": 9.383392476933997e-05, + "loss": 0.03442354500293732, + "step": 43460 + }, + { + "epoch": 6.170333569907736, + "grad_norm": 5.928829669952393, + "learning_rate": 9.383250532292407e-05, + "loss": 0.052574223279953, + "step": 43470 + }, + { + "epoch": 6.171753016323634, + "grad_norm": 0.3862343728542328, + "learning_rate": 9.383108587650816e-05, + "loss": 0.10290310382843018, + "step": 43480 + }, + { + "epoch": 6.173172462739531, + "grad_norm": 4.460153102874756, + "learning_rate": 9.382966643009226e-05, + "loss": 0.04883348345756531, + "step": 43490 + }, + { + "epoch": 6.1745919091554295, + "grad_norm": 0.46098002791404724, + "learning_rate": 9.382824698367637e-05, + "loss": 0.024615487456321715, + "step": 43500 + }, + { + "epoch": 6.1745919091554295, + "eval_accuracy": 0.9751382971959052, + "eval_loss": 0.07891522347927094, + "eval_runtime": 32.9149, + "eval_samples_per_second": 477.808, + "eval_steps_per_second": 14.948, + "step": 43500 + }, + { + "epoch": 6.176011355571327, + "grad_norm": 7.657423496246338, + "learning_rate": 9.382682753726047e-05, + "loss": 0.02250729650259018, + "step": 43510 + }, + { + "epoch": 6.177430801987225, + "grad_norm": 3.3920164108276367, + "learning_rate": 9.382540809084458e-05, + "loss": 0.05920064449310303, + "step": 43520 + }, + { + "epoch": 6.178850248403123, + "grad_norm": 5.01832914352417, + "learning_rate": 9.382398864442868e-05, + "loss": 0.06701637506484985, + "step": 43530 + }, + { + "epoch": 6.180269694819021, + "grad_norm": 13.015891075134277, + "learning_rate": 9.382256919801278e-05, + "loss": 0.07436400651931763, + "step": 43540 + }, + { + "epoch": 6.181689141234918, + "grad_norm": 0.7317226529121399, + "learning_rate": 9.382114975159689e-05, + "loss": 0.0654987096786499, + "step": 43550 + }, + { + "epoch": 6.183108587650816, + "grad_norm": 2.2464566230773926, + "learning_rate": 9.381973030518098e-05, + "loss": 0.056348496675491334, + "step": 43560 + }, + { + "epoch": 
6.184528034066714, + "grad_norm": 3.364604949951172, + "learning_rate": 9.38183108587651e-05, + "loss": 0.054125458002090454, + "step": 43570 + }, + { + "epoch": 6.185947480482612, + "grad_norm": 0.46684959530830383, + "learning_rate": 9.381689141234918e-05, + "loss": 0.06286740899085999, + "step": 43580 + }, + { + "epoch": 6.18736692689851, + "grad_norm": 1.326682209968567, + "learning_rate": 9.381547196593329e-05, + "loss": 0.09077779650688171, + "step": 43590 + }, + { + "epoch": 6.188786373314407, + "grad_norm": 5.395484447479248, + "learning_rate": 9.381405251951739e-05, + "loss": 0.0627810776233673, + "step": 43600 + }, + { + "epoch": 6.190205819730306, + "grad_norm": 0.3197677433490753, + "learning_rate": 9.38126330731015e-05, + "loss": 0.07363017201423645, + "step": 43610 + }, + { + "epoch": 6.191625266146203, + "grad_norm": 0.048317801207304, + "learning_rate": 9.38112136266856e-05, + "loss": 0.04617903530597687, + "step": 43620 + }, + { + "epoch": 6.1930447125621, + "grad_norm": 0.30027082562446594, + "learning_rate": 9.38097941802697e-05, + "loss": 0.04771397709846496, + "step": 43630 + }, + { + "epoch": 6.194464158977999, + "grad_norm": 1.9492099285125732, + "learning_rate": 9.38083747338538e-05, + "loss": 0.05550530552864075, + "step": 43640 + }, + { + "epoch": 6.195883605393896, + "grad_norm": 0.49073126912117004, + "learning_rate": 9.38069552874379e-05, + "loss": 0.03370376825332642, + "step": 43650 + }, + { + "epoch": 6.1973030518097945, + "grad_norm": 4.336076736450195, + "learning_rate": 9.380553584102201e-05, + "loss": 0.04263518452644348, + "step": 43660 + }, + { + "epoch": 6.198722498225692, + "grad_norm": 1.0135289430618286, + "learning_rate": 9.380411639460611e-05, + "loss": 0.010483792424201966, + "step": 43670 + }, + { + "epoch": 6.20014194464159, + "grad_norm": 0.6504700183868408, + "learning_rate": 9.380269694819021e-05, + "loss": 0.10248892307281494, + "step": 43680 + }, + { + "epoch": 6.201561391057488, + "grad_norm": 0.7889754176139832, 
+ "learning_rate": 9.38012775017743e-05, + "loss": 0.02525465190410614, + "step": 43690 + }, + { + "epoch": 6.202980837473385, + "grad_norm": 11.922003746032715, + "learning_rate": 9.379985805535842e-05, + "loss": 0.12885262966156005, + "step": 43700 + }, + { + "epoch": 6.204400283889283, + "grad_norm": 8.441861152648926, + "learning_rate": 9.379843860894251e-05, + "loss": 0.062443208694458005, + "step": 43710 + }, + { + "epoch": 6.205819730305181, + "grad_norm": 0.432558536529541, + "learning_rate": 9.379701916252662e-05, + "loss": 0.016783684492111206, + "step": 43720 + }, + { + "epoch": 6.207239176721079, + "grad_norm": 3.7388827800750732, + "learning_rate": 9.379559971611072e-05, + "loss": 0.04627739787101746, + "step": 43730 + }, + { + "epoch": 6.2086586231369765, + "grad_norm": 5.352664470672607, + "learning_rate": 9.379418026969482e-05, + "loss": 0.04351229965686798, + "step": 43740 + }, + { + "epoch": 6.210078069552875, + "grad_norm": 0.5855856537818909, + "learning_rate": 9.379276082327893e-05, + "loss": 0.05773522257804871, + "step": 43750 + }, + { + "epoch": 6.211497515968772, + "grad_norm": 0.16541746258735657, + "learning_rate": 9.379134137686303e-05, + "loss": 0.04460527896881104, + "step": 43760 + }, + { + "epoch": 6.21291696238467, + "grad_norm": 0.3547366261482239, + "learning_rate": 9.378992193044714e-05, + "loss": 0.1184334397315979, + "step": 43770 + }, + { + "epoch": 6.214336408800568, + "grad_norm": 2.3118815422058105, + "learning_rate": 9.378850248403124e-05, + "loss": 0.05782003998756409, + "step": 43780 + }, + { + "epoch": 6.215755855216465, + "grad_norm": 0.15461857616901398, + "learning_rate": 9.378708303761533e-05, + "loss": 0.07659928202629089, + "step": 43790 + }, + { + "epoch": 6.217175301632364, + "grad_norm": 2.9949324131011963, + "learning_rate": 9.378566359119943e-05, + "loss": 0.06376568078994752, + "step": 43800 + }, + { + "epoch": 6.218594748048261, + "grad_norm": 0.9916458129882812, + "learning_rate": 9.378424414478354e-05, + 
"loss": 0.018292531371116638, + "step": 43810 + }, + { + "epoch": 6.220014194464159, + "grad_norm": 0.48116499185562134, + "learning_rate": 9.378282469836764e-05, + "loss": 0.024942028522491454, + "step": 43820 + }, + { + "epoch": 6.221433640880057, + "grad_norm": 1.3514341115951538, + "learning_rate": 9.378140525195175e-05, + "loss": 0.09017609357833863, + "step": 43830 + }, + { + "epoch": 6.222853087295954, + "grad_norm": 1.7869921922683716, + "learning_rate": 9.377998580553585e-05, + "loss": 0.06085496544837952, + "step": 43840 + }, + { + "epoch": 6.2242725337118525, + "grad_norm": 1.120748519897461, + "learning_rate": 9.377856635911994e-05, + "loss": 0.08365220427513123, + "step": 43850 + }, + { + "epoch": 6.22569198012775, + "grad_norm": 4.36846399307251, + "learning_rate": 9.377714691270405e-05, + "loss": 0.03777306079864502, + "step": 43860 + }, + { + "epoch": 6.227111426543648, + "grad_norm": 2.9369184970855713, + "learning_rate": 9.377572746628815e-05, + "loss": 0.08006559610366822, + "step": 43870 + }, + { + "epoch": 6.228530872959546, + "grad_norm": 0.3148958384990692, + "learning_rate": 9.377430801987226e-05, + "loss": 0.0385653018951416, + "step": 43880 + }, + { + "epoch": 6.229950319375444, + "grad_norm": 1.3210095167160034, + "learning_rate": 9.377288857345635e-05, + "loss": 0.033772128820419314, + "step": 43890 + }, + { + "epoch": 6.231369765791341, + "grad_norm": 2.804184675216675, + "learning_rate": 9.377146912704046e-05, + "loss": 0.0412954181432724, + "step": 43900 + }, + { + "epoch": 6.232789212207239, + "grad_norm": 0.18998447060585022, + "learning_rate": 9.377004968062456e-05, + "loss": 0.020071253180503845, + "step": 43910 + }, + { + "epoch": 6.234208658623137, + "grad_norm": 0.3029160499572754, + "learning_rate": 9.376863023420867e-05, + "loss": 0.06371122598648071, + "step": 43920 + }, + { + "epoch": 6.2356281050390345, + "grad_norm": 9.19083023071289, + "learning_rate": 9.376721078779276e-05, + "loss": 0.03808676302433014, + "step": 43930 
+ }, + { + "epoch": 6.237047551454933, + "grad_norm": 0.2272202968597412, + "learning_rate": 9.376579134137686e-05, + "loss": 0.04455348253250122, + "step": 43940 + }, + { + "epoch": 6.23846699787083, + "grad_norm": 0.23112483322620392, + "learning_rate": 9.376437189496097e-05, + "loss": 0.01939200460910797, + "step": 43950 + }, + { + "epoch": 6.239886444286729, + "grad_norm": 0.1665191948413849, + "learning_rate": 9.376295244854507e-05, + "loss": 0.03822802007198334, + "step": 43960 + }, + { + "epoch": 6.241305890702626, + "grad_norm": 11.432101249694824, + "learning_rate": 9.376153300212918e-05, + "loss": 0.10485298633575439, + "step": 43970 + }, + { + "epoch": 6.242725337118523, + "grad_norm": 1.8148565292358398, + "learning_rate": 9.376011355571328e-05, + "loss": 0.09240283370018006, + "step": 43980 + }, + { + "epoch": 6.244144783534422, + "grad_norm": 0.7837454676628113, + "learning_rate": 9.375869410929738e-05, + "loss": 0.05253263115882874, + "step": 43990 + }, + { + "epoch": 6.245564229950319, + "grad_norm": 5.624810695648193, + "learning_rate": 9.375727466288147e-05, + "loss": 0.049315616488456726, + "step": 44000 + }, + { + "epoch": 6.245564229950319, + "eval_accuracy": 0.9733579195014942, + "eval_loss": 0.0813438892364502, + "eval_runtime": 32.0503, + "eval_samples_per_second": 490.697, + "eval_steps_per_second": 15.351, + "step": 44000 + }, + { + "epoch": 6.246983676366217, + "grad_norm": 5.86858606338501, + "learning_rate": 9.375585521646558e-05, + "loss": 0.029601067304611206, + "step": 44010 + }, + { + "epoch": 6.248403122782115, + "grad_norm": 1.0253491401672363, + "learning_rate": 9.375443577004968e-05, + "loss": 0.05629914402961731, + "step": 44020 + }, + { + "epoch": 6.249822569198013, + "grad_norm": 0.475136399269104, + "learning_rate": 9.375301632363379e-05, + "loss": 0.038224822282791136, + "step": 44030 + }, + { + "epoch": 6.251242015613911, + "grad_norm": 10.579747200012207, + "learning_rate": 9.375159687721789e-05, + "loss": 
0.08462954759597778, + "step": 44040 + }, + { + "epoch": 6.252661462029808, + "grad_norm": 1.7066859006881714, + "learning_rate": 9.375017743080199e-05, + "loss": 0.07730778455734252, + "step": 44050 + }, + { + "epoch": 6.254080908445706, + "grad_norm": 0.373104065656662, + "learning_rate": 9.37487579843861e-05, + "loss": 0.04493587613105774, + "step": 44060 + }, + { + "epoch": 6.255500354861604, + "grad_norm": 2.209771156311035, + "learning_rate": 9.37473385379702e-05, + "loss": 0.031600701808929446, + "step": 44070 + }, + { + "epoch": 6.256919801277502, + "grad_norm": 1.3299434185028076, + "learning_rate": 9.37459190915543e-05, + "loss": 0.04354170858860016, + "step": 44080 + }, + { + "epoch": 6.258339247693399, + "grad_norm": 3.383254289627075, + "learning_rate": 9.374449964513839e-05, + "loss": 0.03865576386451721, + "step": 44090 + }, + { + "epoch": 6.259758694109298, + "grad_norm": 0.8928094506263733, + "learning_rate": 9.37430801987225e-05, + "loss": 0.11763886213302613, + "step": 44100 + }, + { + "epoch": 6.261178140525195, + "grad_norm": 0.7259202003479004, + "learning_rate": 9.37416607523066e-05, + "loss": 0.07595643401145935, + "step": 44110 + }, + { + "epoch": 6.262597586941093, + "grad_norm": 10.287557601928711, + "learning_rate": 9.374024130589071e-05, + "loss": 0.026817291975021362, + "step": 44120 + }, + { + "epoch": 6.264017033356991, + "grad_norm": 6.837195873260498, + "learning_rate": 9.37388218594748e-05, + "loss": 0.045735102891921994, + "step": 44130 + }, + { + "epoch": 6.265436479772888, + "grad_norm": 2.9552390575408936, + "learning_rate": 9.373740241305892e-05, + "loss": 0.06823272109031678, + "step": 44140 + }, + { + "epoch": 6.266855926188787, + "grad_norm": 7.999316692352295, + "learning_rate": 9.373598296664301e-05, + "loss": 0.08050004243850709, + "step": 44150 + }, + { + "epoch": 6.268275372604684, + "grad_norm": 6.604404926300049, + "learning_rate": 9.373456352022711e-05, + "loss": 0.04037375450134277, + "step": 44160 + }, + { + 
"epoch": 6.269694819020582, + "grad_norm": 0.303448885679245, + "learning_rate": 9.373314407381122e-05, + "loss": 0.06240311861038208, + "step": 44170 + }, + { + "epoch": 6.27111426543648, + "grad_norm": 0.028919706121087074, + "learning_rate": 9.373172462739532e-05, + "loss": 0.01785750240087509, + "step": 44180 + }, + { + "epoch": 6.272533711852377, + "grad_norm": 2.20166277885437, + "learning_rate": 9.373030518097943e-05, + "loss": 0.06402125954627991, + "step": 44190 + }, + { + "epoch": 6.2739531582682755, + "grad_norm": 9.297113418579102, + "learning_rate": 9.372888573456351e-05, + "loss": 0.031133198738098146, + "step": 44200 + }, + { + "epoch": 6.275372604684173, + "grad_norm": 2.6971702575683594, + "learning_rate": 9.372746628814763e-05, + "loss": 0.02306177020072937, + "step": 44210 + }, + { + "epoch": 6.276792051100071, + "grad_norm": 3.3070108890533447, + "learning_rate": 9.372604684173172e-05, + "loss": 0.08548479676246643, + "step": 44220 + }, + { + "epoch": 6.278211497515969, + "grad_norm": 7.981506824493408, + "learning_rate": 9.372462739531583e-05, + "loss": 0.013684302568435669, + "step": 44230 + }, + { + "epoch": 6.279630943931867, + "grad_norm": 8.1182222366333, + "learning_rate": 9.372320794889993e-05, + "loss": 0.03548404574394226, + "step": 44240 + }, + { + "epoch": 6.281050390347764, + "grad_norm": 0.4073677957057953, + "learning_rate": 9.372178850248403e-05, + "loss": 0.07523361444473267, + "step": 44250 + }, + { + "epoch": 6.282469836763662, + "grad_norm": 1.4130693674087524, + "learning_rate": 9.372036905606814e-05, + "loss": 0.06159330606460571, + "step": 44260 + }, + { + "epoch": 6.28388928317956, + "grad_norm": 0.19895227253437042, + "learning_rate": 9.371894960965224e-05, + "loss": 0.03507781326770783, + "step": 44270 + }, + { + "epoch": 6.2853087295954575, + "grad_norm": 0.9761980175971985, + "learning_rate": 9.371753016323635e-05, + "loss": 0.04177262783050537, + "step": 44280 + }, + { + "epoch": 6.286728176011356, + "grad_norm": 
1.4593634605407715, + "learning_rate": 9.371611071682045e-05, + "loss": 0.02598634660243988, + "step": 44290 + }, + { + "epoch": 6.288147622427253, + "grad_norm": 0.16853317618370056, + "learning_rate": 9.371469127040454e-05, + "loss": 0.030664128065109254, + "step": 44300 + }, + { + "epoch": 6.2895670688431515, + "grad_norm": 5.152888774871826, + "learning_rate": 9.371327182398864e-05, + "loss": 0.026159942150115967, + "step": 44310 + }, + { + "epoch": 6.290986515259049, + "grad_norm": 0.22693578898906708, + "learning_rate": 9.371185237757275e-05, + "loss": 0.011505614221096038, + "step": 44320 + }, + { + "epoch": 6.292405961674946, + "grad_norm": 0.12040051072835922, + "learning_rate": 9.371043293115685e-05, + "loss": 0.063713937997818, + "step": 44330 + }, + { + "epoch": 6.293825408090845, + "grad_norm": 8.595141410827637, + "learning_rate": 9.370901348474096e-05, + "loss": 0.052627182006835936, + "step": 44340 + }, + { + "epoch": 6.295244854506742, + "grad_norm": 0.4570486545562744, + "learning_rate": 9.370759403832506e-05, + "loss": 0.05868352651596069, + "step": 44350 + }, + { + "epoch": 6.29666430092264, + "grad_norm": 10.074518203735352, + "learning_rate": 9.370617459190915e-05, + "loss": 0.0731860876083374, + "step": 44360 + }, + { + "epoch": 6.298083747338538, + "grad_norm": 5.729341506958008, + "learning_rate": 9.370475514549327e-05, + "loss": 0.04211472272872925, + "step": 44370 + }, + { + "epoch": 6.299503193754436, + "grad_norm": 7.382556438446045, + "learning_rate": 9.370333569907736e-05, + "loss": 0.041568410396575925, + "step": 44380 + }, + { + "epoch": 6.3009226401703335, + "grad_norm": 7.665040016174316, + "learning_rate": 9.370191625266147e-05, + "loss": 0.08681845664978027, + "step": 44390 + }, + { + "epoch": 6.302342086586231, + "grad_norm": 5.031284332275391, + "learning_rate": 9.370049680624556e-05, + "loss": 0.06689251661300659, + "step": 44400 + }, + { + "epoch": 6.303761533002129, + "grad_norm": 8.009147644042969, + "learning_rate": 
9.369907735982967e-05, + "loss": 0.05348163843154907, + "step": 44410 + }, + { + "epoch": 6.305180979418027, + "grad_norm": 0.7823144793510437, + "learning_rate": 9.369765791341377e-05, + "loss": 0.025225034356117247, + "step": 44420 + }, + { + "epoch": 6.306600425833925, + "grad_norm": 6.241203784942627, + "learning_rate": 9.369623846699788e-05, + "loss": 0.055314040184020995, + "step": 44430 + }, + { + "epoch": 6.308019872249822, + "grad_norm": 0.48145613074302673, + "learning_rate": 9.369481902058199e-05, + "loss": 0.046424245834350585, + "step": 44440 + }, + { + "epoch": 6.309439318665721, + "grad_norm": 8.973401069641113, + "learning_rate": 9.369339957416608e-05, + "loss": 0.05323241949081421, + "step": 44450 + }, + { + "epoch": 6.310858765081618, + "grad_norm": 10.027241706848145, + "learning_rate": 9.369198012775018e-05, + "loss": 0.027989843487739564, + "step": 44460 + }, + { + "epoch": 6.312278211497516, + "grad_norm": 0.4566883146762848, + "learning_rate": 9.369056068133428e-05, + "loss": 0.010857632756233216, + "step": 44470 + }, + { + "epoch": 6.313697657913414, + "grad_norm": 3.899604320526123, + "learning_rate": 9.368914123491839e-05, + "loss": 0.03426201343536377, + "step": 44480 + }, + { + "epoch": 6.315117104329311, + "grad_norm": 0.21963563561439514, + "learning_rate": 9.368772178850249e-05, + "loss": 0.021139997243881225, + "step": 44490 + }, + { + "epoch": 6.31653655074521, + "grad_norm": 0.3161865472793579, + "learning_rate": 9.36863023420866e-05, + "loss": 0.06483102440834046, + "step": 44500 + }, + { + "epoch": 6.31653655074521, + "eval_accuracy": 0.9754562217841928, + "eval_loss": 0.07596415281295776, + "eval_runtime": 32.4521, + "eval_samples_per_second": 484.622, + "eval_steps_per_second": 15.161, + "step": 44500 + }, + { + "epoch": 6.317955997161107, + "grad_norm": 3.4797637462615967, + "learning_rate": 9.368488289567068e-05, + "loss": 0.046000164747238156, + "step": 44510 + }, + { + "epoch": 6.319375443577005, + "grad_norm": 
3.0673649311065674, + "learning_rate": 9.36834634492548e-05, + "loss": 0.06700726747512817, + "step": 44520 + }, + { + "epoch": 6.320794889992903, + "grad_norm": 0.4138965308666229, + "learning_rate": 9.36820440028389e-05, + "loss": 0.03780338764190674, + "step": 44530 + }, + { + "epoch": 6.3222143364088, + "grad_norm": 0.016734696924686432, + "learning_rate": 9.3680624556423e-05, + "loss": 0.059049326181411746, + "step": 44540 + }, + { + "epoch": 6.3236337828246985, + "grad_norm": 4.171092987060547, + "learning_rate": 9.367920511000711e-05, + "loss": 0.021105588972568513, + "step": 44550 + }, + { + "epoch": 6.325053229240596, + "grad_norm": 0.40709081292152405, + "learning_rate": 9.36777856635912e-05, + "loss": 0.031069639325141906, + "step": 44560 + }, + { + "epoch": 6.326472675656494, + "grad_norm": 1.556479811668396, + "learning_rate": 9.367636621717531e-05, + "loss": 0.022256243228912353, + "step": 44570 + }, + { + "epoch": 6.327892122072392, + "grad_norm": 0.566481351852417, + "learning_rate": 9.36749467707594e-05, + "loss": 0.06047871708869934, + "step": 44580 + }, + { + "epoch": 6.32931156848829, + "grad_norm": 0.029307467862963676, + "learning_rate": 9.367352732434352e-05, + "loss": 0.0970751941204071, + "step": 44590 + }, + { + "epoch": 6.330731014904187, + "grad_norm": 0.20486246049404144, + "learning_rate": 9.367210787792761e-05, + "loss": 0.015537744760513306, + "step": 44600 + }, + { + "epoch": 6.332150461320085, + "grad_norm": 7.652834415435791, + "learning_rate": 9.367068843151171e-05, + "loss": 0.031410837173461915, + "step": 44610 + }, + { + "epoch": 6.333569907735983, + "grad_norm": 0.8402873277664185, + "learning_rate": 9.366926898509582e-05, + "loss": 0.02272775173187256, + "step": 44620 + }, + { + "epoch": 6.3349893541518805, + "grad_norm": 0.1902949959039688, + "learning_rate": 9.366784953867992e-05, + "loss": 0.06183580756187439, + "step": 44630 + }, + { + "epoch": 6.336408800567779, + "grad_norm": 1.0105608701705933, + "learning_rate": 
9.366643009226403e-05, + "loss": 0.027782031893730165, + "step": 44640 + }, + { + "epoch": 6.337828246983676, + "grad_norm": 2.3518152236938477, + "learning_rate": 9.366501064584813e-05, + "loss": 0.05995446443557739, + "step": 44650 + }, + { + "epoch": 6.3392476933995745, + "grad_norm": 0.39905598759651184, + "learning_rate": 9.366359119943222e-05, + "loss": 0.061394399404525755, + "step": 44660 + }, + { + "epoch": 6.340667139815472, + "grad_norm": 7.838604927062988, + "learning_rate": 9.366217175301632e-05, + "loss": 0.06700649261474609, + "step": 44670 + }, + { + "epoch": 6.342086586231369, + "grad_norm": 0.11704706400632858, + "learning_rate": 9.366075230660043e-05, + "loss": 0.03449685275554657, + "step": 44680 + }, + { + "epoch": 6.343506032647268, + "grad_norm": 1.6874034404754639, + "learning_rate": 9.365933286018453e-05, + "loss": 0.08206239938735962, + "step": 44690 + }, + { + "epoch": 6.344925479063165, + "grad_norm": 7.237381935119629, + "learning_rate": 9.365791341376864e-05, + "loss": 0.08659184575080872, + "step": 44700 + }, + { + "epoch": 6.346344925479063, + "grad_norm": 2.569840908050537, + "learning_rate": 9.365649396735274e-05, + "loss": 0.10804457664489746, + "step": 44710 + }, + { + "epoch": 6.347764371894961, + "grad_norm": 12.174409866333008, + "learning_rate": 9.365507452093684e-05, + "loss": 0.08026717901229859, + "step": 44720 + }, + { + "epoch": 6.349183818310859, + "grad_norm": 0.8195727467536926, + "learning_rate": 9.365365507452095e-05, + "loss": 0.024174678325653075, + "step": 44730 + }, + { + "epoch": 6.3506032647267565, + "grad_norm": 1.0109519958496094, + "learning_rate": 9.365223562810504e-05, + "loss": 0.08143213391304016, + "step": 44740 + }, + { + "epoch": 6.352022711142654, + "grad_norm": 6.605328559875488, + "learning_rate": 9.365081618168916e-05, + "loss": 0.03471195697784424, + "step": 44750 + }, + { + "epoch": 6.353442157558552, + "grad_norm": 0.9406673908233643, + "learning_rate": 9.364939673527324e-05, + "loss": 
0.0571575939655304, + "step": 44760 + }, + { + "epoch": 6.35486160397445, + "grad_norm": 0.23806260526180267, + "learning_rate": 9.364797728885735e-05, + "loss": 0.04274870157241821, + "step": 44770 + }, + { + "epoch": 6.356281050390348, + "grad_norm": 1.739245057106018, + "learning_rate": 9.364655784244145e-05, + "loss": 0.033979329466819766, + "step": 44780 + }, + { + "epoch": 6.357700496806245, + "grad_norm": 2.6521658897399902, + "learning_rate": 9.364513839602556e-05, + "loss": 0.0800173044204712, + "step": 44790 + }, + { + "epoch": 6.359119943222144, + "grad_norm": 9.809504508972168, + "learning_rate": 9.364371894960966e-05, + "loss": 0.11794828176498413, + "step": 44800 + }, + { + "epoch": 6.360539389638041, + "grad_norm": 0.32209938764572144, + "learning_rate": 9.364229950319377e-05, + "loss": 0.037167853116989134, + "step": 44810 + }, + { + "epoch": 6.3619588360539385, + "grad_norm": 1.5642523765563965, + "learning_rate": 9.364088005677786e-05, + "loss": 0.02572680115699768, + "step": 44820 + }, + { + "epoch": 6.363378282469837, + "grad_norm": 0.2237132340669632, + "learning_rate": 9.363946061036196e-05, + "loss": 0.022832623124122618, + "step": 44830 + }, + { + "epoch": 6.364797728885734, + "grad_norm": 2.831024646759033, + "learning_rate": 9.363804116394607e-05, + "loss": 0.05402438640594483, + "step": 44840 + }, + { + "epoch": 6.366217175301633, + "grad_norm": 5.034896373748779, + "learning_rate": 9.363662171753017e-05, + "loss": 0.07991594672203065, + "step": 44850 + }, + { + "epoch": 6.36763662171753, + "grad_norm": 1.0471001863479614, + "learning_rate": 9.363520227111428e-05, + "loss": 0.01694260984659195, + "step": 44860 + }, + { + "epoch": 6.369056068133428, + "grad_norm": 4.997382164001465, + "learning_rate": 9.363378282469836e-05, + "loss": 0.04651702046394348, + "step": 44870 + }, + { + "epoch": 6.370475514549326, + "grad_norm": 3.6034419536590576, + "learning_rate": 9.363236337828248e-05, + "loss": 0.029604411125183104, + "step": 44880 + }, + { 
+ "epoch": 6.371894960965223, + "grad_norm": 5.520668029785156, + "learning_rate": 9.363094393186657e-05, + "loss": 0.05422980189323425, + "step": 44890 + }, + { + "epoch": 6.373314407381121, + "grad_norm": 4.492794990539551, + "learning_rate": 9.362952448545068e-05, + "loss": 0.057867521047592164, + "step": 44900 + }, + { + "epoch": 6.374733853797019, + "grad_norm": 9.890128135681152, + "learning_rate": 9.362810503903478e-05, + "loss": 0.048404908180236815, + "step": 44910 + }, + { + "epoch": 6.376153300212917, + "grad_norm": 1.9544626474380493, + "learning_rate": 9.362668559261888e-05, + "loss": 0.052456903457641604, + "step": 44920 + }, + { + "epoch": 6.377572746628815, + "grad_norm": 0.8930409550666809, + "learning_rate": 9.362526614620299e-05, + "loss": 0.04241478145122528, + "step": 44930 + }, + { + "epoch": 6.378992193044713, + "grad_norm": 8.498833656311035, + "learning_rate": 9.362384669978709e-05, + "loss": 0.023489537835121154, + "step": 44940 + }, + { + "epoch": 6.38041163946061, + "grad_norm": 1.9117820262908936, + "learning_rate": 9.36224272533712e-05, + "loss": 0.050252276659011844, + "step": 44950 + }, + { + "epoch": 6.381831085876508, + "grad_norm": 0.12219048291444778, + "learning_rate": 9.36210078069553e-05, + "loss": 0.031068319082260133, + "step": 44960 + }, + { + "epoch": 6.383250532292406, + "grad_norm": 1.1104416847229004, + "learning_rate": 9.361958836053939e-05, + "loss": 0.034421283006668094, + "step": 44970 + }, + { + "epoch": 6.384669978708303, + "grad_norm": 5.63448429107666, + "learning_rate": 9.361816891412349e-05, + "loss": 0.06306658387184143, + "step": 44980 + }, + { + "epoch": 6.386089425124202, + "grad_norm": 0.5965878963470459, + "learning_rate": 9.36167494677076e-05, + "loss": 0.09757879376411438, + "step": 44990 + }, + { + "epoch": 6.387508871540099, + "grad_norm": 7.923148155212402, + "learning_rate": 9.36153300212917e-05, + "loss": 0.02036636769771576, + "step": 45000 + }, + { + "epoch": 6.387508871540099, + 
"eval_accuracy": 0.9795892414319324, + "eval_loss": 0.05749217048287392, + "eval_runtime": 32.5022, + "eval_samples_per_second": 483.874, + "eval_steps_per_second": 15.137, + "step": 45000 + }, + { + "epoch": 6.3889283179559975, + "grad_norm": 9.941558837890625, + "learning_rate": 9.361391057487581e-05, + "loss": 0.05102572441101074, + "step": 45010 + }, + { + "epoch": 6.390347764371895, + "grad_norm": 3.2416279315948486, + "learning_rate": 9.36124911284599e-05, + "loss": 0.07383667826652526, + "step": 45020 + }, + { + "epoch": 6.391767210787792, + "grad_norm": 5.702998638153076, + "learning_rate": 9.3611071682044e-05, + "loss": 0.028665339946746825, + "step": 45030 + }, + { + "epoch": 6.393186657203691, + "grad_norm": 0.4869743883609772, + "learning_rate": 9.360965223562811e-05, + "loss": 0.037399545311927795, + "step": 45040 + }, + { + "epoch": 6.394606103619588, + "grad_norm": 1.0424598455429077, + "learning_rate": 9.360823278921221e-05, + "loss": 0.04913428127765655, + "step": 45050 + }, + { + "epoch": 6.396025550035486, + "grad_norm": 4.111805438995361, + "learning_rate": 9.360681334279632e-05, + "loss": 0.019465672969818115, + "step": 45060 + }, + { + "epoch": 6.397444996451384, + "grad_norm": 0.8496167659759521, + "learning_rate": 9.360553584102201e-05, + "loss": 0.08575330376625061, + "step": 45070 + }, + { + "epoch": 6.398864442867282, + "grad_norm": 2.862640857696533, + "learning_rate": 9.36041163946061e-05, + "loss": 0.022041980922222138, + "step": 45080 + }, + { + "epoch": 6.4002838892831795, + "grad_norm": 0.3247391879558563, + "learning_rate": 9.36026969481902e-05, + "loss": 0.040409648418426515, + "step": 45090 + }, + { + "epoch": 6.401703335699077, + "grad_norm": 4.703793525695801, + "learning_rate": 9.360127750177431e-05, + "loss": 0.023367772996425628, + "step": 45100 + }, + { + "epoch": 6.403122782114975, + "grad_norm": 0.1681506484746933, + "learning_rate": 9.359985805535841e-05, + "loss": 0.024120521545410157, + "step": 45110 + }, + { + 
"epoch": 6.404542228530873, + "grad_norm": 5.247531890869141, + "learning_rate": 9.359843860894252e-05, + "loss": 0.028122204542160033, + "step": 45120 + }, + { + "epoch": 6.405961674946771, + "grad_norm": 0.015556308440864086, + "learning_rate": 9.359701916252662e-05, + "loss": 0.014227265119552612, + "step": 45130 + }, + { + "epoch": 6.407381121362668, + "grad_norm": 3.447943687438965, + "learning_rate": 9.359559971611073e-05, + "loss": 0.05484868288040161, + "step": 45140 + }, + { + "epoch": 6.408800567778567, + "grad_norm": 0.591742217540741, + "learning_rate": 9.359418026969481e-05, + "loss": 0.052353084087371826, + "step": 45150 + }, + { + "epoch": 6.410220014194464, + "grad_norm": 1.4653022289276123, + "learning_rate": 9.359276082327892e-05, + "loss": 0.04562208354473114, + "step": 45160 + }, + { + "epoch": 6.4116394606103615, + "grad_norm": 4.289706230163574, + "learning_rate": 9.359134137686302e-05, + "loss": 0.04674837589263916, + "step": 45170 + }, + { + "epoch": 6.41305890702626, + "grad_norm": 7.5176682472229, + "learning_rate": 9.358992193044713e-05, + "loss": 0.048534101247787474, + "step": 45180 + }, + { + "epoch": 6.414478353442157, + "grad_norm": 0.7810305953025818, + "learning_rate": 9.358850248403124e-05, + "loss": 0.00921451896429062, + "step": 45190 + }, + { + "epoch": 6.4158977998580555, + "grad_norm": 1.4035773277282715, + "learning_rate": 9.358708303761533e-05, + "loss": 0.033289432525634766, + "step": 45200 + }, + { + "epoch": 6.417317246273953, + "grad_norm": 1.1093297004699707, + "learning_rate": 9.358566359119944e-05, + "loss": 0.025805479288101195, + "step": 45210 + }, + { + "epoch": 6.418736692689851, + "grad_norm": 0.6371444463729858, + "learning_rate": 9.358424414478354e-05, + "loss": 0.03949221968650818, + "step": 45220 + }, + { + "epoch": 6.420156139105749, + "grad_norm": 5.708765983581543, + "learning_rate": 9.358282469836765e-05, + "loss": 0.04703320562839508, + "step": 45230 + }, + { + "epoch": 6.421575585521646, + "grad_norm": 
7.504692077636719, + "learning_rate": 9.358140525195174e-05, + "loss": 0.027354171872138976, + "step": 45240 + }, + { + "epoch": 6.422995031937544, + "grad_norm": 6.347846984863281, + "learning_rate": 9.357998580553584e-05, + "loss": 0.07207931280136108, + "step": 45250 + }, + { + "epoch": 6.424414478353442, + "grad_norm": 4.845231533050537, + "learning_rate": 9.357856635911994e-05, + "loss": 0.029147011041641236, + "step": 45260 + }, + { + "epoch": 6.42583392476934, + "grad_norm": 2.5489513874053955, + "learning_rate": 9.357714691270405e-05, + "loss": 0.05189456939697266, + "step": 45270 + }, + { + "epoch": 6.4272533711852375, + "grad_norm": 3.6041786670684814, + "learning_rate": 9.357572746628816e-05, + "loss": 0.056130462884902955, + "step": 45280 + }, + { + "epoch": 6.428672817601136, + "grad_norm": 2.5728187561035156, + "learning_rate": 9.357430801987226e-05, + "loss": 0.02223515808582306, + "step": 45290 + }, + { + "epoch": 6.430092264017033, + "grad_norm": 5.534434795379639, + "learning_rate": 9.357288857345636e-05, + "loss": 0.1306079149246216, + "step": 45300 + }, + { + "epoch": 6.431511710432932, + "grad_norm": 2.4969444274902344, + "learning_rate": 9.357146912704045e-05, + "loss": 0.030625393986701964, + "step": 45310 + }, + { + "epoch": 6.432931156848829, + "grad_norm": 0.8693184852600098, + "learning_rate": 9.357004968062456e-05, + "loss": 0.058841168880462646, + "step": 45320 + }, + { + "epoch": 6.434350603264726, + "grad_norm": 0.7950196266174316, + "learning_rate": 9.356863023420866e-05, + "loss": 0.07491456866264343, + "step": 45330 + }, + { + "epoch": 6.435770049680625, + "grad_norm": 8.494572639465332, + "learning_rate": 9.356721078779277e-05, + "loss": 0.05608614683151245, + "step": 45340 + }, + { + "epoch": 6.437189496096522, + "grad_norm": 9.173154830932617, + "learning_rate": 9.356579134137686e-05, + "loss": 0.10091447830200195, + "step": 45350 + }, + { + "epoch": 6.43860894251242, + "grad_norm": 6.76547384262085, + "learning_rate": 
9.356437189496097e-05, + "loss": 0.027798345685005187, + "step": 45360 + }, + { + "epoch": 6.440028388928318, + "grad_norm": 2.0051448345184326, + "learning_rate": 9.356295244854508e-05, + "loss": 0.07456052303314209, + "step": 45370 + }, + { + "epoch": 6.441447835344216, + "grad_norm": 2.5545055866241455, + "learning_rate": 9.356153300212918e-05, + "loss": 0.05242663621902466, + "step": 45380 + }, + { + "epoch": 6.442867281760114, + "grad_norm": 0.8713223338127136, + "learning_rate": 9.356011355571329e-05, + "loss": 0.01972261071205139, + "step": 45390 + }, + { + "epoch": 6.444286728176011, + "grad_norm": 2.4405908584594727, + "learning_rate": 9.355869410929737e-05, + "loss": 0.021471349895000456, + "step": 45400 + }, + { + "epoch": 6.445706174591909, + "grad_norm": 0.43845289945602417, + "learning_rate": 9.355727466288148e-05, + "loss": 0.041665560007095336, + "step": 45410 + }, + { + "epoch": 6.447125621007807, + "grad_norm": 4.887557029724121, + "learning_rate": 9.355585521646558e-05, + "loss": 0.021108832955360413, + "step": 45420 + }, + { + "epoch": 6.448545067423705, + "grad_norm": 13.41622543334961, + "learning_rate": 9.355443577004969e-05, + "loss": 0.030141952633857726, + "step": 45430 + }, + { + "epoch": 6.4499645138396025, + "grad_norm": 3.250065565109253, + "learning_rate": 9.355301632363379e-05, + "loss": 0.021225135028362273, + "step": 45440 + }, + { + "epoch": 6.451383960255501, + "grad_norm": 12.573553085327148, + "learning_rate": 9.355159687721788e-05, + "loss": 0.040314275026321414, + "step": 45450 + }, + { + "epoch": 6.452803406671398, + "grad_norm": 8.960000038146973, + "learning_rate": 9.3550177430802e-05, + "loss": 0.04356703162193298, + "step": 45460 + }, + { + "epoch": 6.454222853087296, + "grad_norm": 7.876071929931641, + "learning_rate": 9.354875798438609e-05, + "loss": 0.0315957635641098, + "step": 45470 + }, + { + "epoch": 6.455642299503194, + "grad_norm": 0.33070969581604004, + "learning_rate": 9.35473385379702e-05, + "loss": 
0.04888508915901184, + "step": 45480 + }, + { + "epoch": 6.457061745919091, + "grad_norm": 7.309976100921631, + "learning_rate": 9.35459190915543e-05, + "loss": 0.03390091061592102, + "step": 45490 + }, + { + "epoch": 6.45848119233499, + "grad_norm": 10.262081146240234, + "learning_rate": 9.354449964513841e-05, + "loss": 0.05524118542671204, + "step": 45500 + }, + { + "epoch": 6.45848119233499, + "eval_accuracy": 0.9663635785591658, + "eval_loss": 0.11470869183540344, + "eval_runtime": 32.1395, + "eval_samples_per_second": 489.336, + "eval_steps_per_second": 15.308, + "step": 45500 + }, + { + "epoch": 6.459900638750887, + "grad_norm": 0.6667046546936035, + "learning_rate": 9.35430801987225e-05, + "loss": 0.027115851640701294, + "step": 45510 + }, + { + "epoch": 6.461320085166785, + "grad_norm": 2.887190580368042, + "learning_rate": 9.35416607523066e-05, + "loss": 0.06599999070167542, + "step": 45520 + }, + { + "epoch": 6.462739531582683, + "grad_norm": 4.560837268829346, + "learning_rate": 9.35402413058907e-05, + "loss": 0.09502153396606446, + "step": 45530 + }, + { + "epoch": 6.46415897799858, + "grad_norm": 3.5318081378936768, + "learning_rate": 9.353882185947481e-05, + "loss": 0.03770635426044464, + "step": 45540 + }, + { + "epoch": 6.4655784244144785, + "grad_norm": 1.6946613788604736, + "learning_rate": 9.353740241305891e-05, + "loss": 0.04129364490509033, + "step": 45550 + }, + { + "epoch": 6.466997870830376, + "grad_norm": 1.8913307189941406, + "learning_rate": 9.353598296664301e-05, + "loss": 0.054656922817230225, + "step": 45560 + }, + { + "epoch": 6.468417317246274, + "grad_norm": 0.312080442905426, + "learning_rate": 9.353456352022712e-05, + "loss": 0.04530891180038452, + "step": 45570 + }, + { + "epoch": 6.469836763662172, + "grad_norm": 5.941243648529053, + "learning_rate": 9.353314407381122e-05, + "loss": 0.02797747552394867, + "step": 45580 + }, + { + "epoch": 6.47125621007807, + "grad_norm": 0.10799313336610794, + "learning_rate": 
9.353172462739533e-05, + "loss": 0.040154564380645755, + "step": 45590 + }, + { + "epoch": 6.472675656493967, + "grad_norm": 8.12649154663086, + "learning_rate": 9.353030518097943e-05, + "loss": 0.0917394757270813, + "step": 45600 + }, + { + "epoch": 6.474095102909865, + "grad_norm": 0.055603571236133575, + "learning_rate": 9.352888573456352e-05, + "loss": 0.021613481640815734, + "step": 45610 + }, + { + "epoch": 6.475514549325763, + "grad_norm": 3.251713275909424, + "learning_rate": 9.352746628814762e-05, + "loss": 0.06455045938491821, + "step": 45620 + }, + { + "epoch": 6.4769339957416605, + "grad_norm": 11.580523490905762, + "learning_rate": 9.352604684173173e-05, + "loss": 0.06646577119827271, + "step": 45630 + }, + { + "epoch": 6.478353442157559, + "grad_norm": 5.402514457702637, + "learning_rate": 9.352462739531583e-05, + "loss": 0.06263988018035889, + "step": 45640 + }, + { + "epoch": 6.479772888573456, + "grad_norm": 0.2940421998500824, + "learning_rate": 9.352320794889994e-05, + "loss": 0.06655374765396119, + "step": 45650 + }, + { + "epoch": 6.4811923349893545, + "grad_norm": 4.967324256896973, + "learning_rate": 9.352178850248404e-05, + "loss": 0.04391606450080872, + "step": 45660 + }, + { + "epoch": 6.482611781405252, + "grad_norm": 11.9403715133667, + "learning_rate": 9.352036905606813e-05, + "loss": 0.06087355017662048, + "step": 45670 + }, + { + "epoch": 6.484031227821149, + "grad_norm": 6.026821136474609, + "learning_rate": 9.351894960965225e-05, + "loss": 0.02247100919485092, + "step": 45680 + }, + { + "epoch": 6.485450674237048, + "grad_norm": 2.9908483028411865, + "learning_rate": 9.351753016323634e-05, + "loss": 0.0618190348148346, + "step": 45690 + }, + { + "epoch": 6.486870120652945, + "grad_norm": 0.9543291926383972, + "learning_rate": 9.351611071682045e-05, + "loss": 0.0716430127620697, + "step": 45700 + }, + { + "epoch": 6.488289567068843, + "grad_norm": 0.33498528599739075, + "learning_rate": 9.351469127040454e-05, + "loss": 
0.0465453565120697, + "step": 45710 + }, + { + "epoch": 6.489709013484741, + "grad_norm": 6.390286922454834, + "learning_rate": 9.351327182398865e-05, + "loss": 0.07091315388679505, + "step": 45720 + }, + { + "epoch": 6.491128459900639, + "grad_norm": 1.1601217985153198, + "learning_rate": 9.351185237757275e-05, + "loss": 0.024645933508872987, + "step": 45730 + }, + { + "epoch": 6.4925479063165366, + "grad_norm": 11.113912582397461, + "learning_rate": 9.351043293115686e-05, + "loss": 0.11422479152679443, + "step": 45740 + }, + { + "epoch": 6.493967352732434, + "grad_norm": 2.370166540145874, + "learning_rate": 9.350901348474095e-05, + "loss": 0.05117689967155457, + "step": 45750 + }, + { + "epoch": 6.495386799148332, + "grad_norm": 0.16023661196231842, + "learning_rate": 9.350759403832505e-05, + "loss": 0.10015047788619995, + "step": 45760 + }, + { + "epoch": 6.49680624556423, + "grad_norm": 0.25259077548980713, + "learning_rate": 9.350617459190916e-05, + "loss": 0.05752279162406922, + "step": 45770 + }, + { + "epoch": 6.498225691980128, + "grad_norm": 5.102621078491211, + "learning_rate": 9.350475514549326e-05, + "loss": 0.04667982161045074, + "step": 45780 + }, + { + "epoch": 6.499645138396025, + "grad_norm": 8.65860652923584, + "learning_rate": 9.350333569907737e-05, + "loss": 0.07225523591041565, + "step": 45790 + }, + { + "epoch": 6.501064584811924, + "grad_norm": 1.2118887901306152, + "learning_rate": 9.350191625266147e-05, + "loss": 0.09330646991729737, + "step": 45800 + }, + { + "epoch": 6.502484031227821, + "grad_norm": 3.675856590270996, + "learning_rate": 9.350049680624557e-05, + "loss": 0.09584462642669678, + "step": 45810 + }, + { + "epoch": 6.503903477643719, + "grad_norm": 4.621379375457764, + "learning_rate": 9.349907735982966e-05, + "loss": 0.026826804876327513, + "step": 45820 + }, + { + "epoch": 6.505322924059617, + "grad_norm": 6.8167033195495605, + "learning_rate": 9.349765791341377e-05, + "loss": 0.04220779240131378, + "step": 45830 + }, + { + 
"epoch": 6.506742370475514, + "grad_norm": 7.388572692871094, + "learning_rate": 9.349623846699787e-05, + "loss": 0.06925356388092041, + "step": 45840 + }, + { + "epoch": 6.508161816891413, + "grad_norm": 1.5208756923675537, + "learning_rate": 9.349481902058198e-05, + "loss": 0.0498742550611496, + "step": 45850 + }, + { + "epoch": 6.50958126330731, + "grad_norm": 1.2147730588912964, + "learning_rate": 9.349339957416608e-05, + "loss": 0.05823368430137634, + "step": 45860 + }, + { + "epoch": 6.511000709723208, + "grad_norm": 7.438129901885986, + "learning_rate": 9.349198012775018e-05, + "loss": 0.02405182123184204, + "step": 45870 + }, + { + "epoch": 6.512420156139106, + "grad_norm": 3.8440439701080322, + "learning_rate": 9.349056068133429e-05, + "loss": 0.08130161166191101, + "step": 45880 + }, + { + "epoch": 6.513839602555003, + "grad_norm": 5.825738906860352, + "learning_rate": 9.348914123491839e-05, + "loss": 0.05340722799301147, + "step": 45890 + }, + { + "epoch": 6.5152590489709015, + "grad_norm": 0.20723451673984528, + "learning_rate": 9.34877217885025e-05, + "loss": 0.08683145642280579, + "step": 45900 + }, + { + "epoch": 6.516678495386799, + "grad_norm": 0.07636448740959167, + "learning_rate": 9.34863023420866e-05, + "loss": 0.013025203347206115, + "step": 45910 + }, + { + "epoch": 6.518097941802697, + "grad_norm": 6.467600345611572, + "learning_rate": 9.348488289567069e-05, + "loss": 0.07896758913993836, + "step": 45920 + }, + { + "epoch": 6.519517388218595, + "grad_norm": 0.15048600733280182, + "learning_rate": 9.348346344925479e-05, + "loss": 0.05304072499275207, + "step": 45930 + }, + { + "epoch": 6.520936834634493, + "grad_norm": 1.0183674097061157, + "learning_rate": 9.34820440028389e-05, + "loss": 0.02095807492733002, + "step": 45940 + }, + { + "epoch": 6.52235628105039, + "grad_norm": 0.6236469149589539, + "learning_rate": 9.3480624556423e-05, + "loss": 0.026838436722755432, + "step": 45950 + }, + { + "epoch": 6.523775727466289, + "grad_norm": 
3.4434401988983154, + "learning_rate": 9.347920511000711e-05, + "loss": 0.052097213268280027, + "step": 45960 + }, + { + "epoch": 6.525195173882186, + "grad_norm": 2.725377321243286, + "learning_rate": 9.34777856635912e-05, + "loss": 0.03362095654010773, + "step": 45970 + }, + { + "epoch": 6.5266146202980835, + "grad_norm": 0.5739490985870361, + "learning_rate": 9.34763662171753e-05, + "loss": 0.03694147765636444, + "step": 45980 + }, + { + "epoch": 6.528034066713982, + "grad_norm": 2.8594954013824463, + "learning_rate": 9.347494677075941e-05, + "loss": 0.022614985704421997, + "step": 45990 + }, + { + "epoch": 6.529453513129879, + "grad_norm": 1.5943201780319214, + "learning_rate": 9.347352732434351e-05, + "loss": 0.028483986854553223, + "step": 46000 + }, + { + "epoch": 6.529453513129879, + "eval_accuracy": 0.9761556558784257, + "eval_loss": 0.0719020664691925, + "eval_runtime": 32.803, + "eval_samples_per_second": 479.438, + "eval_steps_per_second": 14.999, + "step": 46000 + }, + { + "epoch": 6.5308729595457775, + "grad_norm": 0.6581725478172302, + "learning_rate": 9.347210787792762e-05, + "loss": 0.024489733576774596, + "step": 46010 + }, + { + "epoch": 6.532292405961675, + "grad_norm": 0.812021791934967, + "learning_rate": 9.34706884315117e-05, + "loss": 0.047544506192207334, + "step": 46020 + }, + { + "epoch": 6.533711852377573, + "grad_norm": 0.18070675432682037, + "learning_rate": 9.346926898509582e-05, + "loss": 0.05682721734046936, + "step": 46030 + }, + { + "epoch": 6.535131298793471, + "grad_norm": 1.2470442056655884, + "learning_rate": 9.346784953867991e-05, + "loss": 0.03914521634578705, + "step": 46040 + }, + { + "epoch": 6.536550745209368, + "grad_norm": 0.18507327139377594, + "learning_rate": 9.346643009226402e-05, + "loss": 0.05161336660385132, + "step": 46050 + }, + { + "epoch": 6.537970191625266, + "grad_norm": 2.305793523788452, + "learning_rate": 9.346501064584812e-05, + "loss": 0.04021240770816803, + "step": 46060 + }, + { + "epoch": 
6.539389638041164, + "grad_norm": 0.20650917291641235, + "learning_rate": 9.346359119943222e-05, + "loss": 0.02591737508773804, + "step": 46070 + }, + { + "epoch": 6.540809084457062, + "grad_norm": 1.8644980192184448, + "learning_rate": 9.346217175301633e-05, + "loss": 0.022021229565143585, + "step": 46080 + }, + { + "epoch": 6.5422285308729595, + "grad_norm": 2.503336191177368, + "learning_rate": 9.346075230660043e-05, + "loss": 0.040047654509544374, + "step": 46090 + }, + { + "epoch": 6.543647977288858, + "grad_norm": 0.6199862957000732, + "learning_rate": 9.345933286018454e-05, + "loss": 0.01799771934747696, + "step": 46100 + }, + { + "epoch": 6.545067423704755, + "grad_norm": 0.29369279742240906, + "learning_rate": 9.345791341376864e-05, + "loss": 0.020970551669597624, + "step": 46110 + }, + { + "epoch": 6.546486870120653, + "grad_norm": 0.7996075749397278, + "learning_rate": 9.345649396735273e-05, + "loss": 0.043533599376678465, + "step": 46120 + }, + { + "epoch": 6.547906316536551, + "grad_norm": 0.3149011731147766, + "learning_rate": 9.345507452093683e-05, + "loss": 0.027598875761032104, + "step": 46130 + }, + { + "epoch": 6.549325762952448, + "grad_norm": 0.8261258602142334, + "learning_rate": 9.345365507452094e-05, + "loss": 0.04715876281261444, + "step": 46140 + }, + { + "epoch": 6.550745209368347, + "grad_norm": 4.057319164276123, + "learning_rate": 9.345223562810504e-05, + "loss": 0.03853771984577179, + "step": 46150 + }, + { + "epoch": 6.552164655784244, + "grad_norm": 8.443818092346191, + "learning_rate": 9.345081618168915e-05, + "loss": 0.07661219835281372, + "step": 46160 + }, + { + "epoch": 6.553584102200142, + "grad_norm": 7.326854705810547, + "learning_rate": 9.344939673527325e-05, + "loss": 0.08085020184516907, + "step": 46170 + }, + { + "epoch": 6.55500354861604, + "grad_norm": 0.49740973114967346, + "learning_rate": 9.344797728885734e-05, + "loss": 0.042351937294006346, + "step": 46180 + }, + { + "epoch": 6.556422995031937, + "grad_norm": 
2.507627487182617, + "learning_rate": 9.344655784244146e-05, + "loss": 0.08018359541893005, + "step": 46190 + }, + { + "epoch": 6.557842441447836, + "grad_norm": 3.4351646900177, + "learning_rate": 9.344513839602555e-05, + "loss": 0.09477327466011047, + "step": 46200 + }, + { + "epoch": 6.559261887863733, + "grad_norm": 1.8519635200500488, + "learning_rate": 9.344371894960966e-05, + "loss": 0.04426932036876678, + "step": 46210 + }, + { + "epoch": 6.560681334279631, + "grad_norm": 1.091602087020874, + "learning_rate": 9.344229950319376e-05, + "loss": 0.02118045687675476, + "step": 46220 + }, + { + "epoch": 6.562100780695529, + "grad_norm": 4.351092338562012, + "learning_rate": 9.344088005677786e-05, + "loss": 0.039005580544471743, + "step": 46230 + }, + { + "epoch": 6.563520227111427, + "grad_norm": 0.24212029576301575, + "learning_rate": 9.343946061036196e-05, + "loss": 0.05372491478919983, + "step": 46240 + }, + { + "epoch": 6.564939673527324, + "grad_norm": 3.965920925140381, + "learning_rate": 9.343804116394607e-05, + "loss": 0.04061869978904724, + "step": 46250 + }, + { + "epoch": 6.566359119943222, + "grad_norm": 0.806727945804596, + "learning_rate": 9.343662171753016e-05, + "loss": 0.03351776301860809, + "step": 46260 + }, + { + "epoch": 6.56777856635912, + "grad_norm": 0.08710307627916336, + "learning_rate": 9.343520227111428e-05, + "loss": 0.03745451867580414, + "step": 46270 + }, + { + "epoch": 6.569198012775018, + "grad_norm": 2.581657648086548, + "learning_rate": 9.343378282469837e-05, + "loss": 0.03488259315490723, + "step": 46280 + }, + { + "epoch": 6.570617459190916, + "grad_norm": 5.0455427169799805, + "learning_rate": 9.343236337828247e-05, + "loss": 0.04453178346157074, + "step": 46290 + }, + { + "epoch": 6.572036905606813, + "grad_norm": 3.029026985168457, + "learning_rate": 9.343094393186658e-05, + "loss": 0.03993709683418274, + "step": 46300 + }, + { + "epoch": 6.573456352022712, + "grad_norm": 3.476870536804199, + "learning_rate": 
9.342952448545068e-05, + "loss": 0.0863077163696289, + "step": 46310 + }, + { + "epoch": 6.574875798438609, + "grad_norm": 13.0468111038208, + "learning_rate": 9.342810503903479e-05, + "loss": 0.0779941201210022, + "step": 46320 + }, + { + "epoch": 6.5762952448545064, + "grad_norm": 0.7732180953025818, + "learning_rate": 9.342668559261887e-05, + "loss": 0.04824472963809967, + "step": 46330 + }, + { + "epoch": 6.577714691270405, + "grad_norm": 2.9980263710021973, + "learning_rate": 9.342526614620298e-05, + "loss": 0.051023286581039426, + "step": 46340 + }, + { + "epoch": 6.579134137686302, + "grad_norm": 3.4411685466766357, + "learning_rate": 9.342384669978708e-05, + "loss": 0.029833096265792846, + "step": 46350 + }, + { + "epoch": 6.5805535841022005, + "grad_norm": 0.820574939250946, + "learning_rate": 9.342242725337119e-05, + "loss": 0.05238626003265381, + "step": 46360 + }, + { + "epoch": 6.581973030518098, + "grad_norm": 0.5479278564453125, + "learning_rate": 9.342100780695529e-05, + "loss": 0.03393109440803528, + "step": 46370 + }, + { + "epoch": 6.583392476933996, + "grad_norm": 1.2850233316421509, + "learning_rate": 9.341958836053939e-05, + "loss": 0.029635608196258545, + "step": 46380 + }, + { + "epoch": 6.584811923349894, + "grad_norm": 0.3429562747478485, + "learning_rate": 9.34181689141235e-05, + "loss": 0.07307572960853577, + "step": 46390 + }, + { + "epoch": 6.586231369765791, + "grad_norm": 0.2704232633113861, + "learning_rate": 9.34167494677076e-05, + "loss": 0.03701426386833191, + "step": 46400 + }, + { + "epoch": 6.587650816181689, + "grad_norm": 1.3456661701202393, + "learning_rate": 9.34153300212917e-05, + "loss": 0.040149462223052976, + "step": 46410 + }, + { + "epoch": 6.589070262597587, + "grad_norm": 2.917856454849243, + "learning_rate": 9.34139105748758e-05, + "loss": 0.038611260056495664, + "step": 46420 + }, + { + "epoch": 6.590489709013485, + "grad_norm": 4.476409912109375, + "learning_rate": 9.34124911284599e-05, + "loss": 
0.035803604125976565, + "step": 46430 + }, + { + "epoch": 6.5919091554293825, + "grad_norm": 0.41631996631622314, + "learning_rate": 9.3411071682044e-05, + "loss": 0.023559017479419707, + "step": 46440 + }, + { + "epoch": 6.593328601845281, + "grad_norm": 6.439150810241699, + "learning_rate": 9.340965223562811e-05, + "loss": 0.046225354075431824, + "step": 46450 + }, + { + "epoch": 6.594748048261178, + "grad_norm": 7.897381782531738, + "learning_rate": 9.340823278921221e-05, + "loss": 0.14436500072479247, + "step": 46460 + }, + { + "epoch": 6.596167494677076, + "grad_norm": 3.0115458965301514, + "learning_rate": 9.340681334279632e-05, + "loss": 0.02835048735141754, + "step": 46470 + }, + { + "epoch": 6.597586941092974, + "grad_norm": 1.3637899160385132, + "learning_rate": 9.340539389638042e-05, + "loss": 0.06278788447380065, + "step": 46480 + }, + { + "epoch": 6.599006387508871, + "grad_norm": 0.22502969205379486, + "learning_rate": 9.340397444996451e-05, + "loss": 0.02985972762107849, + "step": 46490 + }, + { + "epoch": 6.60042583392477, + "grad_norm": 4.472122669219971, + "learning_rate": 9.340255500354862e-05, + "loss": 0.07923436164855957, + "step": 46500 + }, + { + "epoch": 6.60042583392477, + "eval_accuracy": 0.9771730145609462, + "eval_loss": 0.07765964418649673, + "eval_runtime": 34.8533, + "eval_samples_per_second": 451.234, + "eval_steps_per_second": 14.116, + "step": 46500 + }, + { + "epoch": 6.601845280340667, + "grad_norm": 1.4962148666381836, + "learning_rate": 9.340113555713272e-05, + "loss": 0.05791752338409424, + "step": 46510 + }, + { + "epoch": 6.603264726756565, + "grad_norm": 9.11119556427002, + "learning_rate": 9.339971611071683e-05, + "loss": 0.09500344395637512, + "step": 46520 + }, + { + "epoch": 6.604684173172463, + "grad_norm": 0.042891647666692734, + "learning_rate": 9.339829666430092e-05, + "loss": 0.07049063444137574, + "step": 46530 + }, + { + "epoch": 6.60610361958836, + "grad_norm": 0.38025587797164917, + "learning_rate": 
9.339687721788503e-05, + "loss": 0.022993910312652587, + "step": 46540 + }, + { + "epoch": 6.6075230660042585, + "grad_norm": 1.9691075086593628, + "learning_rate": 9.339545777146912e-05, + "loss": 0.028483304381370544, + "step": 46550 + }, + { + "epoch": 6.608942512420156, + "grad_norm": 4.02490234375, + "learning_rate": 9.339403832505323e-05, + "loss": 0.031239235401153566, + "step": 46560 + }, + { + "epoch": 6.610361958836054, + "grad_norm": 4.263495922088623, + "learning_rate": 9.339261887863733e-05, + "loss": 0.08727318644523621, + "step": 46570 + }, + { + "epoch": 6.611781405251952, + "grad_norm": 0.8540966510772705, + "learning_rate": 9.339119943222144e-05, + "loss": 0.040844264626502993, + "step": 46580 + }, + { + "epoch": 6.61320085166785, + "grad_norm": 7.359906196594238, + "learning_rate": 9.338977998580554e-05, + "loss": 0.09770232439041138, + "step": 46590 + }, + { + "epoch": 6.614620298083747, + "grad_norm": 0.7775536775588989, + "learning_rate": 9.338836053938964e-05, + "loss": 0.04570707976818085, + "step": 46600 + }, + { + "epoch": 6.616039744499645, + "grad_norm": 0.3048425614833832, + "learning_rate": 9.338694109297375e-05, + "loss": 0.013830628991127015, + "step": 46610 + }, + { + "epoch": 6.617459190915543, + "grad_norm": 0.7367076873779297, + "learning_rate": 9.338552164655785e-05, + "loss": 0.05104082226753235, + "step": 46620 + }, + { + "epoch": 6.6188786373314406, + "grad_norm": 0.09017914533615112, + "learning_rate": 9.338410220014196e-05, + "loss": 0.0412735253572464, + "step": 46630 + }, + { + "epoch": 6.620298083747339, + "grad_norm": 5.469369411468506, + "learning_rate": 9.338268275372604e-05, + "loss": 0.015447242558002472, + "step": 46640 + }, + { + "epoch": 6.621717530163236, + "grad_norm": 2.02463436126709, + "learning_rate": 9.338126330731015e-05, + "loss": 0.04360540807247162, + "step": 46650 + }, + { + "epoch": 6.623136976579135, + "grad_norm": 2.3378515243530273, + "learning_rate": 9.337984386089425e-05, + "loss": 
0.044150394201278684, + "step": 46660 + }, + { + "epoch": 6.624556422995032, + "grad_norm": 2.2478842735290527, + "learning_rate": 9.337842441447836e-05, + "loss": 0.04635085165500641, + "step": 46670 + }, + { + "epoch": 6.625975869410929, + "grad_norm": 2.4840340614318848, + "learning_rate": 9.337700496806247e-05, + "loss": 0.04768993556499481, + "step": 46680 + }, + { + "epoch": 6.627395315826828, + "grad_norm": 2.0236635208129883, + "learning_rate": 9.337558552164656e-05, + "loss": 0.031044638156890868, + "step": 46690 + }, + { + "epoch": 6.628814762242725, + "grad_norm": 4.0689897537231445, + "learning_rate": 9.337416607523067e-05, + "loss": 0.03407057225704193, + "step": 46700 + }, + { + "epoch": 6.6302342086586235, + "grad_norm": 4.269835948944092, + "learning_rate": 9.337274662881476e-05, + "loss": 0.025110429525375365, + "step": 46710 + }, + { + "epoch": 6.631653655074521, + "grad_norm": 0.36590269207954407, + "learning_rate": 9.337132718239887e-05, + "loss": 0.0714455485343933, + "step": 46720 + }, + { + "epoch": 6.633073101490419, + "grad_norm": 0.7490295767784119, + "learning_rate": 9.336990773598297e-05, + "loss": 0.03289024829864502, + "step": 46730 + }, + { + "epoch": 6.634492547906317, + "grad_norm": 7.103208065032959, + "learning_rate": 9.336848828956707e-05, + "loss": 0.05486240386962891, + "step": 46740 + }, + { + "epoch": 6.635911994322214, + "grad_norm": 0.6095759272575378, + "learning_rate": 9.336706884315117e-05, + "loss": 0.02849574089050293, + "step": 46750 + }, + { + "epoch": 6.637331440738112, + "grad_norm": 8.099491119384766, + "learning_rate": 9.336564939673528e-05, + "loss": 0.06593834161758423, + "step": 46760 + }, + { + "epoch": 6.63875088715401, + "grad_norm": 5.979064464569092, + "learning_rate": 9.336422995031939e-05, + "loss": 0.058841896057128903, + "step": 46770 + }, + { + "epoch": 6.640170333569908, + "grad_norm": 0.5812722444534302, + "learning_rate": 9.336281050390349e-05, + "loss": 0.07408539652824402, + "step": 46780 + }, + 
{ + "epoch": 6.6415897799858055, + "grad_norm": 8.559142112731934, + "learning_rate": 9.336139105748758e-05, + "loss": 0.05026623606681824, + "step": 46790 + }, + { + "epoch": 6.643009226401704, + "grad_norm": 1.433610200881958, + "learning_rate": 9.335997161107168e-05, + "loss": 0.024886251986026765, + "step": 46800 + }, + { + "epoch": 6.644428672817601, + "grad_norm": 5.033099174499512, + "learning_rate": 9.335855216465579e-05, + "loss": 0.04658404290676117, + "step": 46810 + }, + { + "epoch": 6.645848119233499, + "grad_norm": 1.879032850265503, + "learning_rate": 9.335713271823989e-05, + "loss": 0.031316140294075014, + "step": 46820 + }, + { + "epoch": 6.647267565649397, + "grad_norm": 1.1462465524673462, + "learning_rate": 9.3355713271824e-05, + "loss": 0.05361767411231995, + "step": 46830 + }, + { + "epoch": 6.648687012065294, + "grad_norm": 0.19135117530822754, + "learning_rate": 9.335429382540808e-05, + "loss": 0.04055591523647308, + "step": 46840 + }, + { + "epoch": 6.650106458481193, + "grad_norm": 0.23918968439102173, + "learning_rate": 9.33528743789922e-05, + "loss": 0.03483697474002838, + "step": 46850 + }, + { + "epoch": 6.65152590489709, + "grad_norm": 1.0027947425842285, + "learning_rate": 9.33514549325763e-05, + "loss": 0.03233981728553772, + "step": 46860 + }, + { + "epoch": 6.652945351312988, + "grad_norm": 4.6202473640441895, + "learning_rate": 9.33500354861604e-05, + "loss": 0.057700860500335696, + "step": 46870 + }, + { + "epoch": 6.654364797728886, + "grad_norm": 0.49680638313293457, + "learning_rate": 9.334861603974451e-05, + "loss": 0.013076686859130859, + "step": 46880 + }, + { + "epoch": 6.655784244144783, + "grad_norm": 12.804496765136719, + "learning_rate": 9.33471965933286e-05, + "loss": 0.07863889336585998, + "step": 46890 + }, + { + "epoch": 6.6572036905606815, + "grad_norm": 0.31738927960395813, + "learning_rate": 9.334577714691271e-05, + "loss": 0.03100692629814148, + "step": 46900 + }, + { + "epoch": 6.658623136976579, + 
"grad_norm": 5.837845802307129, + "learning_rate": 9.33443577004968e-05, + "loss": 0.04397173523902893, + "step": 46910 + }, + { + "epoch": 6.660042583392477, + "grad_norm": 3.81868314743042, + "learning_rate": 9.334293825408092e-05, + "loss": 0.033652427792549136, + "step": 46920 + }, + { + "epoch": 6.661462029808375, + "grad_norm": 0.06779835373163223, + "learning_rate": 9.334151880766501e-05, + "loss": 0.022059588134288786, + "step": 46930 + }, + { + "epoch": 6.662881476224273, + "grad_norm": 1.363473653793335, + "learning_rate": 9.334009936124912e-05, + "loss": 0.025214645266532897, + "step": 46940 + }, + { + "epoch": 6.66430092264017, + "grad_norm": 1.0777561664581299, + "learning_rate": 9.333867991483322e-05, + "loss": 0.02584805488586426, + "step": 46950 + }, + { + "epoch": 6.665720369056068, + "grad_norm": 5.158640384674072, + "learning_rate": 9.333726046841732e-05, + "loss": 0.07380213737487792, + "step": 46960 + }, + { + "epoch": 6.667139815471966, + "grad_norm": 0.5082380771636963, + "learning_rate": 9.333584102200143e-05, + "loss": 0.015044075250625611, + "step": 46970 + }, + { + "epoch": 6.6685592618878635, + "grad_norm": 3.573641300201416, + "learning_rate": 9.333442157558553e-05, + "loss": 0.029354152083396912, + "step": 46980 + }, + { + "epoch": 6.669978708303762, + "grad_norm": 2.1343441009521484, + "learning_rate": 9.333300212916964e-05, + "loss": 0.02817715108394623, + "step": 46990 + }, + { + "epoch": 6.671398154719659, + "grad_norm": 1.2065638303756714, + "learning_rate": 9.333158268275372e-05, + "loss": 0.02622884213924408, + "step": 47000 + }, + { + "epoch": 6.671398154719659, + "eval_accuracy": 0.9733579195014942, + "eval_loss": 0.08096129447221756, + "eval_runtime": 32.7148, + "eval_samples_per_second": 480.73, + "eval_steps_per_second": 15.039, + "step": 47000 + }, + { + "epoch": 6.6728176011355576, + "grad_norm": 0.8675025105476379, + "learning_rate": 9.333016323633783e-05, + "loss": 0.0328928142786026, + "step": 47010 + }, + { + "epoch": 
6.674237047551455, + "grad_norm": 0.07285960763692856, + "learning_rate": 9.332874378992193e-05, + "loss": 0.09256799221038818, + "step": 47020 + }, + { + "epoch": 6.675656493967352, + "grad_norm": 4.019161224365234, + "learning_rate": 9.332732434350604e-05, + "loss": 0.07572144269943237, + "step": 47030 + }, + { + "epoch": 6.677075940383251, + "grad_norm": 7.3675150871276855, + "learning_rate": 9.332590489709014e-05, + "loss": 0.0726061463356018, + "step": 47040 + }, + { + "epoch": 6.678495386799148, + "grad_norm": 12.73004150390625, + "learning_rate": 9.332448545067424e-05, + "loss": 0.07878984212875366, + "step": 47050 + }, + { + "epoch": 6.679914833215046, + "grad_norm": 8.069070816040039, + "learning_rate": 9.332306600425835e-05, + "loss": 0.07341142892837524, + "step": 47060 + }, + { + "epoch": 6.681334279630944, + "grad_norm": 4.058375835418701, + "learning_rate": 9.332164655784245e-05, + "loss": 0.04533909559249878, + "step": 47070 + }, + { + "epoch": 6.682753726046842, + "grad_norm": 0.957751452922821, + "learning_rate": 9.332022711142656e-05, + "loss": 0.02306291162967682, + "step": 47080 + }, + { + "epoch": 6.68417317246274, + "grad_norm": 1.2759968042373657, + "learning_rate": 9.331880766501065e-05, + "loss": 0.03262047171592712, + "step": 47090 + }, + { + "epoch": 6.685592618878637, + "grad_norm": 3.3727896213531494, + "learning_rate": 9.331738821859475e-05, + "loss": 0.11750341653823852, + "step": 47100 + }, + { + "epoch": 6.687012065294535, + "grad_norm": 5.064836025238037, + "learning_rate": 9.331596877217885e-05, + "loss": 0.07212659120559692, + "step": 47110 + }, + { + "epoch": 6.688431511710433, + "grad_norm": 6.41402530670166, + "learning_rate": 9.331454932576296e-05, + "loss": 0.06090214252471924, + "step": 47120 + }, + { + "epoch": 6.689850958126331, + "grad_norm": 0.642575204372406, + "learning_rate": 9.331312987934706e-05, + "loss": 0.049497807025909425, + "step": 47130 + }, + { + "epoch": 6.691270404542228, + "grad_norm": 5.882664203643799, 
+ "learning_rate": 9.331171043293117e-05, + "loss": 0.04391663372516632, + "step": 47140 + }, + { + "epoch": 6.692689850958127, + "grad_norm": 0.44636282324790955, + "learning_rate": 9.331029098651526e-05, + "loss": 0.02593545913696289, + "step": 47150 + }, + { + "epoch": 6.694109297374024, + "grad_norm": 4.4043989181518555, + "learning_rate": 9.330887154009936e-05, + "loss": 0.03738081157207489, + "step": 47160 + }, + { + "epoch": 6.695528743789922, + "grad_norm": 0.9728304743766785, + "learning_rate": 9.330745209368347e-05, + "loss": 0.044902724027633664, + "step": 47170 + }, + { + "epoch": 6.69694819020582, + "grad_norm": 1.4055997133255005, + "learning_rate": 9.330603264726757e-05, + "loss": 0.05182392597198486, + "step": 47180 + }, + { + "epoch": 6.698367636621717, + "grad_norm": 0.5322553515434265, + "learning_rate": 9.330461320085168e-05, + "loss": 0.03811835050582886, + "step": 47190 + }, + { + "epoch": 6.699787083037616, + "grad_norm": 5.997743606567383, + "learning_rate": 9.330319375443577e-05, + "loss": 0.08122850656509399, + "step": 47200 + }, + { + "epoch": 6.701206529453513, + "grad_norm": 3.8824660778045654, + "learning_rate": 9.330177430801988e-05, + "loss": 0.10573461055755615, + "step": 47210 + }, + { + "epoch": 6.702625975869411, + "grad_norm": 2.0631566047668457, + "learning_rate": 9.330035486160397e-05, + "loss": 0.07894684672355652, + "step": 47220 + }, + { + "epoch": 6.704045422285309, + "grad_norm": 0.43265125155448914, + "learning_rate": 9.329893541518808e-05, + "loss": 0.00781625285744667, + "step": 47230 + }, + { + "epoch": 6.705464868701206, + "grad_norm": 5.595661640167236, + "learning_rate": 9.329751596877218e-05, + "loss": 0.02967623770236969, + "step": 47240 + }, + { + "epoch": 6.7068843151171045, + "grad_norm": 4.208871841430664, + "learning_rate": 9.329609652235629e-05, + "loss": 0.03154032528400421, + "step": 47250 + }, + { + "epoch": 6.708303761533002, + "grad_norm": 2.042327642440796, + "learning_rate": 9.329467707594039e-05, + 
"loss": 0.035281413793563844, + "step": 47260 + }, + { + "epoch": 6.7097232079489, + "grad_norm": 2.4885547161102295, + "learning_rate": 9.329325762952449e-05, + "loss": 0.06740244030952454, + "step": 47270 + }, + { + "epoch": 6.711142654364798, + "grad_norm": 3.6246352195739746, + "learning_rate": 9.32918381831086e-05, + "loss": 0.03529942333698273, + "step": 47280 + }, + { + "epoch": 6.712562100780696, + "grad_norm": 7.900681018829346, + "learning_rate": 9.32904187366927e-05, + "loss": 0.04457513988018036, + "step": 47290 + }, + { + "epoch": 6.713981547196593, + "grad_norm": 6.030803680419922, + "learning_rate": 9.32889992902768e-05, + "loss": 0.07301679849624634, + "step": 47300 + }, + { + "epoch": 6.715400993612491, + "grad_norm": 1.9671475887298584, + "learning_rate": 9.328757984386089e-05, + "loss": 0.03088918924331665, + "step": 47310 + }, + { + "epoch": 6.716820440028389, + "grad_norm": 0.9832845330238342, + "learning_rate": 9.3286160397445e-05, + "loss": 0.03263017535209656, + "step": 47320 + }, + { + "epoch": 6.7182398864442865, + "grad_norm": 1.1375794410705566, + "learning_rate": 9.32847409510291e-05, + "loss": 0.04314920902252197, + "step": 47330 + }, + { + "epoch": 6.719659332860185, + "grad_norm": 1.5910285711288452, + "learning_rate": 9.328332150461321e-05, + "loss": 0.03253903090953827, + "step": 47340 + }, + { + "epoch": 6.721078779276082, + "grad_norm": 9.413193702697754, + "learning_rate": 9.328190205819731e-05, + "loss": 0.03828516602516174, + "step": 47350 + }, + { + "epoch": 6.7224982256919805, + "grad_norm": 10.063383102416992, + "learning_rate": 9.32804826117814e-05, + "loss": 0.06823945045471191, + "step": 47360 + }, + { + "epoch": 6.723917672107878, + "grad_norm": 10.355716705322266, + "learning_rate": 9.327906316536552e-05, + "loss": 0.06583051681518555, + "step": 47370 + }, + { + "epoch": 6.725337118523775, + "grad_norm": 1.6706312894821167, + "learning_rate": 9.327764371894961e-05, + "loss": 0.050222575664520264, + "step": 47380 + }, + 
{ + "epoch": 6.726756564939674, + "grad_norm": 12.793781280517578, + "learning_rate": 9.327622427253372e-05, + "loss": 0.0696624755859375, + "step": 47390 + }, + { + "epoch": 6.728176011355571, + "grad_norm": 14.380304336547852, + "learning_rate": 9.327480482611782e-05, + "loss": 0.07547839879989623, + "step": 47400 + }, + { + "epoch": 6.729595457771469, + "grad_norm": 0.04411943256855011, + "learning_rate": 9.327338537970192e-05, + "loss": 0.05585261583328247, + "step": 47410 + }, + { + "epoch": 6.731014904187367, + "grad_norm": 5.027952194213867, + "learning_rate": 9.327196593328602e-05, + "loss": 0.0800262987613678, + "step": 47420 + }, + { + "epoch": 6.732434350603265, + "grad_norm": 3.2839534282684326, + "learning_rate": 9.327054648687013e-05, + "loss": 0.09744354486465454, + "step": 47430 + }, + { + "epoch": 6.7338537970191625, + "grad_norm": 1.8493952751159668, + "learning_rate": 9.326912704045422e-05, + "loss": 0.041766023635864256, + "step": 47440 + }, + { + "epoch": 6.73527324343506, + "grad_norm": 3.7212188243865967, + "learning_rate": 9.326770759403834e-05, + "loss": 0.03478267788887024, + "step": 47450 + }, + { + "epoch": 6.736692689850958, + "grad_norm": 2.638964891433716, + "learning_rate": 9.326628814762243e-05, + "loss": 0.03159240484237671, + "step": 47460 + }, + { + "epoch": 6.738112136266856, + "grad_norm": 6.593120574951172, + "learning_rate": 9.326486870120653e-05, + "loss": 0.07078714966773987, + "step": 47470 + }, + { + "epoch": 6.739531582682754, + "grad_norm": 4.137892246246338, + "learning_rate": 9.326344925479064e-05, + "loss": 0.044456595182418825, + "step": 47480 + }, + { + "epoch": 6.740951029098651, + "grad_norm": 1.0298532247543335, + "learning_rate": 9.326202980837474e-05, + "loss": 0.05453903079032898, + "step": 47490 + }, + { + "epoch": 6.74237047551455, + "grad_norm": 5.411275386810303, + "learning_rate": 9.326061036195885e-05, + "loss": 0.03052000105381012, + "step": 47500 + }, + { + "epoch": 6.74237047551455, + 
"eval_accuracy": 0.9678896165829465, + "eval_loss": 0.10287806391716003, + "eval_runtime": 33.2324, + "eval_samples_per_second": 473.244, + "eval_steps_per_second": 14.805, + "step": 47500 + }, + { + "epoch": 6.743789921930447, + "grad_norm": 0.06812303513288498, + "learning_rate": 9.325919091554293e-05, + "loss": 0.05741068124771118, + "step": 47510 + }, + { + "epoch": 6.7452093683463445, + "grad_norm": 7.734038829803467, + "learning_rate": 9.325777146912704e-05, + "loss": 0.07300693392753602, + "step": 47520 + }, + { + "epoch": 6.746628814762243, + "grad_norm": 4.815206050872803, + "learning_rate": 9.325635202271114e-05, + "loss": 0.02375160902738571, + "step": 47530 + }, + { + "epoch": 6.74804826117814, + "grad_norm": 0.18969817459583282, + "learning_rate": 9.325493257629525e-05, + "loss": 0.05244582295417786, + "step": 47540 + }, + { + "epoch": 6.749467707594039, + "grad_norm": 3.6226210594177246, + "learning_rate": 9.325351312987935e-05, + "loss": 0.0583953857421875, + "step": 47550 + }, + { + "epoch": 6.750887154009936, + "grad_norm": 9.285144805908203, + "learning_rate": 9.325209368346345e-05, + "loss": 0.07046167850494385, + "step": 47560 + }, + { + "epoch": 6.752306600425834, + "grad_norm": 6.898341655731201, + "learning_rate": 9.325067423704756e-05, + "loss": 0.024330171942710876, + "step": 47570 + }, + { + "epoch": 6.753726046841732, + "grad_norm": 1.6316328048706055, + "learning_rate": 9.324925479063166e-05, + "loss": 0.040696841478347776, + "step": 47580 + }, + { + "epoch": 6.755145493257629, + "grad_norm": 0.510357677936554, + "learning_rate": 9.324783534421577e-05, + "loss": 0.015253564715385437, + "step": 47590 + }, + { + "epoch": 6.7565649396735274, + "grad_norm": 4.652739524841309, + "learning_rate": 9.324641589779986e-05, + "loss": 0.03557385802268982, + "step": 47600 + }, + { + "epoch": 6.757984386089425, + "grad_norm": 0.3203953802585602, + "learning_rate": 9.324499645138397e-05, + "loss": 0.010098765790462493, + "step": 47610 + }, + { + 
"epoch": 6.759403832505323, + "grad_norm": 1.6275372505187988, + "learning_rate": 9.324357700496806e-05, + "loss": 0.03298323750495911, + "step": 47620 + }, + { + "epoch": 6.760823278921221, + "grad_norm": 11.40868854522705, + "learning_rate": 9.324215755855217e-05, + "loss": 0.07673094868659973, + "step": 47630 + }, + { + "epoch": 6.762242725337119, + "grad_norm": 7.8622212409973145, + "learning_rate": 9.324073811213627e-05, + "loss": 0.05273681879043579, + "step": 47640 + }, + { + "epoch": 6.763662171753016, + "grad_norm": 5.881760120391846, + "learning_rate": 9.323931866572038e-05, + "loss": 0.0287151038646698, + "step": 47650 + }, + { + "epoch": 6.765081618168914, + "grad_norm": 8.757052421569824, + "learning_rate": 9.323789921930447e-05, + "loss": 0.10142724514007569, + "step": 47660 + }, + { + "epoch": 6.766501064584812, + "grad_norm": 7.562860012054443, + "learning_rate": 9.323647977288857e-05, + "loss": 0.061179614067077635, + "step": 47670 + }, + { + "epoch": 6.7679205110007095, + "grad_norm": 0.293720006942749, + "learning_rate": 9.323506032647268e-05, + "loss": 0.0769286334514618, + "step": 47680 + }, + { + "epoch": 6.769339957416608, + "grad_norm": 5.228733062744141, + "learning_rate": 9.323364088005678e-05, + "loss": 0.03475149571895599, + "step": 47690 + }, + { + "epoch": 6.770759403832505, + "grad_norm": 0.8717983961105347, + "learning_rate": 9.323222143364089e-05, + "loss": 0.03800695240497589, + "step": 47700 + }, + { + "epoch": 6.7721788502484035, + "grad_norm": 16.308420181274414, + "learning_rate": 9.323080198722499e-05, + "loss": 0.04515612721443176, + "step": 47710 + }, + { + "epoch": 6.773598296664301, + "grad_norm": 5.392683029174805, + "learning_rate": 9.322938254080909e-05, + "loss": 0.030790060758590698, + "step": 47720 + }, + { + "epoch": 6.775017743080198, + "grad_norm": 4.039763927459717, + "learning_rate": 9.322796309439318e-05, + "loss": 0.03297184109687805, + "step": 47730 + }, + { + "epoch": 6.776437189496097, + "grad_norm": 
5.786772727966309, + "learning_rate": 9.32265436479773e-05, + "loss": 0.05468297004699707, + "step": 47740 + }, + { + "epoch": 6.777856635911994, + "grad_norm": 16.83050537109375, + "learning_rate": 9.322512420156139e-05, + "loss": 0.04757193326950073, + "step": 47750 + }, + { + "epoch": 6.779276082327892, + "grad_norm": 4.77114200592041, + "learning_rate": 9.32237047551455e-05, + "loss": 0.03624268174171448, + "step": 47760 + }, + { + "epoch": 6.78069552874379, + "grad_norm": 10.355727195739746, + "learning_rate": 9.32222853087296e-05, + "loss": 0.039849352836608884, + "step": 47770 + }, + { + "epoch": 6.782114975159688, + "grad_norm": 5.408930778503418, + "learning_rate": 9.32208658623137e-05, + "loss": 0.05064322948455811, + "step": 47780 + }, + { + "epoch": 6.7835344215755855, + "grad_norm": 1.2858765125274658, + "learning_rate": 9.321944641589781e-05, + "loss": 0.04928310811519623, + "step": 47790 + }, + { + "epoch": 6.784953867991483, + "grad_norm": 0.4276614487171173, + "learning_rate": 9.32180269694819e-05, + "loss": 0.051668965816497804, + "step": 47800 + }, + { + "epoch": 6.786373314407381, + "grad_norm": 0.45648816227912903, + "learning_rate": 9.321660752306602e-05, + "loss": 0.03476710915565491, + "step": 47810 + }, + { + "epoch": 6.787792760823279, + "grad_norm": 23.13014793395996, + "learning_rate": 9.32151880766501e-05, + "loss": 0.043744403123855594, + "step": 47820 + }, + { + "epoch": 6.789212207239177, + "grad_norm": 5.712381362915039, + "learning_rate": 9.321376863023421e-05, + "loss": 0.112934410572052, + "step": 47830 + }, + { + "epoch": 6.790631653655074, + "grad_norm": 0.8790338039398193, + "learning_rate": 9.321234918381831e-05, + "loss": 0.054661625623703004, + "step": 47840 + }, + { + "epoch": 6.792051100070973, + "grad_norm": 8.923171997070312, + "learning_rate": 9.321092973740242e-05, + "loss": 0.043182292580604555, + "step": 47850 + }, + { + "epoch": 6.79347054648687, + "grad_norm": 4.031979560852051, + "learning_rate": 
9.320951029098652e-05, + "loss": 0.06587361693382263, + "step": 47860 + }, + { + "epoch": 6.7948899929027675, + "grad_norm": 1.1798633337020874, + "learning_rate": 9.320809084457061e-05, + "loss": 0.06723747253417969, + "step": 47870 + }, + { + "epoch": 6.796309439318666, + "grad_norm": 0.15142561495304108, + "learning_rate": 9.320681334279631e-05, + "loss": 0.0663286030292511, + "step": 47880 + }, + { + "epoch": 6.797728885734563, + "grad_norm": 0.30359768867492676, + "learning_rate": 9.320539389638041e-05, + "loss": 0.028592944145202637, + "step": 47890 + }, + { + "epoch": 6.7991483321504615, + "grad_norm": 0.025870347395539284, + "learning_rate": 9.320397444996452e-05, + "loss": 0.03683372735977173, + "step": 47900 + }, + { + "epoch": 6.800567778566359, + "grad_norm": 1.0389518737792969, + "learning_rate": 9.320255500354862e-05, + "loss": 0.0489422082901001, + "step": 47910 + }, + { + "epoch": 6.801987224982257, + "grad_norm": 2.1409695148468018, + "learning_rate": 9.320113555713273e-05, + "loss": 0.07247651815414428, + "step": 47920 + }, + { + "epoch": 6.803406671398155, + "grad_norm": 5.814974784851074, + "learning_rate": 9.319971611071683e-05, + "loss": 0.06912614107131958, + "step": 47930 + }, + { + "epoch": 6.804826117814052, + "grad_norm": 0.1616591513156891, + "learning_rate": 9.319829666430094e-05, + "loss": 0.05542629361152649, + "step": 47940 + }, + { + "epoch": 6.80624556422995, + "grad_norm": 5.357841968536377, + "learning_rate": 9.319687721788502e-05, + "loss": 0.04025912284851074, + "step": 47950 + }, + { + "epoch": 6.807665010645848, + "grad_norm": 1.2836575508117676, + "learning_rate": 9.319545777146913e-05, + "loss": 0.0756769597530365, + "step": 47960 + }, + { + "epoch": 6.809084457061746, + "grad_norm": 0.03740439563989639, + "learning_rate": 9.319403832505323e-05, + "loss": 0.0715951144695282, + "step": 47970 + }, + { + "epoch": 6.810503903477644, + "grad_norm": 0.518546462059021, + "learning_rate": 9.319261887863734e-05, + "loss": 
0.04978099465370178, + "step": 47980 + }, + { + "epoch": 6.811923349893542, + "grad_norm": 0.18438740074634552, + "learning_rate": 9.319119943222144e-05, + "loss": 0.01751907467842102, + "step": 47990 + }, + { + "epoch": 6.813342796309439, + "grad_norm": 3.142836570739746, + "learning_rate": 9.318977998580554e-05, + "loss": 0.02677932381629944, + "step": 48000 + }, + { + "epoch": 6.813342796309439, + "eval_accuracy": 0.9809881096203981, + "eval_loss": 0.05622277408838272, + "eval_runtime": 32.8658, + "eval_samples_per_second": 478.522, + "eval_steps_per_second": 14.97, + "step": 48000 + }, + { + "epoch": 6.814762242725337, + "grad_norm": 2.0536131858825684, + "learning_rate": 9.318836053938965e-05, + "loss": 0.015174926817417144, + "step": 48010 + }, + { + "epoch": 6.816181689141235, + "grad_norm": 0.4630908966064453, + "learning_rate": 9.318694109297374e-05, + "loss": 0.030945992469787596, + "step": 48020 + }, + { + "epoch": 6.817601135557132, + "grad_norm": 4.387209415435791, + "learning_rate": 9.318552164655785e-05, + "loss": 0.03779844045639038, + "step": 48030 + }, + { + "epoch": 6.819020581973031, + "grad_norm": 2.2216169834136963, + "learning_rate": 9.318410220014195e-05, + "loss": 0.060065722465515135, + "step": 48040 + }, + { + "epoch": 6.820440028388928, + "grad_norm": 5.043936252593994, + "learning_rate": 9.318268275372605e-05, + "loss": 0.051817584037780764, + "step": 48050 + }, + { + "epoch": 6.8218594748048265, + "grad_norm": 5.093997001647949, + "learning_rate": 9.318126330731015e-05, + "loss": 0.0350986510515213, + "step": 48060 + }, + { + "epoch": 6.823278921220724, + "grad_norm": 0.955636203289032, + "learning_rate": 9.317984386089426e-05, + "loss": 0.06274473071098327, + "step": 48070 + }, + { + "epoch": 6.824698367636621, + "grad_norm": 0.037599656730890274, + "learning_rate": 9.317842441447836e-05, + "loss": 0.04869303405284882, + "step": 48080 + }, + { + "epoch": 6.82611781405252, + "grad_norm": 0.32732093334198, + "learning_rate": 
9.317700496806247e-05, + "loss": 0.03161635100841522, + "step": 48090 + }, + { + "epoch": 6.827537260468417, + "grad_norm": 2.418246030807495, + "learning_rate": 9.317558552164656e-05, + "loss": 0.029885494709014894, + "step": 48100 + }, + { + "epoch": 6.828956706884315, + "grad_norm": 6.135824203491211, + "learning_rate": 9.317416607523066e-05, + "loss": 0.03382058739662171, + "step": 48110 + }, + { + "epoch": 6.830376153300213, + "grad_norm": 3.056666374206543, + "learning_rate": 9.317274662881477e-05, + "loss": 0.030065348744392394, + "step": 48120 + }, + { + "epoch": 6.831795599716111, + "grad_norm": 2.694615602493286, + "learning_rate": 9.317132718239887e-05, + "loss": 0.08778796792030334, + "step": 48130 + }, + { + "epoch": 6.8332150461320085, + "grad_norm": 0.2775319218635559, + "learning_rate": 9.316990773598298e-05, + "loss": 0.07494470477104187, + "step": 48140 + }, + { + "epoch": 6.834634492547906, + "grad_norm": 4.01226282119751, + "learning_rate": 9.316848828956706e-05, + "loss": 0.02631029188632965, + "step": 48150 + }, + { + "epoch": 6.836053938963804, + "grad_norm": 6.277001857757568, + "learning_rate": 9.316706884315118e-05, + "loss": 0.046081721782684326, + "step": 48160 + }, + { + "epoch": 6.837473385379702, + "grad_norm": 6.287518501281738, + "learning_rate": 9.316564939673527e-05, + "loss": 0.07654207348823547, + "step": 48170 + }, + { + "epoch": 6.8388928317956, + "grad_norm": 2.8772833347320557, + "learning_rate": 9.316422995031938e-05, + "loss": 0.026699700951576234, + "step": 48180 + }, + { + "epoch": 6.840312278211497, + "grad_norm": 11.019613265991211, + "learning_rate": 9.316281050390348e-05, + "loss": 0.06047802567481995, + "step": 48190 + }, + { + "epoch": 6.841731724627396, + "grad_norm": 1.1007559299468994, + "learning_rate": 9.316139105748758e-05, + "loss": 0.01713552176952362, + "step": 48200 + }, + { + "epoch": 6.843151171043293, + "grad_norm": 1.7577693462371826, + "learning_rate": 9.315997161107169e-05, + "loss": 
0.07861257791519165, + "step": 48210 + }, + { + "epoch": 6.8445706174591905, + "grad_norm": 0.16370804607868195, + "learning_rate": 9.315855216465579e-05, + "loss": 0.02424170821905136, + "step": 48220 + }, + { + "epoch": 6.845990063875089, + "grad_norm": 0.40334025025367737, + "learning_rate": 9.31571327182399e-05, + "loss": 0.0454858660697937, + "step": 48230 + }, + { + "epoch": 6.847409510290986, + "grad_norm": 0.16940270364284515, + "learning_rate": 9.3155713271824e-05, + "loss": 0.020930516719818115, + "step": 48240 + }, + { + "epoch": 6.8488289567068845, + "grad_norm": 2.689833402633667, + "learning_rate": 9.315429382540809e-05, + "loss": 0.06456713676452637, + "step": 48250 + }, + { + "epoch": 6.850248403122782, + "grad_norm": 0.9107224345207214, + "learning_rate": 9.315287437899219e-05, + "loss": 0.0072081081569194795, + "step": 48260 + }, + { + "epoch": 6.85166784953868, + "grad_norm": 4.121362686157227, + "learning_rate": 9.31514549325763e-05, + "loss": 0.0435828447341919, + "step": 48270 + }, + { + "epoch": 6.853087295954578, + "grad_norm": 1.2586746215820312, + "learning_rate": 9.31500354861604e-05, + "loss": 0.09252724051475525, + "step": 48280 + }, + { + "epoch": 6.854506742370475, + "grad_norm": 1.9886664152145386, + "learning_rate": 9.314861603974451e-05, + "loss": 0.04249245822429657, + "step": 48290 + }, + { + "epoch": 6.855926188786373, + "grad_norm": 4.510488986968994, + "learning_rate": 9.31471965933286e-05, + "loss": 0.016424933075904848, + "step": 48300 + }, + { + "epoch": 6.857345635202271, + "grad_norm": 0.23531511425971985, + "learning_rate": 9.31457771469127e-05, + "loss": 0.02162374258041382, + "step": 48310 + }, + { + "epoch": 6.858765081618169, + "grad_norm": 7.018204689025879, + "learning_rate": 9.314435770049681e-05, + "loss": 0.05110843181610107, + "step": 48320 + }, + { + "epoch": 6.8601845280340665, + "grad_norm": 4.952686309814453, + "learning_rate": 9.314293825408091e-05, + "loss": 0.0650827169418335, + "step": 48330 + }, + { + 
"epoch": 6.861603974449965, + "grad_norm": 0.2199895679950714, + "learning_rate": 9.314151880766502e-05, + "loss": 0.018927814066410066, + "step": 48340 + }, + { + "epoch": 6.863023420865862, + "grad_norm": 6.606371879577637, + "learning_rate": 9.314009936124912e-05, + "loss": 0.032240912318229675, + "step": 48350 + }, + { + "epoch": 6.86444286728176, + "grad_norm": 1.6385270357131958, + "learning_rate": 9.313867991483322e-05, + "loss": 0.023154914379119873, + "step": 48360 + }, + { + "epoch": 6.865862313697658, + "grad_norm": 4.304486274719238, + "learning_rate": 9.313726046841731e-05, + "loss": 0.05492810606956482, + "step": 48370 + }, + { + "epoch": 6.867281760113555, + "grad_norm": 2.8906960487365723, + "learning_rate": 9.313584102200143e-05, + "loss": 0.06663978099822998, + "step": 48380 + }, + { + "epoch": 6.868701206529454, + "grad_norm": 0.6142754554748535, + "learning_rate": 9.313442157558552e-05, + "loss": 0.0662991464138031, + "step": 48390 + }, + { + "epoch": 6.870120652945351, + "grad_norm": 0.6287058591842651, + "learning_rate": 9.313300212916963e-05, + "loss": 0.06578343510627746, + "step": 48400 + }, + { + "epoch": 6.871540099361249, + "grad_norm": 6.606374740600586, + "learning_rate": 9.313158268275373e-05, + "loss": 0.10795985460281372, + "step": 48410 + }, + { + "epoch": 6.872959545777147, + "grad_norm": 3.5225772857666016, + "learning_rate": 9.313016323633783e-05, + "loss": 0.01849919557571411, + "step": 48420 + }, + { + "epoch": 6.874378992193044, + "grad_norm": 0.921964704990387, + "learning_rate": 9.312874378992194e-05, + "loss": 0.041754227876663205, + "step": 48430 + }, + { + "epoch": 6.875798438608943, + "grad_norm": 0.8441599011421204, + "learning_rate": 9.312732434350604e-05, + "loss": 0.11150503158569336, + "step": 48440 + }, + { + "epoch": 6.87721788502484, + "grad_norm": 0.2408447563648224, + "learning_rate": 9.312590489709015e-05, + "loss": 0.04510400295257568, + "step": 48450 + }, + { + "epoch": 6.878637331440738, + "grad_norm": 
4.951006889343262, + "learning_rate": 9.312448545067423e-05, + "loss": 0.06654103994369506, + "step": 48460 + }, + { + "epoch": 6.880056777856636, + "grad_norm": 0.20190109312534332, + "learning_rate": 9.312306600425834e-05, + "loss": 0.04683546721935272, + "step": 48470 + }, + { + "epoch": 6.881476224272534, + "grad_norm": 0.44238659739494324, + "learning_rate": 9.312164655784244e-05, + "loss": 0.07028831243515014, + "step": 48480 + }, + { + "epoch": 6.8828956706884314, + "grad_norm": 0.18961113691329956, + "learning_rate": 9.312022711142655e-05, + "loss": 0.0513995349407196, + "step": 48490 + }, + { + "epoch": 6.884315117104329, + "grad_norm": 1.5219275951385498, + "learning_rate": 9.311880766501065e-05, + "loss": 0.04622653126716614, + "step": 48500 + }, + { + "epoch": 6.884315117104329, + "eval_accuracy": 0.9761556558784257, + "eval_loss": 0.08012785017490387, + "eval_runtime": 33.1858, + "eval_samples_per_second": 473.908, + "eval_steps_per_second": 14.826, + "step": 48500 + }, + { + "epoch": 6.885734563520227, + "grad_norm": 7.168130397796631, + "learning_rate": 9.311738821859475e-05, + "loss": 0.0142391636967659, + "step": 48510 + }, + { + "epoch": 6.887154009936125, + "grad_norm": 0.36366888880729675, + "learning_rate": 9.311596877217886e-05, + "loss": 0.00685754343867302, + "step": 48520 + }, + { + "epoch": 6.888573456352023, + "grad_norm": 6.618060111999512, + "learning_rate": 9.311454932576295e-05, + "loss": 0.03779117166996002, + "step": 48530 + }, + { + "epoch": 6.88999290276792, + "grad_norm": 1.2769519090652466, + "learning_rate": 9.311312987934707e-05, + "loss": 0.05502520203590393, + "step": 48540 + }, + { + "epoch": 6.891412349183819, + "grad_norm": 2.1486284732818604, + "learning_rate": 9.311171043293116e-05, + "loss": 0.05189969539642334, + "step": 48550 + }, + { + "epoch": 6.892831795599716, + "grad_norm": 5.32498025894165, + "learning_rate": 9.311029098651526e-05, + "loss": 0.041870713233947754, + "step": 48560 + }, + { + "epoch": 
6.8942512420156135, + "grad_norm": 7.3712005615234375, + "learning_rate": 9.310887154009936e-05, + "loss": 0.05584554672241211, + "step": 48570 + }, + { + "epoch": 6.895670688431512, + "grad_norm": 0.048228669911623, + "learning_rate": 9.310745209368347e-05, + "loss": 0.025919014215469362, + "step": 48580 + }, + { + "epoch": 6.897090134847409, + "grad_norm": 0.2741209864616394, + "learning_rate": 9.310603264726757e-05, + "loss": 0.057539498805999754, + "step": 48590 + }, + { + "epoch": 6.8985095812633075, + "grad_norm": 0.655285656452179, + "learning_rate": 9.310461320085168e-05, + "loss": 0.10678447484970092, + "step": 48600 + }, + { + "epoch": 6.899929027679205, + "grad_norm": 0.24642018973827362, + "learning_rate": 9.310319375443577e-05, + "loss": 0.02704497575759888, + "step": 48610 + }, + { + "epoch": 6.901348474095103, + "grad_norm": 0.37187427282333374, + "learning_rate": 9.310177430801987e-05, + "loss": 0.02497977912425995, + "step": 48620 + }, + { + "epoch": 6.902767920511001, + "grad_norm": 1.8962280750274658, + "learning_rate": 9.310035486160398e-05, + "loss": 0.0649878740310669, + "step": 48630 + }, + { + "epoch": 6.904187366926898, + "grad_norm": 5.100073337554932, + "learning_rate": 9.309893541518808e-05, + "loss": 0.04320423901081085, + "step": 48640 + }, + { + "epoch": 6.905606813342796, + "grad_norm": 6.323354721069336, + "learning_rate": 9.309751596877219e-05, + "loss": 0.058730268478393556, + "step": 48650 + }, + { + "epoch": 6.907026259758694, + "grad_norm": 2.025700330734253, + "learning_rate": 9.309609652235629e-05, + "loss": 0.06454595327377319, + "step": 48660 + }, + { + "epoch": 6.908445706174592, + "grad_norm": 5.871410846710205, + "learning_rate": 9.309467707594039e-05, + "loss": 0.09823285341262818, + "step": 48670 + }, + { + "epoch": 6.9098651525904895, + "grad_norm": 11.58802604675293, + "learning_rate": 9.309325762952448e-05, + "loss": 0.07219201326370239, + "step": 48680 + }, + { + "epoch": 6.911284599006388, + "grad_norm": 
6.74500846862793, + "learning_rate": 9.30918381831086e-05, + "loss": 0.058211779594421385, + "step": 48690 + }, + { + "epoch": 6.912704045422285, + "grad_norm": 12.418977737426758, + "learning_rate": 9.309041873669269e-05, + "loss": 0.09747650027275086, + "step": 48700 + }, + { + "epoch": 6.914123491838183, + "grad_norm": 4.093213081359863, + "learning_rate": 9.30889992902768e-05, + "loss": 0.02640637755393982, + "step": 48710 + }, + { + "epoch": 6.915542938254081, + "grad_norm": 10.278468132019043, + "learning_rate": 9.30875798438609e-05, + "loss": 0.030793681740760803, + "step": 48720 + }, + { + "epoch": 6.916962384669978, + "grad_norm": 5.775954723358154, + "learning_rate": 9.3086160397445e-05, + "loss": 0.06745225191116333, + "step": 48730 + }, + { + "epoch": 6.918381831085877, + "grad_norm": 10.210089683532715, + "learning_rate": 9.308474095102911e-05, + "loss": 0.10631564855575562, + "step": 48740 + }, + { + "epoch": 6.919801277501774, + "grad_norm": 12.40491008758545, + "learning_rate": 9.30833215046132e-05, + "loss": 0.07292826771736145, + "step": 48750 + }, + { + "epoch": 6.921220723917672, + "grad_norm": 0.05215975269675255, + "learning_rate": 9.308190205819732e-05, + "loss": 0.051340538263320926, + "step": 48760 + }, + { + "epoch": 6.92264017033357, + "grad_norm": 0.206340029835701, + "learning_rate": 9.30804826117814e-05, + "loss": 0.017073613405227662, + "step": 48770 + }, + { + "epoch": 6.924059616749467, + "grad_norm": 1.0038329362869263, + "learning_rate": 9.307906316536551e-05, + "loss": 0.07039524912834168, + "step": 48780 + }, + { + "epoch": 6.9254790631653655, + "grad_norm": 0.03913341462612152, + "learning_rate": 9.307764371894961e-05, + "loss": 0.055343860387802125, + "step": 48790 + }, + { + "epoch": 6.926898509581263, + "grad_norm": 4.186570167541504, + "learning_rate": 9.307622427253372e-05, + "loss": 0.038757961988449094, + "step": 48800 + }, + { + "epoch": 6.928317955997161, + "grad_norm": 2.0609304904937744, + "learning_rate": 
9.307480482611782e-05, + "loss": 0.04881545305252075, + "step": 48810 + }, + { + "epoch": 6.929737402413059, + "grad_norm": 8.912321090698242, + "learning_rate": 9.307338537970191e-05, + "loss": 0.07206100821495057, + "step": 48820 + }, + { + "epoch": 6.931156848828957, + "grad_norm": 0.3043152689933777, + "learning_rate": 9.307196593328602e-05, + "loss": 0.0357146680355072, + "step": 48830 + }, + { + "epoch": 6.932576295244854, + "grad_norm": 1.626572847366333, + "learning_rate": 9.307054648687012e-05, + "loss": 0.03512680530548096, + "step": 48840 + }, + { + "epoch": 6.933995741660752, + "grad_norm": 0.24631650745868683, + "learning_rate": 9.306912704045423e-05, + "loss": 0.032503852248191835, + "step": 48850 + }, + { + "epoch": 6.93541518807665, + "grad_norm": 7.898240566253662, + "learning_rate": 9.306770759403833e-05, + "loss": 0.04666814804077148, + "step": 48860 + }, + { + "epoch": 6.936834634492548, + "grad_norm": 8.17198371887207, + "learning_rate": 9.306628814762243e-05, + "loss": 0.04198618233203888, + "step": 48870 + }, + { + "epoch": 6.938254080908446, + "grad_norm": 4.493419170379639, + "learning_rate": 9.306486870120652e-05, + "loss": 0.10232670307159424, + "step": 48880 + }, + { + "epoch": 6.939673527324343, + "grad_norm": 4.874084949493408, + "learning_rate": 9.306344925479064e-05, + "loss": 0.04881571829319, + "step": 48890 + }, + { + "epoch": 6.941092973740242, + "grad_norm": 0.2118980437517166, + "learning_rate": 9.306202980837473e-05, + "loss": 0.029319232702255248, + "step": 48900 + }, + { + "epoch": 6.942512420156139, + "grad_norm": 4.677948474884033, + "learning_rate": 9.306061036195884e-05, + "loss": 0.0549718976020813, + "step": 48910 + }, + { + "epoch": 6.943931866572036, + "grad_norm": 7.681908130645752, + "learning_rate": 9.305919091554294e-05, + "loss": 0.036868888139724734, + "step": 48920 + }, + { + "epoch": 6.945351312987935, + "grad_norm": 9.071455001831055, + "learning_rate": 9.305777146912704e-05, + "loss": 0.08308836221694946, + 
"step": 48930 + }, + { + "epoch": 6.946770759403832, + "grad_norm": 4.830014228820801, + "learning_rate": 9.305635202271115e-05, + "loss": 0.03471656739711761, + "step": 48940 + }, + { + "epoch": 6.9481902058197305, + "grad_norm": 2.677654504776001, + "learning_rate": 9.305493257629525e-05, + "loss": 0.03147899806499481, + "step": 48950 + }, + { + "epoch": 6.949609652235628, + "grad_norm": 8.488234519958496, + "learning_rate": 9.305351312987936e-05, + "loss": 0.040718674659729004, + "step": 48960 + }, + { + "epoch": 6.951029098651526, + "grad_norm": 1.8252172470092773, + "learning_rate": 9.305209368346344e-05, + "loss": 0.04164456129074097, + "step": 48970 + }, + { + "epoch": 6.952448545067424, + "grad_norm": 0.5312723517417908, + "learning_rate": 9.305067423704755e-05, + "loss": 0.04740914106369019, + "step": 48980 + }, + { + "epoch": 6.953867991483321, + "grad_norm": 4.89031982421875, + "learning_rate": 9.304925479063165e-05, + "loss": 0.07554548978805542, + "step": 48990 + }, + { + "epoch": 6.955287437899219, + "grad_norm": 1.1586805582046509, + "learning_rate": 9.304783534421576e-05, + "loss": 0.036046579480171204, + "step": 49000 + }, + { + "epoch": 6.955287437899219, + "eval_accuracy": 0.9715775418070833, + "eval_loss": 0.08274991810321808, + "eval_runtime": 33.2502, + "eval_samples_per_second": 472.99, + "eval_steps_per_second": 14.797, + "step": 49000 + }, + { + "epoch": 6.956706884315117, + "grad_norm": 3.4457671642303467, + "learning_rate": 9.304641589779987e-05, + "loss": 0.0518226146697998, + "step": 49010 + }, + { + "epoch": 6.958126330731015, + "grad_norm": 1.6784181594848633, + "learning_rate": 9.304499645138397e-05, + "loss": 0.08865286707878113, + "step": 49020 + }, + { + "epoch": 6.9595457771469125, + "grad_norm": 5.46137809753418, + "learning_rate": 9.304357700496807e-05, + "loss": 0.05337139964103699, + "step": 49030 + }, + { + "epoch": 6.960965223562811, + "grad_norm": 0.19309842586517334, + "learning_rate": 9.304215755855216e-05, + "loss": 
0.08255500793457031, + "step": 49040 + }, + { + "epoch": 6.962384669978708, + "grad_norm": 6.043724536895752, + "learning_rate": 9.304073811213628e-05, + "loss": 0.02880779802799225, + "step": 49050 + }, + { + "epoch": 6.963804116394606, + "grad_norm": 7.318505764007568, + "learning_rate": 9.303931866572037e-05, + "loss": 0.02839864492416382, + "step": 49060 + }, + { + "epoch": 6.965223562810504, + "grad_norm": 0.155448779463768, + "learning_rate": 9.303789921930448e-05, + "loss": 0.05681364536285401, + "step": 49070 + }, + { + "epoch": 6.966643009226401, + "grad_norm": 6.362581253051758, + "learning_rate": 9.303647977288857e-05, + "loss": 0.053470975160598753, + "step": 49080 + }, + { + "epoch": 6.9680624556423, + "grad_norm": 0.9104495644569397, + "learning_rate": 9.303506032647268e-05, + "loss": 0.042231276631355286, + "step": 49090 + }, + { + "epoch": 6.969481902058197, + "grad_norm": 0.3613532483577728, + "learning_rate": 9.303364088005679e-05, + "loss": 0.01866423785686493, + "step": 49100 + }, + { + "epoch": 6.970901348474095, + "grad_norm": 0.20154109597206116, + "learning_rate": 9.303222143364089e-05, + "loss": 0.020688405632972716, + "step": 49110 + }, + { + "epoch": 6.972320794889993, + "grad_norm": 0.06894083321094513, + "learning_rate": 9.3030801987225e-05, + "loss": 0.023301401734352113, + "step": 49120 + }, + { + "epoch": 6.97374024130589, + "grad_norm": 0.7792617678642273, + "learning_rate": 9.302938254080908e-05, + "loss": 0.035962840914726256, + "step": 49130 + }, + { + "epoch": 6.9751596877217885, + "grad_norm": 3.8199219703674316, + "learning_rate": 9.302796309439319e-05, + "loss": 0.05565328001976013, + "step": 49140 + }, + { + "epoch": 6.976579134137686, + "grad_norm": 4.88375186920166, + "learning_rate": 9.302654364797729e-05, + "loss": 0.018589270114898682, + "step": 49150 + }, + { + "epoch": 6.977998580553584, + "grad_norm": 2.5005977153778076, + "learning_rate": 9.30251242015614e-05, + "loss": 0.052044129371643065, + "step": 49160 + }, + { 
+ "epoch": 6.979418026969482, + "grad_norm": 0.2199069857597351, + "learning_rate": 9.30237047551455e-05, + "loss": 0.06527388095855713, + "step": 49170 + }, + { + "epoch": 6.98083747338538, + "grad_norm": 5.643954277038574, + "learning_rate": 9.30222853087296e-05, + "loss": 0.03402488529682159, + "step": 49180 + }, + { + "epoch": 6.982256919801277, + "grad_norm": 9.927678108215332, + "learning_rate": 9.30208658623137e-05, + "loss": 0.08523820042610168, + "step": 49190 + }, + { + "epoch": 6.983676366217175, + "grad_norm": 10.16280460357666, + "learning_rate": 9.30194464158978e-05, + "loss": 0.07182406187057495, + "step": 49200 + }, + { + "epoch": 6.985095812633073, + "grad_norm": 8.577272415161133, + "learning_rate": 9.301802696948191e-05, + "loss": 0.08476966619491577, + "step": 49210 + }, + { + "epoch": 6.9865152590489705, + "grad_norm": 3.291269063949585, + "learning_rate": 9.301660752306601e-05, + "loss": 0.05766218900680542, + "step": 49220 + }, + { + "epoch": 6.987934705464869, + "grad_norm": 0.36031633615493774, + "learning_rate": 9.301518807665011e-05, + "loss": 0.03990486562252045, + "step": 49230 + }, + { + "epoch": 6.989354151880766, + "grad_norm": 2.6701886653900146, + "learning_rate": 9.30137686302342e-05, + "loss": 0.04657737016677856, + "step": 49240 + }, + { + "epoch": 6.990773598296665, + "grad_norm": 6.119255065917969, + "learning_rate": 9.301234918381832e-05, + "loss": 0.058797252178192136, + "step": 49250 + }, + { + "epoch": 6.992193044712562, + "grad_norm": 11.877795219421387, + "learning_rate": 9.301092973740241e-05, + "loss": 0.07917346358299256, + "step": 49260 + }, + { + "epoch": 6.99361249112846, + "grad_norm": 5.940694332122803, + "learning_rate": 9.300951029098653e-05, + "loss": 0.048585915565490724, + "step": 49270 + }, + { + "epoch": 6.995031937544358, + "grad_norm": 0.06978671997785568, + "learning_rate": 9.300809084457062e-05, + "loss": 0.038115686178207396, + "step": 49280 + }, + { + "epoch": 6.996451383960255, + "grad_norm": 
4.792726039886475, + "learning_rate": 9.300667139815472e-05, + "loss": 0.04403967261314392, + "step": 49290 + }, + { + "epoch": 6.997870830376153, + "grad_norm": 0.2641933858394623, + "learning_rate": 9.300525195173883e-05, + "loss": 0.09480243921279907, + "step": 49300 + }, + { + "epoch": 6.999290276792051, + "grad_norm": 0.3769291043281555, + "learning_rate": 9.300383250532293e-05, + "loss": 0.04385235905647278, + "step": 49310 + }, + { + "epoch": 7.000709723207949, + "grad_norm": 1.2710357904434204, + "learning_rate": 9.300241305890704e-05, + "loss": 0.04902540445327759, + "step": 49320 + }, + { + "epoch": 7.002129169623847, + "grad_norm": 3.25337815284729, + "learning_rate": 9.300099361249112e-05, + "loss": 0.06324410438537598, + "step": 49330 + }, + { + "epoch": 7.003548616039745, + "grad_norm": 3.0381252765655518, + "learning_rate": 9.299957416607523e-05, + "loss": 0.02603963911533356, + "step": 49340 + }, + { + "epoch": 7.004968062455642, + "grad_norm": 5.802962303161621, + "learning_rate": 9.299815471965933e-05, + "loss": 0.07219669818878174, + "step": 49350 + }, + { + "epoch": 7.00638750887154, + "grad_norm": 1.1525763273239136, + "learning_rate": 9.299673527324344e-05, + "loss": 0.026941490173339844, + "step": 49360 + }, + { + "epoch": 7.007806955287438, + "grad_norm": 3.169665575027466, + "learning_rate": 9.299531582682754e-05, + "loss": 0.01637158840894699, + "step": 49370 + }, + { + "epoch": 7.009226401703335, + "grad_norm": 6.44553279876709, + "learning_rate": 9.299389638041165e-05, + "loss": 0.01533316820859909, + "step": 49380 + }, + { + "epoch": 7.010645848119234, + "grad_norm": 0.5937087535858154, + "learning_rate": 9.299247693399575e-05, + "loss": 0.03281160593032837, + "step": 49390 + }, + { + "epoch": 7.012065294535131, + "grad_norm": 0.40654799342155457, + "learning_rate": 9.299105748757985e-05, + "loss": 0.05567447543144226, + "step": 49400 + }, + { + "epoch": 7.0134847409510295, + "grad_norm": 1.6091256141662598, + "learning_rate": 
9.298963804116396e-05, + "loss": 0.0387098491191864, + "step": 49410 + }, + { + "epoch": 7.014904187366927, + "grad_norm": 0.13329650461673737, + "learning_rate": 9.298821859474805e-05, + "loss": 0.07836799025535583, + "step": 49420 + }, + { + "epoch": 7.016323633782824, + "grad_norm": 5.576003074645996, + "learning_rate": 9.298679914833217e-05, + "loss": 0.08060545921325683, + "step": 49430 + }, + { + "epoch": 7.017743080198723, + "grad_norm": 0.44313862919807434, + "learning_rate": 9.298537970191625e-05, + "loss": 0.0336797297000885, + "step": 49440 + }, + { + "epoch": 7.01916252661462, + "grad_norm": 7.476051330566406, + "learning_rate": 9.298396025550036e-05, + "loss": 0.06888166666030884, + "step": 49450 + }, + { + "epoch": 7.020581973030518, + "grad_norm": 0.964314877986908, + "learning_rate": 9.298254080908446e-05, + "loss": 0.08780239224433899, + "step": 49460 + }, + { + "epoch": 7.022001419446416, + "grad_norm": 5.419530391693115, + "learning_rate": 9.298112136266857e-05, + "loss": 0.04995315372943878, + "step": 49470 + }, + { + "epoch": 7.023420865862314, + "grad_norm": 1.7097113132476807, + "learning_rate": 9.297970191625267e-05, + "loss": 0.01210094690322876, + "step": 49480 + }, + { + "epoch": 7.0248403122782115, + "grad_norm": 4.466053485870361, + "learning_rate": 9.297828246983676e-05, + "loss": 0.05786159634590149, + "step": 49490 + }, + { + "epoch": 7.026259758694109, + "grad_norm": 4.688916206359863, + "learning_rate": 9.297686302342087e-05, + "loss": 0.032867801189422605, + "step": 49500 + }, + { + "epoch": 7.026259758694109, + "eval_accuracy": 0.9810516945380555, + "eval_loss": 0.05638516694307327, + "eval_runtime": 32.4152, + "eval_samples_per_second": 485.174, + "eval_steps_per_second": 15.178, + "step": 49500 + }, + { + "epoch": 7.027679205110007, + "grad_norm": 0.7310358881950378, + "learning_rate": 9.297544357700497e-05, + "loss": 0.019714725017547608, + "step": 49510 + }, + { + "epoch": 7.029098651525905, + "grad_norm": 
0.09676510095596313, + "learning_rate": 9.297402413058908e-05, + "loss": 0.034003442525863646, + "step": 49520 + }, + { + "epoch": 7.030518097941803, + "grad_norm": 0.2067088782787323, + "learning_rate": 9.297260468417318e-05, + "loss": 0.031689074635505673, + "step": 49530 + }, + { + "epoch": 7.0319375443577, + "grad_norm": 0.0486597940325737, + "learning_rate": 9.297118523775728e-05, + "loss": 0.02177567034959793, + "step": 49540 + }, + { + "epoch": 7.033356990773599, + "grad_norm": 0.3509371876716614, + "learning_rate": 9.296976579134137e-05, + "loss": 0.032621800899505615, + "step": 49550 + }, + { + "epoch": 7.034776437189496, + "grad_norm": 1.1296992301940918, + "learning_rate": 9.296834634492549e-05, + "loss": 0.081687331199646, + "step": 49560 + }, + { + "epoch": 7.0361958836053935, + "grad_norm": 8.385297775268555, + "learning_rate": 9.296692689850958e-05, + "loss": 0.02719823718070984, + "step": 49570 + }, + { + "epoch": 7.037615330021292, + "grad_norm": 0.13415709137916565, + "learning_rate": 9.29655074520937e-05, + "loss": 0.028894478082656862, + "step": 49580 + }, + { + "epoch": 7.039034776437189, + "grad_norm": 1.7251049280166626, + "learning_rate": 9.296408800567779e-05, + "loss": 0.04444833397865296, + "step": 49590 + }, + { + "epoch": 7.0404542228530875, + "grad_norm": 0.09141341596841812, + "learning_rate": 9.296266855926189e-05, + "loss": 0.12254937887191772, + "step": 49600 + }, + { + "epoch": 7.041873669268985, + "grad_norm": 1.8580803871154785, + "learning_rate": 9.2961249112846e-05, + "loss": 0.047639891505241394, + "step": 49610 + }, + { + "epoch": 7.043293115684883, + "grad_norm": 2.467607259750366, + "learning_rate": 9.29598296664301e-05, + "loss": 0.04458892643451691, + "step": 49620 + }, + { + "epoch": 7.044712562100781, + "grad_norm": 0.6312891244888306, + "learning_rate": 9.295841022001421e-05, + "loss": 0.034400665760040285, + "step": 49630 + }, + { + "epoch": 7.046132008516678, + "grad_norm": 6.18006706237793, + "learning_rate": 
9.295699077359829e-05, + "loss": 0.04445726573467255, + "step": 49640 + }, + { + "epoch": 7.047551454932576, + "grad_norm": 6.739034652709961, + "learning_rate": 9.29555713271824e-05, + "loss": 0.10003730058670043, + "step": 49650 + }, + { + "epoch": 7.048970901348474, + "grad_norm": 2.1692278385162354, + "learning_rate": 9.29541518807665e-05, + "loss": 0.044187459349632266, + "step": 49660 + }, + { + "epoch": 7.050390347764372, + "grad_norm": 0.19314740598201752, + "learning_rate": 9.295273243435061e-05, + "loss": 0.05206958651542663, + "step": 49670 + }, + { + "epoch": 7.0518097941802695, + "grad_norm": 0.11539632081985474, + "learning_rate": 9.295131298793471e-05, + "loss": 0.007926839590072631, + "step": 49680 + }, + { + "epoch": 7.053229240596168, + "grad_norm": 0.31761738657951355, + "learning_rate": 9.29498935415188e-05, + "loss": 0.024791139364242553, + "step": 49690 + }, + { + "epoch": 7.054648687012065, + "grad_norm": 4.63431978225708, + "learning_rate": 9.294847409510292e-05, + "loss": 0.04683954417705536, + "step": 49700 + }, + { + "epoch": 7.056068133427963, + "grad_norm": 4.371407985687256, + "learning_rate": 9.294705464868701e-05, + "loss": 0.06283467411994934, + "step": 49710 + }, + { + "epoch": 7.057487579843861, + "grad_norm": 2.356304168701172, + "learning_rate": 9.294563520227112e-05, + "loss": 0.01404203027486801, + "step": 49720 + }, + { + "epoch": 7.058907026259758, + "grad_norm": 6.363495349884033, + "learning_rate": 9.294421575585522e-05, + "loss": 0.04853463172912598, + "step": 49730 + }, + { + "epoch": 7.060326472675657, + "grad_norm": 3.6162188053131104, + "learning_rate": 9.294279630943933e-05, + "loss": 0.019771024584770203, + "step": 49740 + }, + { + "epoch": 7.061745919091554, + "grad_norm": 0.28996092081069946, + "learning_rate": 9.294137686302342e-05, + "loss": 0.009965144097805023, + "step": 49750 + }, + { + "epoch": 7.063165365507452, + "grad_norm": 1.2177865505218506, + "learning_rate": 9.293995741660753e-05, + "loss": 
0.025553053617477416, + "step": 49760 + }, + { + "epoch": 7.06458481192335, + "grad_norm": 6.402510643005371, + "learning_rate": 9.293853797019163e-05, + "loss": 0.07992073893547058, + "step": 49770 + }, + { + "epoch": 7.066004258339247, + "grad_norm": 1.7435178756713867, + "learning_rate": 9.293711852377574e-05, + "loss": 0.03621697425842285, + "step": 49780 + }, + { + "epoch": 7.067423704755146, + "grad_norm": 0.5364729762077332, + "learning_rate": 9.293569907735983e-05, + "loss": 0.03418395221233368, + "step": 49790 + }, + { + "epoch": 7.068843151171043, + "grad_norm": 3.1348342895507812, + "learning_rate": 9.293427963094393e-05, + "loss": 0.025732126832008363, + "step": 49800 + }, + { + "epoch": 7.070262597586941, + "grad_norm": 1.3703564405441284, + "learning_rate": 9.293286018452804e-05, + "loss": 0.03487118184566498, + "step": 49810 + }, + { + "epoch": 7.071682044002839, + "grad_norm": 0.9203046560287476, + "learning_rate": 9.293144073811214e-05, + "loss": 0.04656402170658112, + "step": 49820 + }, + { + "epoch": 7.073101490418737, + "grad_norm": 6.010906219482422, + "learning_rate": 9.293002129169625e-05, + "loss": 0.06883003711700439, + "step": 49830 + }, + { + "epoch": 7.0745209368346345, + "grad_norm": 0.20025935769081116, + "learning_rate": 9.292860184528035e-05, + "loss": 0.022409272193908692, + "step": 49840 + }, + { + "epoch": 7.075940383250532, + "grad_norm": 0.16005253791809082, + "learning_rate": 9.292718239886444e-05, + "loss": 0.028452104330062865, + "step": 49850 + }, + { + "epoch": 7.07735982966643, + "grad_norm": 2.2057294845581055, + "learning_rate": 9.292576295244854e-05, + "loss": 0.03202352523803711, + "step": 49860 + }, + { + "epoch": 7.078779276082328, + "grad_norm": 0.6192593574523926, + "learning_rate": 9.292434350603265e-05, + "loss": 0.03418879210948944, + "step": 49870 + }, + { + "epoch": 7.080198722498226, + "grad_norm": 5.115660190582275, + "learning_rate": 9.292292405961675e-05, + "loss": 0.019133344292640686, + "step": 49880 + 
}, + { + "epoch": 7.081618168914123, + "grad_norm": 0.1792406588792801, + "learning_rate": 9.292150461320086e-05, + "loss": 0.029404124617576598, + "step": 49890 + }, + { + "epoch": 7.083037615330022, + "grad_norm": 4.115320682525635, + "learning_rate": 9.292008516678496e-05, + "loss": 0.009263063967227935, + "step": 49900 + }, + { + "epoch": 7.084457061745919, + "grad_norm": 1.0660799741744995, + "learning_rate": 9.291866572036906e-05, + "loss": 0.024234510958194733, + "step": 49910 + }, + { + "epoch": 7.0858765081618165, + "grad_norm": 0.9303340911865234, + "learning_rate": 9.291724627395317e-05, + "loss": 0.04151077270507812, + "step": 49920 + }, + { + "epoch": 7.087295954577715, + "grad_norm": 3.377434253692627, + "learning_rate": 9.291582682753726e-05, + "loss": 0.0200410395860672, + "step": 49930 + }, + { + "epoch": 7.088715400993612, + "grad_norm": 12.069241523742676, + "learning_rate": 9.291440738112138e-05, + "loss": 0.0754818320274353, + "step": 49940 + }, + { + "epoch": 7.0901348474095105, + "grad_norm": 0.038232311606407166, + "learning_rate": 9.291298793470546e-05, + "loss": 0.009347131848335266, + "step": 49950 + }, + { + "epoch": 7.091554293825408, + "grad_norm": 4.304689884185791, + "learning_rate": 9.291156848828957e-05, + "loss": 0.06845600605010986, + "step": 49960 + }, + { + "epoch": 7.092973740241306, + "grad_norm": 0.4425681233406067, + "learning_rate": 9.291014904187367e-05, + "loss": 0.0835187554359436, + "step": 49970 + }, + { + "epoch": 7.094393186657204, + "grad_norm": 3.1910290718078613, + "learning_rate": 9.290872959545778e-05, + "loss": 0.0663329541683197, + "step": 49980 + }, + { + "epoch": 7.095812633073101, + "grad_norm": 13.372017860412598, + "learning_rate": 9.290731014904188e-05, + "loss": 0.06052567958831787, + "step": 49990 + }, + { + "epoch": 7.097232079488999, + "grad_norm": 15.484184265136719, + "learning_rate": 9.290589070262597e-05, + "loss": 0.0745700716972351, + "step": 50000 + }, + { + "epoch": 7.097232079488999, + 
"eval_accuracy": 0.9763464106313983, + "eval_loss": 0.08104771375656128, + "eval_runtime": 31.7815, + "eval_samples_per_second": 494.847, + "eval_steps_per_second": 15.481, + "step": 50000 + }, + { + "epoch": 7.098651525904897, + "grad_norm": 0.3437268137931824, + "learning_rate": 9.290447125621008e-05, + "loss": 0.032279747724533084, + "step": 50010 + }, + { + "epoch": 7.100070972320795, + "grad_norm": 1.0124702453613281, + "learning_rate": 9.290305180979418e-05, + "loss": 0.04395381212234497, + "step": 50020 + }, + { + "epoch": 7.1014904187366925, + "grad_norm": 0.9249765276908875, + "learning_rate": 9.290163236337829e-05, + "loss": 0.09170477390289307, + "step": 50030 + }, + { + "epoch": 7.102909865152591, + "grad_norm": 0.7250400185585022, + "learning_rate": 9.290021291696239e-05, + "loss": 0.009999457001686095, + "step": 50040 + }, + { + "epoch": 7.104329311568488, + "grad_norm": 5.4864044189453125, + "learning_rate": 9.28987934705465e-05, + "loss": 0.026081347465515138, + "step": 50050 + }, + { + "epoch": 7.105748757984386, + "grad_norm": 6.135810375213623, + "learning_rate": 9.289737402413058e-05, + "loss": 0.04017038345336914, + "step": 50060 + }, + { + "epoch": 7.107168204400284, + "grad_norm": 0.10332240909337997, + "learning_rate": 9.28959545777147e-05, + "loss": 0.028695687651634216, + "step": 50070 + }, + { + "epoch": 7.108587650816181, + "grad_norm": 0.26266011595726013, + "learning_rate": 9.289453513129879e-05, + "loss": 0.018574948608875274, + "step": 50080 + }, + { + "epoch": 7.11000709723208, + "grad_norm": 7.670289993286133, + "learning_rate": 9.28931156848829e-05, + "loss": 0.022504398226737977, + "step": 50090 + }, + { + "epoch": 7.111426543647977, + "grad_norm": 0.9531182050704956, + "learning_rate": 9.2891696238467e-05, + "loss": 0.0202168732881546, + "step": 50100 + }, + { + "epoch": 7.112845990063875, + "grad_norm": 1.2940996885299683, + "learning_rate": 9.28902767920511e-05, + "loss": 0.014882153272628785, + "step": 50110 + }, + { + 
"epoch": 7.114265436479773, + "grad_norm": 0.21062831580638885, + "learning_rate": 9.288885734563521e-05, + "loss": 0.01849692016839981, + "step": 50120 + }, + { + "epoch": 7.115684882895671, + "grad_norm": 0.2405148595571518, + "learning_rate": 9.28874378992193e-05, + "loss": 0.029791396856307984, + "step": 50130 + }, + { + "epoch": 7.1171043293115686, + "grad_norm": 0.14352326095104218, + "learning_rate": 9.288601845280342e-05, + "loss": 0.061136239767074586, + "step": 50140 + }, + { + "epoch": 7.118523775727466, + "grad_norm": 9.354962348937988, + "learning_rate": 9.288459900638752e-05, + "loss": 0.04090102016925812, + "step": 50150 + }, + { + "epoch": 7.119943222143364, + "grad_norm": 0.7026478052139282, + "learning_rate": 9.288317955997161e-05, + "loss": 0.009190419316291809, + "step": 50160 + }, + { + "epoch": 7.121362668559262, + "grad_norm": 0.5306406617164612, + "learning_rate": 9.288176011355571e-05, + "loss": 0.05512397289276123, + "step": 50170 + }, + { + "epoch": 7.12278211497516, + "grad_norm": 1.5007140636444092, + "learning_rate": 9.288034066713982e-05, + "loss": 0.059499156475067136, + "step": 50180 + }, + { + "epoch": 7.124201561391057, + "grad_norm": 2.3778538703918457, + "learning_rate": 9.287892122072392e-05, + "loss": 0.010339123010635377, + "step": 50190 + }, + { + "epoch": 7.125621007806956, + "grad_norm": 2.3378491401672363, + "learning_rate": 9.287750177430803e-05, + "loss": 0.027239561080932617, + "step": 50200 + }, + { + "epoch": 7.127040454222853, + "grad_norm": 5.091325283050537, + "learning_rate": 9.287608232789213e-05, + "loss": 0.04783933460712433, + "step": 50210 + }, + { + "epoch": 7.128459900638751, + "grad_norm": 6.505466461181641, + "learning_rate": 9.287466288147622e-05, + "loss": 0.04781743884086609, + "step": 50220 + }, + { + "epoch": 7.129879347054649, + "grad_norm": 5.033595561981201, + "learning_rate": 9.287324343506033e-05, + "loss": 0.0876664400100708, + "step": 50230 + }, + { + "epoch": 7.131298793470546, + 
"grad_norm": 0.27636995911598206, + "learning_rate": 9.287182398864443e-05, + "loss": 0.03849413990974426, + "step": 50240 + }, + { + "epoch": 7.132718239886445, + "grad_norm": 8.013169288635254, + "learning_rate": 9.287040454222854e-05, + "loss": 0.06555190682411194, + "step": 50250 + }, + { + "epoch": 7.134137686302342, + "grad_norm": 1.1230844259262085, + "learning_rate": 9.286898509581263e-05, + "loss": 0.06196191906929016, + "step": 50260 + }, + { + "epoch": 7.13555713271824, + "grad_norm": 1.3510446548461914, + "learning_rate": 9.286756564939674e-05, + "loss": 0.058738571405410764, + "step": 50270 + }, + { + "epoch": 7.136976579134138, + "grad_norm": 1.6742855310440063, + "learning_rate": 9.286614620298084e-05, + "loss": 0.06358702182769775, + "step": 50280 + }, + { + "epoch": 7.138396025550035, + "grad_norm": 5.48180627822876, + "learning_rate": 9.286472675656495e-05, + "loss": 0.051518088579177855, + "step": 50290 + }, + { + "epoch": 7.1398154719659335, + "grad_norm": 0.646102249622345, + "learning_rate": 9.286330731014906e-05, + "loss": 0.019508914649486543, + "step": 50300 + }, + { + "epoch": 7.141234918381831, + "grad_norm": 4.530608177185059, + "learning_rate": 9.286188786373314e-05, + "loss": 0.038636896014213565, + "step": 50310 + }, + { + "epoch": 7.142654364797729, + "grad_norm": 0.1688341349363327, + "learning_rate": 9.286046841731725e-05, + "loss": 0.020194944739341737, + "step": 50320 + }, + { + "epoch": 7.144073811213627, + "grad_norm": 0.30358272790908813, + "learning_rate": 9.285904897090135e-05, + "loss": 0.01204570233821869, + "step": 50330 + }, + { + "epoch": 7.145493257629525, + "grad_norm": 0.09064985811710358, + "learning_rate": 9.285762952448546e-05, + "loss": 0.04475564062595368, + "step": 50340 + }, + { + "epoch": 7.146912704045422, + "grad_norm": 3.235868453979492, + "learning_rate": 9.285621007806956e-05, + "loss": 0.016041412949562073, + "step": 50350 + }, + { + "epoch": 7.14833215046132, + "grad_norm": 0.30754321813583374, + 
"learning_rate": 9.285479063165365e-05, + "loss": 0.034186741709709166, + "step": 50360 + }, + { + "epoch": 7.149751596877218, + "grad_norm": 1.8013185262680054, + "learning_rate": 9.285337118523775e-05, + "loss": 0.037149444222450256, + "step": 50370 + }, + { + "epoch": 7.1511710432931155, + "grad_norm": 3.8225793838500977, + "learning_rate": 9.285195173882186e-05, + "loss": 0.026364266872406006, + "step": 50380 + }, + { + "epoch": 7.152590489709014, + "grad_norm": 1.6880011558532715, + "learning_rate": 9.285053229240597e-05, + "loss": 0.04398062229156494, + "step": 50390 + }, + { + "epoch": 7.154009936124911, + "grad_norm": 0.6532132029533386, + "learning_rate": 9.284911284599007e-05, + "loss": 0.04088074564933777, + "step": 50400 + }, + { + "epoch": 7.1554293825408095, + "grad_norm": 0.6275845170021057, + "learning_rate": 9.284769339957418e-05, + "loss": 0.03311673402786255, + "step": 50410 + }, + { + "epoch": 7.156848828956707, + "grad_norm": 8.379541397094727, + "learning_rate": 9.284627395315827e-05, + "loss": 0.06421371102333069, + "step": 50420 + }, + { + "epoch": 7.158268275372604, + "grad_norm": 0.3997638523578644, + "learning_rate": 9.284485450674238e-05, + "loss": 0.03623417913913727, + "step": 50430 + }, + { + "epoch": 7.159687721788503, + "grad_norm": 3.67484450340271, + "learning_rate": 9.284343506032647e-05, + "loss": 0.07285212278366089, + "step": 50440 + }, + { + "epoch": 7.1611071682044, + "grad_norm": 10.62584400177002, + "learning_rate": 9.284201561391059e-05, + "loss": 0.05084116458892822, + "step": 50450 + }, + { + "epoch": 7.162526614620298, + "grad_norm": 4.304548740386963, + "learning_rate": 9.284059616749468e-05, + "loss": 0.05473562479019165, + "step": 50460 + }, + { + "epoch": 7.163946061036196, + "grad_norm": 0.5049075484275818, + "learning_rate": 9.283917672107878e-05, + "loss": 0.03764554262161255, + "step": 50470 + }, + { + "epoch": 7.165365507452094, + "grad_norm": 7.426001071929932, + "learning_rate": 9.283775727466288e-05, + 
"loss": 0.07254430651664734, + "step": 50480 + }, + { + "epoch": 7.1667849538679915, + "grad_norm": 7.990915775299072, + "learning_rate": 9.283633782824699e-05, + "loss": 0.07232113480567932, + "step": 50490 + }, + { + "epoch": 7.168204400283889, + "grad_norm": 6.232173919677734, + "learning_rate": 9.28349183818311e-05, + "loss": 0.057746291160583496, + "step": 50500 + }, + { + "epoch": 7.168204400283889, + "eval_accuracy": 0.9734850893368093, + "eval_loss": 0.08124034851789474, + "eval_runtime": 32.7664, + "eval_samples_per_second": 479.973, + "eval_steps_per_second": 15.015, + "step": 50500 + }, + { + "epoch": 7.169623846699787, + "grad_norm": 1.4507334232330322, + "learning_rate": 9.28334989354152e-05, + "loss": 0.02919427752494812, + "step": 50510 + }, + { + "epoch": 7.171043293115685, + "grad_norm": 5.406381607055664, + "learning_rate": 9.28320794889993e-05, + "loss": 0.06213176846504211, + "step": 50520 + }, + { + "epoch": 7.172462739531583, + "grad_norm": 0.06627151370048523, + "learning_rate": 9.283066004258339e-05, + "loss": 0.020755791664123537, + "step": 50530 + }, + { + "epoch": 7.17388218594748, + "grad_norm": 6.034979820251465, + "learning_rate": 9.28292405961675e-05, + "loss": 0.04492558836936951, + "step": 50540 + }, + { + "epoch": 7.175301632363379, + "grad_norm": 5.751996994018555, + "learning_rate": 9.28278211497516e-05, + "loss": 0.03700354397296905, + "step": 50550 + }, + { + "epoch": 7.176721078779276, + "grad_norm": 4.298292636871338, + "learning_rate": 9.282640170333571e-05, + "loss": 0.0337158739566803, + "step": 50560 + }, + { + "epoch": 7.1781405251951735, + "grad_norm": 0.4003825783729553, + "learning_rate": 9.28249822569198e-05, + "loss": 0.04332021772861481, + "step": 50570 + }, + { + "epoch": 7.179559971611072, + "grad_norm": 5.302791595458984, + "learning_rate": 9.28235628105039e-05, + "loss": 0.02911527454853058, + "step": 50580 + }, + { + "epoch": 7.180979418026969, + "grad_norm": 6.431861400604248, + "learning_rate": 
9.282214336408802e-05, + "loss": 0.05217592716217041, + "step": 50590 + }, + { + "epoch": 7.182398864442868, + "grad_norm": 0.46359333395957947, + "learning_rate": 9.28208658623137e-05, + "loss": 0.10249303579330445, + "step": 50600 + }, + { + "epoch": 7.183818310858765, + "grad_norm": 0.7804433703422546, + "learning_rate": 9.28194464158978e-05, + "loss": 0.031027427315711974, + "step": 50610 + }, + { + "epoch": 7.185237757274663, + "grad_norm": 0.3068195581436157, + "learning_rate": 9.281802696948191e-05, + "loss": 0.050842708349227904, + "step": 50620 + }, + { + "epoch": 7.186657203690561, + "grad_norm": 0.4566839337348938, + "learning_rate": 9.281660752306601e-05, + "loss": 0.05524548292160034, + "step": 50630 + }, + { + "epoch": 7.188076650106458, + "grad_norm": 12.260385513305664, + "learning_rate": 9.28151880766501e-05, + "loss": 0.07461308240890503, + "step": 50640 + }, + { + "epoch": 7.189496096522356, + "grad_norm": 8.496342658996582, + "learning_rate": 9.281376863023422e-05, + "loss": 0.03306256532669068, + "step": 50650 + }, + { + "epoch": 7.190915542938254, + "grad_norm": 0.06557629257440567, + "learning_rate": 9.281234918381831e-05, + "loss": 0.023129934072494508, + "step": 50660 + }, + { + "epoch": 7.192334989354152, + "grad_norm": 0.11483972519636154, + "learning_rate": 9.281092973740242e-05, + "loss": 0.039566820859909056, + "step": 50670 + }, + { + "epoch": 7.19375443577005, + "grad_norm": 0.21403329074382782, + "learning_rate": 9.280951029098652e-05, + "loss": 0.026081389188766478, + "step": 50680 + }, + { + "epoch": 7.195173882185948, + "grad_norm": 0.513241171836853, + "learning_rate": 9.280809084457062e-05, + "loss": 0.07000666260719299, + "step": 50690 + }, + { + "epoch": 7.196593328601845, + "grad_norm": 0.19217133522033691, + "learning_rate": 9.280667139815472e-05, + "loss": 0.025768482685089113, + "step": 50700 + }, + { + "epoch": 7.198012775017743, + "grad_norm": 10.442272186279297, + "learning_rate": 9.280525195173883e-05, + "loss": 
0.049917465448379515, + "step": 50710 + }, + { + "epoch": 7.199432221433641, + "grad_norm": 4.785938262939453, + "learning_rate": 9.280383250532292e-05, + "loss": 0.05000673532485962, + "step": 50720 + }, + { + "epoch": 7.2008516678495385, + "grad_norm": 0.14389902353286743, + "learning_rate": 9.280241305890704e-05, + "loss": 0.06158308386802673, + "step": 50730 + }, + { + "epoch": 7.202271114265437, + "grad_norm": 0.6582837104797363, + "learning_rate": 9.280099361249113e-05, + "loss": 0.007644937932491302, + "step": 50740 + }, + { + "epoch": 7.203690560681334, + "grad_norm": 6.236335277557373, + "learning_rate": 9.279957416607523e-05, + "loss": 0.0622941255569458, + "step": 50750 + }, + { + "epoch": 7.2051100070972325, + "grad_norm": 4.699357509613037, + "learning_rate": 9.279815471965934e-05, + "loss": 0.10394492149353027, + "step": 50760 + }, + { + "epoch": 7.20652945351313, + "grad_norm": 2.788632392883301, + "learning_rate": 9.279673527324344e-05, + "loss": 0.05861709117889404, + "step": 50770 + }, + { + "epoch": 7.207948899929027, + "grad_norm": 7.0571513175964355, + "learning_rate": 9.279531582682755e-05, + "loss": 0.09752376079559326, + "step": 50780 + }, + { + "epoch": 7.209368346344926, + "grad_norm": 1.116761565208435, + "learning_rate": 9.279389638041165e-05, + "loss": 0.07887290120124817, + "step": 50790 + }, + { + "epoch": 7.210787792760823, + "grad_norm": 2.5623505115509033, + "learning_rate": 9.279247693399574e-05, + "loss": 0.06459965705871581, + "step": 50800 + }, + { + "epoch": 7.212207239176721, + "grad_norm": 0.9416432976722717, + "learning_rate": 9.279105748757984e-05, + "loss": 0.02110120952129364, + "step": 50810 + }, + { + "epoch": 7.213626685592619, + "grad_norm": 0.32998695969581604, + "learning_rate": 9.278963804116395e-05, + "loss": 0.054609715938568115, + "step": 50820 + }, + { + "epoch": 7.215046132008517, + "grad_norm": 0.8758980631828308, + "learning_rate": 9.278821859474805e-05, + "loss": 0.03850732147693634, + "step": 50830 + }, + 
{ + "epoch": 7.2164655784244145, + "grad_norm": 4.321909427642822, + "learning_rate": 9.278679914833216e-05, + "loss": 0.01589832454919815, + "step": 50840 + }, + { + "epoch": 7.217885024840312, + "grad_norm": 0.6561715602874756, + "learning_rate": 9.278537970191626e-05, + "loss": 0.04444275200366974, + "step": 50850 + }, + { + "epoch": 7.21930447125621, + "grad_norm": 0.4713881015777588, + "learning_rate": 9.278396025550036e-05, + "loss": 0.05532388091087341, + "step": 50860 + }, + { + "epoch": 7.220723917672108, + "grad_norm": 3.666248083114624, + "learning_rate": 9.278254080908447e-05, + "loss": 0.03535724282264709, + "step": 50870 + }, + { + "epoch": 7.222143364088006, + "grad_norm": 7.525345802307129, + "learning_rate": 9.278112136266856e-05, + "loss": 0.062016028165817264, + "step": 50880 + }, + { + "epoch": 7.223562810503903, + "grad_norm": 4.87885046005249, + "learning_rate": 9.277970191625267e-05, + "loss": 0.031429398059844973, + "step": 50890 + }, + { + "epoch": 7.224982256919802, + "grad_norm": 0.5300387144088745, + "learning_rate": 9.277828246983676e-05, + "loss": 0.025107333064079286, + "step": 50900 + }, + { + "epoch": 7.226401703335699, + "grad_norm": 2.4232094287872314, + "learning_rate": 9.277686302342087e-05, + "loss": 0.043664786219596866, + "step": 50910 + }, + { + "epoch": 7.2278211497515965, + "grad_norm": 1.0089149475097656, + "learning_rate": 9.277544357700497e-05, + "loss": 0.013369449973106384, + "step": 50920 + }, + { + "epoch": 7.229240596167495, + "grad_norm": 3.6185240745544434, + "learning_rate": 9.277402413058908e-05, + "loss": 0.04417063593864441, + "step": 50930 + }, + { + "epoch": 7.230660042583392, + "grad_norm": 1.650692343711853, + "learning_rate": 9.277260468417317e-05, + "loss": 0.02690485417842865, + "step": 50940 + }, + { + "epoch": 7.2320794889992905, + "grad_norm": 0.08742735534906387, + "learning_rate": 9.277118523775727e-05, + "loss": 0.06844819188117982, + "step": 50950 + }, + { + "epoch": 7.233498935415188, + 
"grad_norm": 0.1685476005077362, + "learning_rate": 9.276976579134138e-05, + "loss": 0.07185621857643128, + "step": 50960 + }, + { + "epoch": 7.234918381831086, + "grad_norm": 0.1583138257265091, + "learning_rate": 9.276834634492548e-05, + "loss": 0.019683878123760223, + "step": 50970 + }, + { + "epoch": 7.236337828246984, + "grad_norm": 1.6047672033309937, + "learning_rate": 9.276692689850959e-05, + "loss": 0.038810908794403076, + "step": 50980 + }, + { + "epoch": 7.237757274662881, + "grad_norm": 1.8266907930374146, + "learning_rate": 9.276550745209369e-05, + "loss": 0.04252366423606872, + "step": 50990 + }, + { + "epoch": 7.239176721078779, + "grad_norm": 3.5572457313537598, + "learning_rate": 9.276408800567779e-05, + "loss": 0.032119011878967284, + "step": 51000 + }, + { + "epoch": 7.239176721078779, + "eval_accuracy": 0.9795256565142748, + "eval_loss": 0.058488957583904266, + "eval_runtime": 33.388, + "eval_samples_per_second": 471.038, + "eval_steps_per_second": 14.736, + "step": 51000 + }, + { + "epoch": 7.240596167494677, + "grad_norm": 0.8323566317558289, + "learning_rate": 9.276266855926188e-05, + "loss": 0.03890877366065979, + "step": 51010 + }, + { + "epoch": 7.242015613910575, + "grad_norm": 5.316452980041504, + "learning_rate": 9.2761249112846e-05, + "loss": 0.045993471145629884, + "step": 51020 + }, + { + "epoch": 7.2434350603264726, + "grad_norm": 0.26393166184425354, + "learning_rate": 9.275982966643009e-05, + "loss": 0.051994693279266355, + "step": 51030 + }, + { + "epoch": 7.244854506742371, + "grad_norm": 0.12484807521104813, + "learning_rate": 9.27584102200142e-05, + "loss": 0.021381711959838866, + "step": 51040 + }, + { + "epoch": 7.246273953158268, + "grad_norm": 1.613708257675171, + "learning_rate": 9.27569907735983e-05, + "loss": 0.06344324350357056, + "step": 51050 + }, + { + "epoch": 7.247693399574166, + "grad_norm": 0.032920923084020615, + "learning_rate": 9.27555713271824e-05, + "loss": 0.06143348813056946, + "step": 51060 + }, + { + 
"epoch": 7.249112845990064, + "grad_norm": 7.846259593963623, + "learning_rate": 9.275415188076651e-05, + "loss": 0.027347713708877563, + "step": 51070 + }, + { + "epoch": 7.250532292405961, + "grad_norm": 16.31854248046875, + "learning_rate": 9.27527324343506e-05, + "loss": 0.03906906247138977, + "step": 51080 + }, + { + "epoch": 7.25195173882186, + "grad_norm": 1.4159921407699585, + "learning_rate": 9.275131298793472e-05, + "loss": 0.02941213846206665, + "step": 51090 + }, + { + "epoch": 7.253371185237757, + "grad_norm": 7.948617935180664, + "learning_rate": 9.274989354151881e-05, + "loss": 0.05467774271965027, + "step": 51100 + }, + { + "epoch": 7.2547906316536555, + "grad_norm": 0.22301502525806427, + "learning_rate": 9.274847409510291e-05, + "loss": 0.057220518589019775, + "step": 51110 + }, + { + "epoch": 7.256210078069553, + "grad_norm": 3.7436046600341797, + "learning_rate": 9.274705464868701e-05, + "loss": 0.07023456692695618, + "step": 51120 + }, + { + "epoch": 7.25762952448545, + "grad_norm": 0.8607221245765686, + "learning_rate": 9.274563520227112e-05, + "loss": 0.028274688124656677, + "step": 51130 + }, + { + "epoch": 7.259048970901349, + "grad_norm": 2.3814191818237305, + "learning_rate": 9.274421575585522e-05, + "loss": 0.02873871922492981, + "step": 51140 + }, + { + "epoch": 7.260468417317246, + "grad_norm": 0.7200617790222168, + "learning_rate": 9.274279630943933e-05, + "loss": 0.023874977231025697, + "step": 51150 + }, + { + "epoch": 7.261887863733144, + "grad_norm": 6.621379375457764, + "learning_rate": 9.274137686302343e-05, + "loss": 0.036438983678817746, + "step": 51160 + }, + { + "epoch": 7.263307310149042, + "grad_norm": 1.800775170326233, + "learning_rate": 9.273995741660752e-05, + "loss": 0.023814010620117187, + "step": 51170 + }, + { + "epoch": 7.26472675656494, + "grad_norm": 1.1323634386062622, + "learning_rate": 9.273853797019163e-05, + "loss": 0.03467157781124115, + "step": 51180 + }, + { + "epoch": 7.2661462029808375, + "grad_norm": 
2.3260347843170166, + "learning_rate": 9.273711852377573e-05, + "loss": 0.03501830995082855, + "step": 51190 + }, + { + "epoch": 7.267565649396735, + "grad_norm": 0.3461279571056366, + "learning_rate": 9.273569907735984e-05, + "loss": 0.02026255279779434, + "step": 51200 + }, + { + "epoch": 7.268985095812633, + "grad_norm": 8.559354782104492, + "learning_rate": 9.273427963094393e-05, + "loss": 0.0435549259185791, + "step": 51210 + }, + { + "epoch": 7.270404542228531, + "grad_norm": 5.178162574768066, + "learning_rate": 9.273286018452804e-05, + "loss": 0.03414537012577057, + "step": 51220 + }, + { + "epoch": 7.271823988644429, + "grad_norm": 4.830866813659668, + "learning_rate": 9.273144073811213e-05, + "loss": 0.0681527316570282, + "step": 51230 + }, + { + "epoch": 7.273243435060326, + "grad_norm": 1.7771570682525635, + "learning_rate": 9.273002129169625e-05, + "loss": 0.0468337744474411, + "step": 51240 + }, + { + "epoch": 7.274662881476225, + "grad_norm": 0.07269584387540817, + "learning_rate": 9.272860184528036e-05, + "loss": 0.02570006251335144, + "step": 51250 + }, + { + "epoch": 7.276082327892122, + "grad_norm": 5.584953308105469, + "learning_rate": 9.272718239886444e-05, + "loss": 0.06317769289016724, + "step": 51260 + }, + { + "epoch": 7.2775017743080195, + "grad_norm": 5.165135383605957, + "learning_rate": 9.272576295244855e-05, + "loss": 0.039837440848350524, + "step": 51270 + }, + { + "epoch": 7.278921220723918, + "grad_norm": 2.866654634475708, + "learning_rate": 9.272434350603265e-05, + "loss": 0.03418941795825958, + "step": 51280 + }, + { + "epoch": 7.280340667139815, + "grad_norm": 7.37003755569458, + "learning_rate": 9.272292405961676e-05, + "loss": 0.09569458365440368, + "step": 51290 + }, + { + "epoch": 7.2817601135557135, + "grad_norm": 0.13972480595111847, + "learning_rate": 9.272150461320086e-05, + "loss": 0.043325915932655334, + "step": 51300 + }, + { + "epoch": 7.283179559971611, + "grad_norm": 6.135573387145996, + "learning_rate": 
9.272008516678495e-05, + "loss": 0.04155711829662323, + "step": 51310 + }, + { + "epoch": 7.284599006387509, + "grad_norm": 0.10803493857383728, + "learning_rate": 9.271866572036905e-05, + "loss": 0.06946607828140258, + "step": 51320 + }, + { + "epoch": 7.286018452803407, + "grad_norm": 0.6979092955589294, + "learning_rate": 9.271724627395316e-05, + "loss": 0.03188393712043762, + "step": 51330 + }, + { + "epoch": 7.287437899219304, + "grad_norm": 2.248213291168213, + "learning_rate": 9.271582682753727e-05, + "loss": 0.057016730308532715, + "step": 51340 + }, + { + "epoch": 7.288857345635202, + "grad_norm": 0.8164470791816711, + "learning_rate": 9.271440738112137e-05, + "loss": 0.0411460280418396, + "step": 51350 + }, + { + "epoch": 7.2902767920511, + "grad_norm": 1.162987232208252, + "learning_rate": 9.271298793470547e-05, + "loss": 0.06204650998115539, + "step": 51360 + }, + { + "epoch": 7.291696238466998, + "grad_norm": 0.28414785861968994, + "learning_rate": 9.271156848828957e-05, + "loss": 0.07645946145057678, + "step": 51370 + }, + { + "epoch": 7.2931156848828955, + "grad_norm": 4.3621506690979, + "learning_rate": 9.271014904187368e-05, + "loss": 0.06504054665565491, + "step": 51380 + }, + { + "epoch": 7.294535131298794, + "grad_norm": 1.3629231452941895, + "learning_rate": 9.270872959545777e-05, + "loss": 0.05250757336616516, + "step": 51390 + }, + { + "epoch": 7.295954577714691, + "grad_norm": 0.1615055948495865, + "learning_rate": 9.270731014904188e-05, + "loss": 0.06251652240753174, + "step": 51400 + }, + { + "epoch": 7.297374024130589, + "grad_norm": 0.4495696723461151, + "learning_rate": 9.270589070262597e-05, + "loss": 0.02343766689300537, + "step": 51410 + }, + { + "epoch": 7.298793470546487, + "grad_norm": 0.5210222005844116, + "learning_rate": 9.270447125621008e-05, + "loss": 0.02886645793914795, + "step": 51420 + }, + { + "epoch": 7.300212916962384, + "grad_norm": 1.460491418838501, + "learning_rate": 9.270305180979419e-05, + "loss": 
0.05959440469741821, + "step": 51430 + }, + { + "epoch": 7.301632363378283, + "grad_norm": 8.31848430633545, + "learning_rate": 9.270163236337829e-05, + "loss": 0.027948886156082153, + "step": 51440 + }, + { + "epoch": 7.30305180979418, + "grad_norm": 0.22754132747650146, + "learning_rate": 9.27002129169624e-05, + "loss": 0.0412900447845459, + "step": 51450 + }, + { + "epoch": 7.304471256210078, + "grad_norm": 1.9300298690795898, + "learning_rate": 9.26987934705465e-05, + "loss": 0.07148487567901611, + "step": 51460 + }, + { + "epoch": 7.305890702625976, + "grad_norm": 3.128782272338867, + "learning_rate": 9.269737402413059e-05, + "loss": 0.035791015625, + "step": 51470 + }, + { + "epoch": 7.307310149041873, + "grad_norm": 2.424389600753784, + "learning_rate": 9.269595457771469e-05, + "loss": 0.04335363209247589, + "step": 51480 + }, + { + "epoch": 7.308729595457772, + "grad_norm": 6.377645969390869, + "learning_rate": 9.26945351312988e-05, + "loss": 0.027349942922592164, + "step": 51490 + }, + { + "epoch": 7.310149041873669, + "grad_norm": 2.207902193069458, + "learning_rate": 9.26931156848829e-05, + "loss": 0.0474819153547287, + "step": 51500 + }, + { + "epoch": 7.310149041873669, + "eval_accuracy": 0.9765371653843709, + "eval_loss": 0.07471198588609695, + "eval_runtime": 33.8232, + "eval_samples_per_second": 464.976, + "eval_steps_per_second": 14.546, + "step": 51500 + }, + { + "epoch": 7.311568488289567, + "grad_norm": 1.7475389242172241, + "learning_rate": 9.269169623846701e-05, + "loss": 0.032795050740242006, + "step": 51510 + }, + { + "epoch": 7.312987934705465, + "grad_norm": 0.86590576171875, + "learning_rate": 9.269027679205111e-05, + "loss": 0.023722925782203676, + "step": 51520 + }, + { + "epoch": 7.314407381121363, + "grad_norm": 2.6370997428894043, + "learning_rate": 9.26888573456352e-05, + "loss": 0.05528920292854309, + "step": 51530 + }, + { + "epoch": 7.31582682753726, + "grad_norm": 0.4650154411792755, + "learning_rate": 9.268743789921932e-05, + 
"loss": 0.0307805597782135, + "step": 51540 + }, + { + "epoch": 7.317246273953158, + "grad_norm": 0.9520929455757141, + "learning_rate": 9.268601845280341e-05, + "loss": 0.0699259340763092, + "step": 51550 + }, + { + "epoch": 7.318665720369056, + "grad_norm": 0.24121342599391937, + "learning_rate": 9.268459900638752e-05, + "loss": 0.024345090985298155, + "step": 51560 + }, + { + "epoch": 7.320085166784954, + "grad_norm": 3.9583966732025146, + "learning_rate": 9.268317955997161e-05, + "loss": 0.018575282394886018, + "step": 51570 + }, + { + "epoch": 7.321504613200852, + "grad_norm": 2.0327770709991455, + "learning_rate": 9.268176011355572e-05, + "loss": 0.04089633226394653, + "step": 51580 + }, + { + "epoch": 7.322924059616749, + "grad_norm": 9.078707695007324, + "learning_rate": 9.268034066713982e-05, + "loss": 0.041544276475906375, + "step": 51590 + }, + { + "epoch": 7.324343506032648, + "grad_norm": 1.0922521352767944, + "learning_rate": 9.267892122072393e-05, + "loss": 0.03794052600860596, + "step": 51600 + }, + { + "epoch": 7.325762952448545, + "grad_norm": 12.491849899291992, + "learning_rate": 9.267750177430802e-05, + "loss": 0.02722683846950531, + "step": 51610 + }, + { + "epoch": 7.3271823988644424, + "grad_norm": 0.5909501910209656, + "learning_rate": 9.267608232789212e-05, + "loss": 0.01713113784790039, + "step": 51620 + }, + { + "epoch": 7.328601845280341, + "grad_norm": 6.109976768493652, + "learning_rate": 9.267466288147623e-05, + "loss": 0.033209878206253055, + "step": 51630 + }, + { + "epoch": 7.330021291696238, + "grad_norm": 0.014347659423947334, + "learning_rate": 9.267324343506033e-05, + "loss": 0.03499612212181091, + "step": 51640 + }, + { + "epoch": 7.3314407381121365, + "grad_norm": 3.9987330436706543, + "learning_rate": 9.267182398864444e-05, + "loss": 0.06285910606384278, + "step": 51650 + }, + { + "epoch": 7.332860184528034, + "grad_norm": 1.168338418006897, + "learning_rate": 9.267040454222854e-05, + "loss": 0.05484226942062378, + "step": 
51660 + }, + { + "epoch": 7.334279630943932, + "grad_norm": 0.682669997215271, + "learning_rate": 9.266898509581264e-05, + "loss": 0.07656214237213135, + "step": 51670 + }, + { + "epoch": 7.33569907735983, + "grad_norm": 0.18298189342021942, + "learning_rate": 9.266756564939673e-05, + "loss": 0.01965651214122772, + "step": 51680 + }, + { + "epoch": 7.337118523775727, + "grad_norm": 0.39247646927833557, + "learning_rate": 9.266614620298084e-05, + "loss": 0.04974568784236908, + "step": 51690 + }, + { + "epoch": 7.338537970191625, + "grad_norm": 1.2671245336532593, + "learning_rate": 9.266472675656494e-05, + "loss": 0.03542390167713165, + "step": 51700 + }, + { + "epoch": 7.339957416607523, + "grad_norm": 11.112316131591797, + "learning_rate": 9.266330731014905e-05, + "loss": 0.04434594511985779, + "step": 51710 + }, + { + "epoch": 7.341376863023421, + "grad_norm": 1.1118406057357788, + "learning_rate": 9.266188786373315e-05, + "loss": 0.02555614709854126, + "step": 51720 + }, + { + "epoch": 7.3427963094393185, + "grad_norm": 3.3655858039855957, + "learning_rate": 9.266046841731725e-05, + "loss": 0.014670975506305695, + "step": 51730 + }, + { + "epoch": 7.344215755855217, + "grad_norm": 4.719293117523193, + "learning_rate": 9.265904897090136e-05, + "loss": 0.042307913303375244, + "step": 51740 + }, + { + "epoch": 7.345635202271114, + "grad_norm": 0.2682708203792572, + "learning_rate": 9.265762952448546e-05, + "loss": 0.018885372579097746, + "step": 51750 + }, + { + "epoch": 7.347054648687012, + "grad_norm": 4.208601474761963, + "learning_rate": 9.265621007806957e-05, + "loss": 0.06062667965888977, + "step": 51760 + }, + { + "epoch": 7.34847409510291, + "grad_norm": 2.183436155319214, + "learning_rate": 9.265479063165365e-05, + "loss": 0.054003679752349855, + "step": 51770 + }, + { + "epoch": 7.349893541518807, + "grad_norm": 2.069380760192871, + "learning_rate": 9.265337118523776e-05, + "loss": 0.020816315710544587, + "step": 51780 + }, + { + "epoch": 
7.351312987934706, + "grad_norm": 0.5530464053153992, + "learning_rate": 9.265195173882186e-05, + "loss": 0.038864347338676455, + "step": 51790 + }, + { + "epoch": 7.352732434350603, + "grad_norm": 5.707481861114502, + "learning_rate": 9.265053229240597e-05, + "loss": 0.06501655578613282, + "step": 51800 + }, + { + "epoch": 7.354151880766501, + "grad_norm": 6.6138153076171875, + "learning_rate": 9.264911284599007e-05, + "loss": 0.055298763513565066, + "step": 51810 + }, + { + "epoch": 7.355571327182399, + "grad_norm": 1.1508212089538574, + "learning_rate": 9.264769339957418e-05, + "loss": 0.01087142527103424, + "step": 51820 + }, + { + "epoch": 7.356990773598296, + "grad_norm": 6.416627883911133, + "learning_rate": 9.264627395315827e-05, + "loss": 0.046122944355010985, + "step": 51830 + }, + { + "epoch": 7.3584102200141945, + "grad_norm": 0.3396334946155548, + "learning_rate": 9.264485450674237e-05, + "loss": 0.037940219044685364, + "step": 51840 + }, + { + "epoch": 7.359829666430092, + "grad_norm": 3.5331318378448486, + "learning_rate": 9.264343506032648e-05, + "loss": 0.04645732939243317, + "step": 51850 + }, + { + "epoch": 7.36124911284599, + "grad_norm": 0.4708951711654663, + "learning_rate": 9.264201561391058e-05, + "loss": 0.06385657787322999, + "step": 51860 + }, + { + "epoch": 7.362668559261888, + "grad_norm": 0.5442249178886414, + "learning_rate": 9.264059616749469e-05, + "loss": 0.021492500603199006, + "step": 51870 + }, + { + "epoch": 7.364088005677786, + "grad_norm": 4.089636325836182, + "learning_rate": 9.263917672107878e-05, + "loss": 0.018961572647094728, + "step": 51880 + }, + { + "epoch": 7.365507452093683, + "grad_norm": 4.164590358734131, + "learning_rate": 9.263775727466289e-05, + "loss": 0.021182073652744292, + "step": 51890 + }, + { + "epoch": 7.366926898509581, + "grad_norm": 0.1637565642595291, + "learning_rate": 9.263633782824698e-05, + "loss": 0.01856728196144104, + "step": 51900 + }, + { + "epoch": 7.368346344925479, + "grad_norm": 
7.452282428741455, + "learning_rate": 9.26349183818311e-05, + "loss": 0.049343031644821164, + "step": 51910 + }, + { + "epoch": 7.3697657913413765, + "grad_norm": 3.365506172180176, + "learning_rate": 9.263349893541519e-05, + "loss": 0.042214390635490415, + "step": 51920 + }, + { + "epoch": 7.371185237757275, + "grad_norm": 0.08252433687448502, + "learning_rate": 9.263207948899929e-05, + "loss": 0.01741803586483002, + "step": 51930 + }, + { + "epoch": 7.372604684173172, + "grad_norm": 1.889232873916626, + "learning_rate": 9.26306600425834e-05, + "loss": 0.020835280418395996, + "step": 51940 + }, + { + "epoch": 7.374024130589071, + "grad_norm": 7.806169509887695, + "learning_rate": 9.26292405961675e-05, + "loss": 0.07517807483673096, + "step": 51950 + }, + { + "epoch": 7.375443577004968, + "grad_norm": 5.743335723876953, + "learning_rate": 9.262782114975161e-05, + "loss": 0.030621072649955748, + "step": 51960 + }, + { + "epoch": 7.376863023420865, + "grad_norm": 7.148108959197998, + "learning_rate": 9.26264017033357e-05, + "loss": 0.05479738712310791, + "step": 51970 + }, + { + "epoch": 7.378282469836764, + "grad_norm": 0.13797208666801453, + "learning_rate": 9.26249822569198e-05, + "loss": 0.01111309826374054, + "step": 51980 + }, + { + "epoch": 7.379701916252661, + "grad_norm": 0.9274955987930298, + "learning_rate": 9.26235628105039e-05, + "loss": 0.019465911388397216, + "step": 51990 + }, + { + "epoch": 7.3811213626685594, + "grad_norm": 1.8465386629104614, + "learning_rate": 9.262214336408801e-05, + "loss": 0.02890079915523529, + "step": 52000 + }, + { + "epoch": 7.3811213626685594, + "eval_accuracy": 0.9748203726076174, + "eval_loss": 0.07954176515340805, + "eval_runtime": 32.7811, + "eval_samples_per_second": 479.758, + "eval_steps_per_second": 15.009, + "step": 52000 + }, + { + "epoch": 7.382540809084457, + "grad_norm": 8.025490760803223, + "learning_rate": 9.262072391767211e-05, + "loss": 0.10265064239501953, + "step": 52010 + }, + { + "epoch": 
7.383960255500355, + "grad_norm": 0.5458213686943054, + "learning_rate": 9.261930447125622e-05, + "loss": 0.02167295664548874, + "step": 52020 + }, + { + "epoch": 7.385379701916253, + "grad_norm": 1.3527144193649292, + "learning_rate": 9.261788502484032e-05, + "loss": 0.016633424162864684, + "step": 52030 + }, + { + "epoch": 7.38679914833215, + "grad_norm": 4.071643829345703, + "learning_rate": 9.261646557842441e-05, + "loss": 0.05185266137123108, + "step": 52040 + }, + { + "epoch": 7.388218594748048, + "grad_norm": 2.140740156173706, + "learning_rate": 9.261504613200853e-05, + "loss": 0.008098404109477996, + "step": 52050 + }, + { + "epoch": 7.389638041163946, + "grad_norm": 0.37996429204940796, + "learning_rate": 9.261362668559262e-05, + "loss": 0.03676808774471283, + "step": 52060 + }, + { + "epoch": 7.391057487579844, + "grad_norm": 3.958872079849243, + "learning_rate": 9.261220723917673e-05, + "loss": 0.030224177241325378, + "step": 52070 + }, + { + "epoch": 7.3924769339957415, + "grad_norm": 2.3154561519622803, + "learning_rate": 9.261078779276082e-05, + "loss": 0.0440337210893631, + "step": 52080 + }, + { + "epoch": 7.39389638041164, + "grad_norm": 10.971945762634277, + "learning_rate": 9.260936834634493e-05, + "loss": 0.08902759552001953, + "step": 52090 + }, + { + "epoch": 7.395315826827537, + "grad_norm": 3.9263570308685303, + "learning_rate": 9.260794889992903e-05, + "loss": 0.07159250974655151, + "step": 52100 + }, + { + "epoch": 7.396735273243435, + "grad_norm": 7.3109049797058105, + "learning_rate": 9.260652945351314e-05, + "loss": 0.0619121789932251, + "step": 52110 + }, + { + "epoch": 7.398154719659333, + "grad_norm": 2.75917911529541, + "learning_rate": 9.260511000709723e-05, + "loss": 0.05869649052619934, + "step": 52120 + }, + { + "epoch": 7.39957416607523, + "grad_norm": 2.691560983657837, + "learning_rate": 9.260369056068133e-05, + "loss": 0.07508642673492431, + "step": 52130 + }, + { + "epoch": 7.400993612491129, + "grad_norm": 
0.7861066460609436, + "learning_rate": 9.260227111426544e-05, + "loss": 0.08113704323768615, + "step": 52140 + }, + { + "epoch": 7.402413058907026, + "grad_norm": 11.484288215637207, + "learning_rate": 9.260085166784954e-05, + "loss": 0.04761863350868225, + "step": 52150 + }, + { + "epoch": 7.403832505322924, + "grad_norm": 2.5290586948394775, + "learning_rate": 9.259943222143365e-05, + "loss": 0.053852963447570804, + "step": 52160 + }, + { + "epoch": 7.405251951738822, + "grad_norm": 7.964162826538086, + "learning_rate": 9.259801277501775e-05, + "loss": 0.03996648192405701, + "step": 52170 + }, + { + "epoch": 7.406671398154719, + "grad_norm": 0.3172842562198639, + "learning_rate": 9.259659332860186e-05, + "loss": 0.03223178386688232, + "step": 52180 + }, + { + "epoch": 7.4080908445706175, + "grad_norm": 1.804693341255188, + "learning_rate": 9.259517388218594e-05, + "loss": 0.04552800059318542, + "step": 52190 + }, + { + "epoch": 7.409510290986515, + "grad_norm": 4.020263195037842, + "learning_rate": 9.259375443577005e-05, + "loss": 0.10396513938903809, + "step": 52200 + }, + { + "epoch": 7.410929737402413, + "grad_norm": 0.20161767303943634, + "learning_rate": 9.259233498935415e-05, + "loss": 0.031718161702156064, + "step": 52210 + }, + { + "epoch": 7.412349183818311, + "grad_norm": 0.14328189194202423, + "learning_rate": 9.259091554293826e-05, + "loss": 0.028231072425842284, + "step": 52220 + }, + { + "epoch": 7.413768630234209, + "grad_norm": 3.6482315063476562, + "learning_rate": 9.258949609652236e-05, + "loss": 0.01824479103088379, + "step": 52230 + }, + { + "epoch": 7.415188076650106, + "grad_norm": 3.887988328933716, + "learning_rate": 9.258807665010646e-05, + "loss": 0.030091965198516847, + "step": 52240 + }, + { + "epoch": 7.416607523066004, + "grad_norm": 0.033909570425748825, + "learning_rate": 9.258665720369057e-05, + "loss": 0.04004532396793366, + "step": 52250 + }, + { + "epoch": 7.418026969481902, + "grad_norm": 1.5418171882629395, + "learning_rate": 
9.258523775727467e-05, + "loss": 0.009149040281772613, + "step": 52260 + }, + { + "epoch": 7.4194464158977995, + "grad_norm": 5.650069713592529, + "learning_rate": 9.258381831085878e-05, + "loss": 0.019167867302894593, + "step": 52270 + }, + { + "epoch": 7.420865862313698, + "grad_norm": 1.2293617725372314, + "learning_rate": 9.258239886444287e-05, + "loss": 0.042755690217018125, + "step": 52280 + }, + { + "epoch": 7.422285308729595, + "grad_norm": 6.387624263763428, + "learning_rate": 9.258097941802697e-05, + "loss": 0.026754480600357056, + "step": 52290 + }, + { + "epoch": 7.4237047551454936, + "grad_norm": 10.783160209655762, + "learning_rate": 9.257955997161107e-05, + "loss": 0.04979313015937805, + "step": 52300 + }, + { + "epoch": 7.425124201561391, + "grad_norm": 11.399337768554688, + "learning_rate": 9.257814052519518e-05, + "loss": 0.050091004371643065, + "step": 52310 + }, + { + "epoch": 7.426543647977288, + "grad_norm": 5.577615737915039, + "learning_rate": 9.257672107877928e-05, + "loss": 0.03373092114925384, + "step": 52320 + }, + { + "epoch": 7.427963094393187, + "grad_norm": 4.476070880889893, + "learning_rate": 9.257530163236339e-05, + "loss": 0.05749149918556214, + "step": 52330 + }, + { + "epoch": 7.429382540809084, + "grad_norm": 0.015291991643607616, + "learning_rate": 9.257388218594748e-05, + "loss": 0.03193310499191284, + "step": 52340 + }, + { + "epoch": 7.430801987224982, + "grad_norm": 10.38379192352295, + "learning_rate": 9.257246273953158e-05, + "loss": 0.06969671249389649, + "step": 52350 + }, + { + "epoch": 7.43222143364088, + "grad_norm": 0.3225395381450653, + "learning_rate": 9.25710432931157e-05, + "loss": 0.025198325514793396, + "step": 52360 + }, + { + "epoch": 7.433640880056778, + "grad_norm": 3.8596174716949463, + "learning_rate": 9.256962384669979e-05, + "loss": 0.033972108364105226, + "step": 52370 + }, + { + "epoch": 7.435060326472676, + "grad_norm": 6.728971481323242, + "learning_rate": 9.25682044002839e-05, + "loss": 
0.14436639547348024, + "step": 52380 + }, + { + "epoch": 7.436479772888574, + "grad_norm": 0.24112731218338013, + "learning_rate": 9.256678495386799e-05, + "loss": 0.03348296284675598, + "step": 52390 + }, + { + "epoch": 7.437899219304471, + "grad_norm": 4.913303852081299, + "learning_rate": 9.25653655074521e-05, + "loss": 0.03284276723861694, + "step": 52400 + }, + { + "epoch": 7.439318665720369, + "grad_norm": 0.03473828732967377, + "learning_rate": 9.25639460610362e-05, + "loss": 0.020089390873908996, + "step": 52410 + }, + { + "epoch": 7.440738112136267, + "grad_norm": 11.010733604431152, + "learning_rate": 9.25625266146203e-05, + "loss": 0.03317167162895203, + "step": 52420 + }, + { + "epoch": 7.442157558552164, + "grad_norm": 0.09397601336240768, + "learning_rate": 9.25611071682044e-05, + "loss": 0.05286313891410828, + "step": 52430 + }, + { + "epoch": 7.443577004968063, + "grad_norm": 5.776946544647217, + "learning_rate": 9.25596877217885e-05, + "loss": 0.061042767763137815, + "step": 52440 + }, + { + "epoch": 7.44499645138396, + "grad_norm": 10.642087936401367, + "learning_rate": 9.255826827537261e-05, + "loss": 0.08132562637329102, + "step": 52450 + }, + { + "epoch": 7.4464158977998585, + "grad_norm": 0.35297006368637085, + "learning_rate": 9.255684882895671e-05, + "loss": 0.016444140672683717, + "step": 52460 + }, + { + "epoch": 7.447835344215756, + "grad_norm": 7.420588493347168, + "learning_rate": 9.255542938254082e-05, + "loss": 0.07923081517219543, + "step": 52470 + }, + { + "epoch": 7.449254790631653, + "grad_norm": 0.862054705619812, + "learning_rate": 9.255400993612492e-05, + "loss": 0.008070911467075347, + "step": 52480 + }, + { + "epoch": 7.450674237047552, + "grad_norm": 1.598931074142456, + "learning_rate": 9.255259048970903e-05, + "loss": 0.0262810617685318, + "step": 52490 + }, + { + "epoch": 7.452093683463449, + "grad_norm": 0.47471916675567627, + "learning_rate": 9.255117104329311e-05, + "loss": 0.05102187395095825, + "step": 52500 + }, + { 
+ "epoch": 7.452093683463449, + "eval_accuracy": 0.983976600750302, + "eval_loss": 0.048219986259937286, + "eval_runtime": 32.6578, + "eval_samples_per_second": 481.569, + "eval_steps_per_second": 15.065, + "step": 52500 + }, + { + "epoch": 7.453513129879347, + "grad_norm": 0.05551528558135033, + "learning_rate": 9.254975159687722e-05, + "loss": 0.0365988701581955, + "step": 52510 + }, + { + "epoch": 7.454932576295245, + "grad_norm": 0.480398952960968, + "learning_rate": 9.254833215046132e-05, + "loss": 0.04306910634040832, + "step": 52520 + }, + { + "epoch": 7.456352022711143, + "grad_norm": 11.236347198486328, + "learning_rate": 9.254691270404543e-05, + "loss": 0.05073235034942627, + "step": 52530 + }, + { + "epoch": 7.4577714691270405, + "grad_norm": 2.5973517894744873, + "learning_rate": 9.254549325762954e-05, + "loss": 0.02977212965488434, + "step": 52540 + }, + { + "epoch": 7.459190915542938, + "grad_norm": 0.21879629790782928, + "learning_rate": 9.254407381121362e-05, + "loss": 0.06971742510795594, + "step": 52550 + }, + { + "epoch": 7.460610361958836, + "grad_norm": 2.3356027603149414, + "learning_rate": 9.254265436479774e-05, + "loss": 0.05115787982940674, + "step": 52560 + }, + { + "epoch": 7.462029808374734, + "grad_norm": 0.18059638142585754, + "learning_rate": 9.254123491838183e-05, + "loss": 0.04363165497779846, + "step": 52570 + }, + { + "epoch": 7.463449254790632, + "grad_norm": 0.360032320022583, + "learning_rate": 9.253981547196594e-05, + "loss": 0.032967600226402285, + "step": 52580 + }, + { + "epoch": 7.464868701206529, + "grad_norm": 2.464231491088867, + "learning_rate": 9.253839602555004e-05, + "loss": 0.011615180224180222, + "step": 52590 + }, + { + "epoch": 7.466288147622428, + "grad_norm": 0.08998509496450424, + "learning_rate": 9.253697657913414e-05, + "loss": 0.06994263529777527, + "step": 52600 + }, + { + "epoch": 7.467707594038325, + "grad_norm": 0.38058972358703613, + "learning_rate": 9.253555713271824e-05, + "loss": 
0.03467016220092774, + "step": 52610 + }, + { + "epoch": 7.4691270404542225, + "grad_norm": 1.0466068983078003, + "learning_rate": 9.253413768630235e-05, + "loss": 0.07900729179382324, + "step": 52620 + }, + { + "epoch": 7.470546486870121, + "grad_norm": 8.88718318939209, + "learning_rate": 9.253271823988644e-05, + "loss": 0.043406492471694945, + "step": 52630 + }, + { + "epoch": 7.471965933286018, + "grad_norm": 9.799579620361328, + "learning_rate": 9.253129879347056e-05, + "loss": 0.07508601546287537, + "step": 52640 + }, + { + "epoch": 7.4733853797019165, + "grad_norm": 0.019487710669636726, + "learning_rate": 9.252987934705465e-05, + "loss": 0.03812042474746704, + "step": 52650 + }, + { + "epoch": 7.474804826117814, + "grad_norm": 5.969607830047607, + "learning_rate": 9.252845990063875e-05, + "loss": 0.07696850299835205, + "step": 52660 + }, + { + "epoch": 7.476224272533712, + "grad_norm": 0.35891488194465637, + "learning_rate": 9.252704045422286e-05, + "loss": 0.026730889081954957, + "step": 52670 + }, + { + "epoch": 7.47764371894961, + "grad_norm": 1.599028468132019, + "learning_rate": 9.252562100780696e-05, + "loss": 0.020498314499855043, + "step": 52680 + }, + { + "epoch": 7.479063165365507, + "grad_norm": 0.4268326759338379, + "learning_rate": 9.252420156139107e-05, + "loss": 0.06487103700637817, + "step": 52690 + }, + { + "epoch": 7.480482611781405, + "grad_norm": 8.808951377868652, + "learning_rate": 9.252278211497515e-05, + "loss": 0.06087350845336914, + "step": 52700 + }, + { + "epoch": 7.481902058197303, + "grad_norm": 1.0655536651611328, + "learning_rate": 9.252136266855926e-05, + "loss": 0.02627456784248352, + "step": 52710 + }, + { + "epoch": 7.483321504613201, + "grad_norm": 0.672973096370697, + "learning_rate": 9.251994322214336e-05, + "loss": 0.03866781890392303, + "step": 52720 + }, + { + "epoch": 7.4847409510290985, + "grad_norm": 7.424531936645508, + "learning_rate": 9.251852377572747e-05, + "loss": 0.02564384639263153, + "step": 52730 + }, + 
{ + "epoch": 7.486160397444997, + "grad_norm": 3.822476863861084, + "learning_rate": 9.251710432931158e-05, + "loss": 0.02148028612136841, + "step": 52740 + }, + { + "epoch": 7.487579843860894, + "grad_norm": 4.0289306640625, + "learning_rate": 9.251568488289567e-05, + "loss": 0.03844795525074005, + "step": 52750 + }, + { + "epoch": 7.488999290276792, + "grad_norm": 3.0319392681121826, + "learning_rate": 9.251426543647978e-05, + "loss": 0.05241814851760864, + "step": 52760 + }, + { + "epoch": 7.49041873669269, + "grad_norm": 0.08324091881513596, + "learning_rate": 9.251284599006388e-05, + "loss": 0.02561030685901642, + "step": 52770 + }, + { + "epoch": 7.491838183108587, + "grad_norm": 0.40191054344177246, + "learning_rate": 9.251142654364799e-05, + "loss": 0.03448401093482971, + "step": 52780 + }, + { + "epoch": 7.493257629524486, + "grad_norm": 8.233901977539062, + "learning_rate": 9.251000709723208e-05, + "loss": 0.06319097876548767, + "step": 52790 + }, + { + "epoch": 7.494677075940383, + "grad_norm": 0.34823575615882874, + "learning_rate": 9.250858765081618e-05, + "loss": 0.038350042700767514, + "step": 52800 + }, + { + "epoch": 7.496096522356281, + "grad_norm": 0.2975291907787323, + "learning_rate": 9.250716820440028e-05, + "loss": 0.057574158906936644, + "step": 52810 + }, + { + "epoch": 7.497515968772179, + "grad_norm": 4.405351638793945, + "learning_rate": 9.250574875798439e-05, + "loss": 0.032033723592758176, + "step": 52820 + }, + { + "epoch": 7.498935415188076, + "grad_norm": 10.231863021850586, + "learning_rate": 9.25043293115685e-05, + "loss": 0.05307228565216064, + "step": 52830 + }, + { + "epoch": 7.500354861603975, + "grad_norm": 4.901642799377441, + "learning_rate": 9.25029098651526e-05, + "loss": 0.09208908081054687, + "step": 52840 + }, + { + "epoch": 7.501774308019872, + "grad_norm": 5.971859931945801, + "learning_rate": 9.250149041873671e-05, + "loss": 0.05865171551704407, + "step": 52850 + }, + { + "epoch": 7.50319375443577, + "grad_norm": 
0.3640846312046051, + "learning_rate": 9.250007097232079e-05, + "loss": 0.040382787585258484, + "step": 52860 + }, + { + "epoch": 7.504613200851668, + "grad_norm": 0.23144324123859406, + "learning_rate": 9.24986515259049e-05, + "loss": 0.08787302970886231, + "step": 52870 + }, + { + "epoch": 7.506032647267566, + "grad_norm": 1.2069907188415527, + "learning_rate": 9.2497232079489e-05, + "loss": 0.057518255710601804, + "step": 52880 + }, + { + "epoch": 7.5074520936834634, + "grad_norm": 0.7209001183509827, + "learning_rate": 9.249581263307311e-05, + "loss": 0.05926448106765747, + "step": 52890 + }, + { + "epoch": 7.508871540099361, + "grad_norm": 3.756991386413574, + "learning_rate": 9.249439318665721e-05, + "loss": 0.055469298362731935, + "step": 52900 + }, + { + "epoch": 7.510290986515259, + "grad_norm": 13.026274681091309, + "learning_rate": 9.24929737402413e-05, + "loss": 0.07141894102096558, + "step": 52910 + }, + { + "epoch": 7.511710432931157, + "grad_norm": 1.6600936651229858, + "learning_rate": 9.249155429382542e-05, + "loss": 0.02206961214542389, + "step": 52920 + }, + { + "epoch": 7.513129879347055, + "grad_norm": 4.592076301574707, + "learning_rate": 9.249013484740951e-05, + "loss": 0.068220454454422, + "step": 52930 + }, + { + "epoch": 7.514549325762952, + "grad_norm": 5.431219577789307, + "learning_rate": 9.248871540099363e-05, + "loss": 0.03009980320930481, + "step": 52940 + }, + { + "epoch": 7.515968772178851, + "grad_norm": 0.23516087234020233, + "learning_rate": 9.248729595457772e-05, + "loss": 0.05364044308662415, + "step": 52950 + }, + { + "epoch": 7.517388218594748, + "grad_norm": 0.16321633756160736, + "learning_rate": 9.248587650816182e-05, + "loss": 0.01695691645145416, + "step": 52960 + }, + { + "epoch": 7.518807665010646, + "grad_norm": 7.968390941619873, + "learning_rate": 9.248445706174592e-05, + "loss": 0.046441465616226196, + "step": 52970 + }, + { + "epoch": 7.520227111426544, + "grad_norm": 0.1707213670015335, + "learning_rate": 
9.248303761533003e-05, + "loss": 0.022419868409633635, + "step": 52980 + }, + { + "epoch": 7.521646557842441, + "grad_norm": 0.9471738338470459, + "learning_rate": 9.248161816891413e-05, + "loss": 0.04635497331619263, + "step": 52990 + }, + { + "epoch": 7.5230660042583395, + "grad_norm": 5.733717918395996, + "learning_rate": 9.248019872249824e-05, + "loss": 0.05494365692138672, + "step": 53000 + }, + { + "epoch": 7.5230660042583395, + "eval_accuracy": 0.979779996184905, + "eval_loss": 0.0653390884399414, + "eval_runtime": 33.676, + "eval_samples_per_second": 467.009, + "eval_steps_per_second": 14.61, + "step": 53000 + }, + { + "epoch": 7.524485450674237, + "grad_norm": 1.4633334875106812, + "learning_rate": 9.247877927608233e-05, + "loss": 0.04977775514125824, + "step": 53010 + }, + { + "epoch": 7.525904897090135, + "grad_norm": 6.960347652435303, + "learning_rate": 9.247735982966643e-05, + "loss": 0.057191604375839235, + "step": 53020 + }, + { + "epoch": 7.527324343506033, + "grad_norm": 12.815581321716309, + "learning_rate": 9.247594038325054e-05, + "loss": 0.08237308859825135, + "step": 53030 + }, + { + "epoch": 7.528743789921931, + "grad_norm": 0.03340001776814461, + "learning_rate": 9.247452093683464e-05, + "loss": 0.024731306731700896, + "step": 53040 + }, + { + "epoch": 7.530163236337828, + "grad_norm": 8.023792266845703, + "learning_rate": 9.247310149041875e-05, + "loss": 0.024743181467056275, + "step": 53050 + }, + { + "epoch": 7.531582682753726, + "grad_norm": 0.13054397702217102, + "learning_rate": 9.247168204400283e-05, + "loss": 0.07355377674102784, + "step": 53060 + }, + { + "epoch": 7.533002129169624, + "grad_norm": 2.0724384784698486, + "learning_rate": 9.247026259758695e-05, + "loss": 0.036674332618713376, + "step": 53070 + }, + { + "epoch": 7.5344215755855215, + "grad_norm": 6.162533760070801, + "learning_rate": 9.246884315117104e-05, + "loss": 0.09419107437133789, + "step": 53080 + }, + { + "epoch": 7.53584102200142, + "grad_norm": 
6.801360607147217, + "learning_rate": 9.246742370475515e-05, + "loss": 0.046077826619148256, + "step": 53090 + }, + { + "epoch": 7.537260468417317, + "grad_norm": 2.5614705085754395, + "learning_rate": 9.246600425833925e-05, + "loss": 0.029644250869750977, + "step": 53100 + }, + { + "epoch": 7.5386799148332155, + "grad_norm": 5.066242694854736, + "learning_rate": 9.246458481192335e-05, + "loss": 0.0544149100780487, + "step": 53110 + }, + { + "epoch": 7.540099361249113, + "grad_norm": 0.11999693512916565, + "learning_rate": 9.246316536550746e-05, + "loss": 0.040882185101509094, + "step": 53120 + }, + { + "epoch": 7.54151880766501, + "grad_norm": 0.4337463080883026, + "learning_rate": 9.246174591909156e-05, + "loss": 0.04220533668994904, + "step": 53130 + }, + { + "epoch": 7.542938254080909, + "grad_norm": 6.067582130432129, + "learning_rate": 9.246032647267567e-05, + "loss": 0.05769921541213989, + "step": 53140 + }, + { + "epoch": 7.544357700496806, + "grad_norm": 4.153407573699951, + "learning_rate": 9.245890702625977e-05, + "loss": 0.033078896999359134, + "step": 53150 + }, + { + "epoch": 7.545777146912704, + "grad_norm": 3.1943609714508057, + "learning_rate": 9.245748757984386e-05, + "loss": 0.04769000113010406, + "step": 53160 + }, + { + "epoch": 7.547196593328602, + "grad_norm": 1.6068007946014404, + "learning_rate": 9.245606813342796e-05, + "loss": 0.024349580705165862, + "step": 53170 + }, + { + "epoch": 7.5486160397445, + "grad_norm": 0.45679542422294617, + "learning_rate": 9.245464868701207e-05, + "loss": 0.03165770173072815, + "step": 53180 + }, + { + "epoch": 7.5500354861603975, + "grad_norm": 4.295965671539307, + "learning_rate": 9.245322924059617e-05, + "loss": 0.05426824688911438, + "step": 53190 + }, + { + "epoch": 7.551454932576295, + "grad_norm": 0.14807292819023132, + "learning_rate": 9.245180979418028e-05, + "loss": 0.050548434257507324, + "step": 53200 + }, + { + "epoch": 7.552874378992193, + "grad_norm": 0.31772199273109436, + "learning_rate": 
9.245039034776438e-05, + "loss": 0.050780308246612546, + "step": 53210 + }, + { + "epoch": 7.554293825408091, + "grad_norm": 4.849133491516113, + "learning_rate": 9.244897090134847e-05, + "loss": 0.02840524911880493, + "step": 53220 + }, + { + "epoch": 7.555713271823989, + "grad_norm": 0.765864908695221, + "learning_rate": 9.244755145493259e-05, + "loss": 0.05476508140563965, + "step": 53230 + }, + { + "epoch": 7.557132718239886, + "grad_norm": 0.023372527211904526, + "learning_rate": 9.244613200851668e-05, + "loss": 0.053467082977294925, + "step": 53240 + }, + { + "epoch": 7.558552164655785, + "grad_norm": 0.3663049638271332, + "learning_rate": 9.24447125621008e-05, + "loss": 0.018398307263851166, + "step": 53250 + }, + { + "epoch": 7.559971611071682, + "grad_norm": 0.11951316893100739, + "learning_rate": 9.244329311568489e-05, + "loss": 0.07248743176460266, + "step": 53260 + }, + { + "epoch": 7.56139105748758, + "grad_norm": 0.1535031497478485, + "learning_rate": 9.244187366926899e-05, + "loss": 0.01143306791782379, + "step": 53270 + }, + { + "epoch": 7.562810503903478, + "grad_norm": 0.5286732316017151, + "learning_rate": 9.244045422285309e-05, + "loss": 0.00730847492814064, + "step": 53280 + }, + { + "epoch": 7.564229950319375, + "grad_norm": 1.2395968437194824, + "learning_rate": 9.24390347764372e-05, + "loss": 0.02569035589694977, + "step": 53290 + }, + { + "epoch": 7.565649396735274, + "grad_norm": 4.477364540100098, + "learning_rate": 9.24376153300213e-05, + "loss": 0.06130185127258301, + "step": 53300 + }, + { + "epoch": 7.567068843151171, + "grad_norm": 0.203590527176857, + "learning_rate": 9.24361958836054e-05, + "loss": 0.05250626802444458, + "step": 53310 + }, + { + "epoch": 7.568488289567069, + "grad_norm": 5.73386812210083, + "learning_rate": 9.24347764371895e-05, + "loss": 0.019776782393455504, + "step": 53320 + }, + { + "epoch": 7.569907735982967, + "grad_norm": 0.7581711411476135, + "learning_rate": 9.24333569907736e-05, + "loss": 
0.006641269475221634, + "step": 53330 + }, + { + "epoch": 7.571327182398864, + "grad_norm": 3.9233145713806152, + "learning_rate": 9.243193754435771e-05, + "loss": 0.05464006662368774, + "step": 53340 + }, + { + "epoch": 7.5727466288147625, + "grad_norm": 0.6209933757781982, + "learning_rate": 9.243051809794181e-05, + "loss": 0.04515405893325806, + "step": 53350 + }, + { + "epoch": 7.57416607523066, + "grad_norm": 0.1766492873430252, + "learning_rate": 9.242909865152592e-05, + "loss": 0.022017842531204222, + "step": 53360 + }, + { + "epoch": 7.575585521646558, + "grad_norm": 1.5515861511230469, + "learning_rate": 9.242767920511e-05, + "loss": 0.03363422155380249, + "step": 53370 + }, + { + "epoch": 7.577004968062456, + "grad_norm": 0.28514936566352844, + "learning_rate": 9.242625975869411e-05, + "loss": 0.027630746364593506, + "step": 53380 + }, + { + "epoch": 7.578424414478354, + "grad_norm": 0.16114062070846558, + "learning_rate": 9.242484031227821e-05, + "loss": 0.0107742041349411, + "step": 53390 + }, + { + "epoch": 7.579843860894251, + "grad_norm": 5.650407791137695, + "learning_rate": 9.242342086586232e-05, + "loss": 0.05854092836380005, + "step": 53400 + }, + { + "epoch": 7.581263307310149, + "grad_norm": 1.162428855895996, + "learning_rate": 9.242200141944642e-05, + "loss": 0.07622578144073486, + "step": 53410 + }, + { + "epoch": 7.582682753726047, + "grad_norm": 0.809592604637146, + "learning_rate": 9.242058197303052e-05, + "loss": 0.03163085877895355, + "step": 53420 + }, + { + "epoch": 7.5841022001419445, + "grad_norm": 9.978492736816406, + "learning_rate": 9.241916252661463e-05, + "loss": 0.026983675360679627, + "step": 53430 + }, + { + "epoch": 7.585521646557843, + "grad_norm": 2.833834171295166, + "learning_rate": 9.241774308019872e-05, + "loss": 0.04985363781452179, + "step": 53440 + }, + { + "epoch": 7.58694109297374, + "grad_norm": 4.750146389007568, + "learning_rate": 9.241632363378284e-05, + "loss": 0.07900604009628295, + "step": 53450 + }, + { + 
"epoch": 7.5883605393896385, + "grad_norm": 5.776832103729248, + "learning_rate": 9.241490418736693e-05, + "loss": 0.08184942603111267, + "step": 53460 + }, + { + "epoch": 7.589779985805536, + "grad_norm": 0.146357461810112, + "learning_rate": 9.241348474095103e-05, + "loss": 0.04078640043735504, + "step": 53470 + }, + { + "epoch": 7.591199432221433, + "grad_norm": 2.9139442443847656, + "learning_rate": 9.241206529453513e-05, + "loss": 0.035071760416030884, + "step": 53480 + }, + { + "epoch": 7.592618878637332, + "grad_norm": 9.05538272857666, + "learning_rate": 9.241064584811924e-05, + "loss": 0.059223884344100954, + "step": 53490 + }, + { + "epoch": 7.594038325053229, + "grad_norm": 4.537329196929932, + "learning_rate": 9.240922640170334e-05, + "loss": 0.09483524560928344, + "step": 53500 + }, + { + "epoch": 7.594038325053229, + "eval_accuracy": 0.9792077319259872, + "eval_loss": 0.07600707560777664, + "eval_runtime": 33.1148, + "eval_samples_per_second": 474.923, + "eval_steps_per_second": 14.857, + "step": 53500 + }, + { + "epoch": 7.595457771469127, + "grad_norm": 16.435022354125977, + "learning_rate": 9.240780695528745e-05, + "loss": 0.059524184465408324, + "step": 53510 + }, + { + "epoch": 7.596877217885025, + "grad_norm": 6.466281414031982, + "learning_rate": 9.240638750887154e-05, + "loss": 0.06247789859771728, + "step": 53520 + }, + { + "epoch": 7.598296664300923, + "grad_norm": 2.0099196434020996, + "learning_rate": 9.240496806245564e-05, + "loss": 0.03750507235527038, + "step": 53530 + }, + { + "epoch": 7.5997161107168205, + "grad_norm": 0.4146358072757721, + "learning_rate": 9.240354861603975e-05, + "loss": 0.03174733221530914, + "step": 53540 + }, + { + "epoch": 7.601135557132718, + "grad_norm": 0.6508600115776062, + "learning_rate": 9.240212916962385e-05, + "loss": 0.015483300387859344, + "step": 53550 + }, + { + "epoch": 7.602555003548616, + "grad_norm": 0.19503478705883026, + "learning_rate": 9.240070972320796e-05, + "loss": 0.008923622220754624, + 
"step": 53560 + }, + { + "epoch": 7.603974449964514, + "grad_norm": 0.29882973432540894, + "learning_rate": 9.239929027679206e-05, + "loss": 0.02937857210636139, + "step": 53570 + }, + { + "epoch": 7.605393896380412, + "grad_norm": 5.896130084991455, + "learning_rate": 9.239787083037616e-05, + "loss": 0.05464982390403748, + "step": 53580 + }, + { + "epoch": 7.606813342796309, + "grad_norm": 0.8281605839729309, + "learning_rate": 9.239645138396025e-05, + "loss": 0.02264205664396286, + "step": 53590 + }, + { + "epoch": 7.608232789212208, + "grad_norm": 5.777096271514893, + "learning_rate": 9.239503193754436e-05, + "loss": 0.03788665533065796, + "step": 53600 + }, + { + "epoch": 7.609652235628105, + "grad_norm": 1.0653955936431885, + "learning_rate": 9.239361249112846e-05, + "loss": 0.03864677846431732, + "step": 53610 + }, + { + "epoch": 7.6110716820440025, + "grad_norm": 2.2212467193603516, + "learning_rate": 9.239219304471257e-05, + "loss": 0.06111682653427124, + "step": 53620 + }, + { + "epoch": 7.612491128459901, + "grad_norm": 8.857563018798828, + "learning_rate": 9.239077359829667e-05, + "loss": 0.0751349151134491, + "step": 53630 + }, + { + "epoch": 7.613910574875798, + "grad_norm": 0.5623897314071655, + "learning_rate": 9.238935415188077e-05, + "loss": 0.04178241789340973, + "step": 53640 + }, + { + "epoch": 7.615330021291697, + "grad_norm": 13.483894348144531, + "learning_rate": 9.238793470546488e-05, + "loss": 0.10559332370758057, + "step": 53650 + }, + { + "epoch": 7.616749467707594, + "grad_norm": 1.6335182189941406, + "learning_rate": 9.238651525904898e-05, + "loss": 0.05756605863571167, + "step": 53660 + }, + { + "epoch": 7.618168914123492, + "grad_norm": 0.2131602019071579, + "learning_rate": 9.238509581263309e-05, + "loss": 0.019205693900585175, + "step": 53670 + }, + { + "epoch": 7.61958836053939, + "grad_norm": 8.674543380737305, + "learning_rate": 9.238381831085877e-05, + "loss": 0.08388531804084778, + "step": 53680 + }, + { + "epoch": 
7.621007806955287, + "grad_norm": 0.8919023275375366, + "learning_rate": 9.238239886444288e-05, + "loss": 0.04539136588573456, + "step": 53690 + }, + { + "epoch": 7.622427253371185, + "grad_norm": 0.10621856898069382, + "learning_rate": 9.238097941802697e-05, + "loss": 0.034367746114730834, + "step": 53700 + }, + { + "epoch": 7.623846699787083, + "grad_norm": 1.795030117034912, + "learning_rate": 9.237955997161108e-05, + "loss": 0.02624286413192749, + "step": 53710 + }, + { + "epoch": 7.625266146202981, + "grad_norm": 6.546425819396973, + "learning_rate": 9.237814052519517e-05, + "loss": 0.06192071437835693, + "step": 53720 + }, + { + "epoch": 7.626685592618879, + "grad_norm": 1.9943439960479736, + "learning_rate": 9.237672107877929e-05, + "loss": 0.03261047303676605, + "step": 53730 + }, + { + "epoch": 7.628105039034777, + "grad_norm": 0.13797317445278168, + "learning_rate": 9.237530163236338e-05, + "loss": 0.09458110928535461, + "step": 53740 + }, + { + "epoch": 7.629524485450674, + "grad_norm": 0.1786557137966156, + "learning_rate": 9.237388218594748e-05, + "loss": 0.04373227059841156, + "step": 53750 + }, + { + "epoch": 7.630943931866572, + "grad_norm": 0.11414031684398651, + "learning_rate": 9.237246273953159e-05, + "loss": 0.04276902675628662, + "step": 53760 + }, + { + "epoch": 7.63236337828247, + "grad_norm": 0.09007790684700012, + "learning_rate": 9.237104329311569e-05, + "loss": 0.026761719584465028, + "step": 53770 + }, + { + "epoch": 7.633782824698367, + "grad_norm": 5.950411319732666, + "learning_rate": 9.23696238466998e-05, + "loss": 0.022057650983333586, + "step": 53780 + }, + { + "epoch": 7.635202271114266, + "grad_norm": 0.6405834555625916, + "learning_rate": 9.23682044002839e-05, + "loss": 0.03160939812660217, + "step": 53790 + }, + { + "epoch": 7.636621717530163, + "grad_norm": 8.794205665588379, + "learning_rate": 9.2366784953868e-05, + "loss": 0.04112916588783264, + "step": 53800 + }, + { + "epoch": 7.6380411639460615, + "grad_norm": 
0.031496480107307434, + "learning_rate": 9.236536550745209e-05, + "loss": 0.017792116105556487, + "step": 53810 + }, + { + "epoch": 7.639460610361959, + "grad_norm": 3.8262858390808105, + "learning_rate": 9.23639460610362e-05, + "loss": 0.01980331391096115, + "step": 53820 + }, + { + "epoch": 7.640880056777856, + "grad_norm": 0.5895381569862366, + "learning_rate": 9.23625266146203e-05, + "loss": 0.02830488085746765, + "step": 53830 + }, + { + "epoch": 7.642299503193755, + "grad_norm": 2.3932108879089355, + "learning_rate": 9.236110716820441e-05, + "loss": 0.01563961207866669, + "step": 53840 + }, + { + "epoch": 7.643718949609652, + "grad_norm": 0.7757654190063477, + "learning_rate": 9.235968772178851e-05, + "loss": 0.01142323911190033, + "step": 53850 + }, + { + "epoch": 7.64513839602555, + "grad_norm": 1.4721145629882812, + "learning_rate": 9.23582682753726e-05, + "loss": 0.017298223078250886, + "step": 53860 + }, + { + "epoch": 7.646557842441448, + "grad_norm": 14.840720176696777, + "learning_rate": 9.235684882895672e-05, + "loss": 0.09527291059494018, + "step": 53870 + }, + { + "epoch": 7.647977288857346, + "grad_norm": 0.0690033808350563, + "learning_rate": 9.235542938254081e-05, + "loss": 0.05147637128829956, + "step": 53880 + }, + { + "epoch": 7.6493967352732435, + "grad_norm": 17.268529891967773, + "learning_rate": 9.235400993612492e-05, + "loss": 0.08880094289779664, + "step": 53890 + }, + { + "epoch": 7.650816181689141, + "grad_norm": 0.3241688013076782, + "learning_rate": 9.235259048970902e-05, + "loss": 0.024964214861392976, + "step": 53900 + }, + { + "epoch": 7.652235628105039, + "grad_norm": 0.484392911195755, + "learning_rate": 9.235117104329312e-05, + "loss": 0.009301058948040009, + "step": 53910 + }, + { + "epoch": 7.653655074520937, + "grad_norm": 1.3884038925170898, + "learning_rate": 9.234975159687722e-05, + "loss": 0.05612800121307373, + "step": 53920 + }, + { + "epoch": 7.655074520936835, + "grad_norm": 6.86073637008667, + "learning_rate": 
9.234833215046133e-05, + "loss": 0.02074751555919647, + "step": 53930 + }, + { + "epoch": 7.656493967352732, + "grad_norm": 3.2943639755249023, + "learning_rate": 9.234691270404543e-05, + "loss": 0.07102017402648926, + "step": 53940 + }, + { + "epoch": 7.657913413768631, + "grad_norm": 4.378195285797119, + "learning_rate": 9.234549325762954e-05, + "loss": 0.03647010326385498, + "step": 53950 + }, + { + "epoch": 7.659332860184528, + "grad_norm": 1.0259917974472046, + "learning_rate": 9.234407381121363e-05, + "loss": 0.037917932868003844, + "step": 53960 + }, + { + "epoch": 7.6607523066004255, + "grad_norm": 0.6418030261993408, + "learning_rate": 9.234265436479773e-05, + "loss": 0.047413745522499086, + "step": 53970 + }, + { + "epoch": 7.662171753016324, + "grad_norm": 6.070641040802002, + "learning_rate": 9.234123491838184e-05, + "loss": 0.06167091727256775, + "step": 53980 + }, + { + "epoch": 7.663591199432221, + "grad_norm": 6.273818016052246, + "learning_rate": 9.233981547196594e-05, + "loss": 0.07756027579307556, + "step": 53990 + }, + { + "epoch": 7.6650106458481195, + "grad_norm": 6.54415225982666, + "learning_rate": 9.233839602555005e-05, + "loss": 0.03750898838043213, + "step": 54000 + }, + { + "epoch": 7.6650106458481195, + "eval_accuracy": 0.972849240160234, + "eval_loss": 0.09029248356819153, + "eval_runtime": 32.2102, + "eval_samples_per_second": 488.262, + "eval_steps_per_second": 15.275, + "step": 54000 + }, + { + "epoch": 7.666430092264017, + "grad_norm": 8.113903045654297, + "learning_rate": 9.233697657913413e-05, + "loss": 0.06530644297599793, + "step": 54010 + }, + { + "epoch": 7.667849538679915, + "grad_norm": 4.259474754333496, + "learning_rate": 9.233555713271824e-05, + "loss": 0.023596912622451782, + "step": 54020 + }, + { + "epoch": 7.669268985095813, + "grad_norm": 0.5020537972450256, + "learning_rate": 9.233413768630234e-05, + "loss": 0.01666277050971985, + "step": 54030 + }, + { + "epoch": 7.67068843151171, + "grad_norm": 
0.9523594975471497, + "learning_rate": 9.233271823988645e-05, + "loss": 0.016902516782283782, + "step": 54040 + }, + { + "epoch": 7.672107877927608, + "grad_norm": 4.621238708496094, + "learning_rate": 9.233129879347055e-05, + "loss": 0.05011897087097168, + "step": 54050 + }, + { + "epoch": 7.673527324343506, + "grad_norm": 0.058928538113832474, + "learning_rate": 9.232987934705465e-05, + "loss": 0.05659050941467285, + "step": 54060 + }, + { + "epoch": 7.674946770759404, + "grad_norm": 4.141076564788818, + "learning_rate": 9.232845990063876e-05, + "loss": 0.05621238946914673, + "step": 54070 + }, + { + "epoch": 7.6763662171753015, + "grad_norm": 0.6577054858207703, + "learning_rate": 9.232704045422286e-05, + "loss": 0.030709424614906312, + "step": 54080 + }, + { + "epoch": 7.6777856635912, + "grad_norm": 7.629159927368164, + "learning_rate": 9.232562100780697e-05, + "loss": 0.05306870341300964, + "step": 54090 + }, + { + "epoch": 7.679205110007097, + "grad_norm": 2.1149821281433105, + "learning_rate": 9.232420156139106e-05, + "loss": 0.05833685994148254, + "step": 54100 + }, + { + "epoch": 7.680624556422995, + "grad_norm": 0.2776797115802765, + "learning_rate": 9.232278211497516e-05, + "loss": 0.04917903542518616, + "step": 54110 + }, + { + "epoch": 7.682044002838893, + "grad_norm": 2.9681408405303955, + "learning_rate": 9.232136266855926e-05, + "loss": 0.014532405138015746, + "step": 54120 + }, + { + "epoch": 7.68346344925479, + "grad_norm": 8.385042190551758, + "learning_rate": 9.231994322214337e-05, + "loss": 0.10322569608688355, + "step": 54130 + }, + { + "epoch": 7.684882895670689, + "grad_norm": 0.19604772329330444, + "learning_rate": 9.231852377572747e-05, + "loss": 0.026059174537658693, + "step": 54140 + }, + { + "epoch": 7.686302342086586, + "grad_norm": 1.7579096555709839, + "learning_rate": 9.231710432931158e-05, + "loss": 0.05938120484352112, + "step": 54150 + }, + { + "epoch": 7.687721788502484, + "grad_norm": 0.03762279078364372, + "learning_rate": 
9.231568488289568e-05, + "loss": 0.10924702882766724, + "step": 54160 + }, + { + "epoch": 7.689141234918382, + "grad_norm": 0.2558661103248596, + "learning_rate": 9.231426543647977e-05, + "loss": 0.09950585961341858, + "step": 54170 + }, + { + "epoch": 7.690560681334279, + "grad_norm": 1.1616108417510986, + "learning_rate": 9.231284599006388e-05, + "loss": 0.0400057464838028, + "step": 54180 + }, + { + "epoch": 7.691980127750178, + "grad_norm": 4.835945129394531, + "learning_rate": 9.231142654364798e-05, + "loss": 0.07117159366607666, + "step": 54190 + }, + { + "epoch": 7.693399574166075, + "grad_norm": 5.930656909942627, + "learning_rate": 9.231000709723209e-05, + "loss": 0.09944562911987305, + "step": 54200 + }, + { + "epoch": 7.694819020581973, + "grad_norm": 8.935443878173828, + "learning_rate": 9.230858765081618e-05, + "loss": 0.06826504468917846, + "step": 54210 + }, + { + "epoch": 7.696238466997871, + "grad_norm": 9.199212074279785, + "learning_rate": 9.230716820440029e-05, + "loss": 0.04564814865589142, + "step": 54220 + }, + { + "epoch": 7.697657913413769, + "grad_norm": 0.19474904239177704, + "learning_rate": 9.230574875798438e-05, + "loss": 0.01926818788051605, + "step": 54230 + }, + { + "epoch": 7.6990773598296665, + "grad_norm": 5.388856410980225, + "learning_rate": 9.23043293115685e-05, + "loss": 0.020480193197727203, + "step": 54240 + }, + { + "epoch": 7.700496806245564, + "grad_norm": 0.6576113104820251, + "learning_rate": 9.230290986515259e-05, + "loss": 0.022709192335605623, + "step": 54250 + }, + { + "epoch": 7.701916252661462, + "grad_norm": 6.927825927734375, + "learning_rate": 9.23014904187367e-05, + "loss": 0.11104840040206909, + "step": 54260 + }, + { + "epoch": 7.70333569907736, + "grad_norm": 2.824536085128784, + "learning_rate": 9.23000709723208e-05, + "loss": 0.06214058995246887, + "step": 54270 + }, + { + "epoch": 7.704755145493258, + "grad_norm": 1.143356204032898, + "learning_rate": 9.22986515259049e-05, + "loss": 
0.030827879905700684, + "step": 54280 + }, + { + "epoch": 7.706174591909155, + "grad_norm": 0.2769818902015686, + "learning_rate": 9.229723207948901e-05, + "loss": 0.03429543673992157, + "step": 54290 + }, + { + "epoch": 7.707594038325054, + "grad_norm": 1.1894688606262207, + "learning_rate": 9.229581263307311e-05, + "loss": 0.04327844679355621, + "step": 54300 + }, + { + "epoch": 7.709013484740951, + "grad_norm": 1.5082286596298218, + "learning_rate": 9.229439318665722e-05, + "loss": 0.06623492836952209, + "step": 54310 + }, + { + "epoch": 7.7104329311568485, + "grad_norm": 2.6335270404815674, + "learning_rate": 9.22929737402413e-05, + "loss": 0.02056298851966858, + "step": 54320 + }, + { + "epoch": 7.711852377572747, + "grad_norm": 0.04475034773349762, + "learning_rate": 9.229155429382541e-05, + "loss": 0.0460908055305481, + "step": 54330 + }, + { + "epoch": 7.713271823988644, + "grad_norm": 0.06595787405967712, + "learning_rate": 9.229013484740951e-05, + "loss": 0.04475194215774536, + "step": 54340 + }, + { + "epoch": 7.7146912704045425, + "grad_norm": 10.94398307800293, + "learning_rate": 9.228871540099362e-05, + "loss": 0.07244617938995361, + "step": 54350 + }, + { + "epoch": 7.71611071682044, + "grad_norm": 3.780848503112793, + "learning_rate": 9.228729595457772e-05, + "loss": 0.0663948893547058, + "step": 54360 + }, + { + "epoch": 7.717530163236338, + "grad_norm": 4.40339994430542, + "learning_rate": 9.228587650816182e-05, + "loss": 0.05741128921508789, + "step": 54370 + }, + { + "epoch": 7.718949609652236, + "grad_norm": 0.6055914759635925, + "learning_rate": 9.228445706174593e-05, + "loss": 0.06476907134056091, + "step": 54380 + }, + { + "epoch": 7.720369056068133, + "grad_norm": 0.7589472532272339, + "learning_rate": 9.228303761533002e-05, + "loss": 0.04766846895217895, + "step": 54390 + }, + { + "epoch": 7.721788502484031, + "grad_norm": 3.3787660598754883, + "learning_rate": 9.228161816891413e-05, + "loss": 0.034449401497840884, + "step": 54400 + }, + { 
+ "epoch": 7.723207948899929, + "grad_norm": 1.4176545143127441, + "learning_rate": 9.228019872249823e-05, + "loss": 0.01606842428445816, + "step": 54410 + }, + { + "epoch": 7.724627395315827, + "grad_norm": 1.9891983270645142, + "learning_rate": 9.227877927608233e-05, + "loss": 0.04497620463371277, + "step": 54420 + }, + { + "epoch": 7.7260468417317245, + "grad_norm": 7.962987899780273, + "learning_rate": 9.227735982966643e-05, + "loss": 0.11056967973709106, + "step": 54430 + }, + { + "epoch": 7.727466288147623, + "grad_norm": 0.19446644186973572, + "learning_rate": 9.227594038325054e-05, + "loss": 0.05493360161781311, + "step": 54440 + }, + { + "epoch": 7.72888573456352, + "grad_norm": 0.07538071274757385, + "learning_rate": 9.227452093683464e-05, + "loss": 0.021178624033927916, + "step": 54450 + }, + { + "epoch": 7.730305180979418, + "grad_norm": 2.5883734226226807, + "learning_rate": 9.227310149041875e-05, + "loss": 0.07062762975692749, + "step": 54460 + }, + { + "epoch": 7.731724627395316, + "grad_norm": 9.73288631439209, + "learning_rate": 9.227168204400284e-05, + "loss": 0.03467971682548523, + "step": 54470 + }, + { + "epoch": 7.733144073811213, + "grad_norm": 4.348404407501221, + "learning_rate": 9.227026259758694e-05, + "loss": 0.07101811170578003, + "step": 54480 + }, + { + "epoch": 7.734563520227112, + "grad_norm": 7.970381259918213, + "learning_rate": 9.226884315117105e-05, + "loss": 0.09218829870223999, + "step": 54490 + }, + { + "epoch": 7.735982966643009, + "grad_norm": 0.49827030301094055, + "learning_rate": 9.226742370475515e-05, + "loss": 0.02308831512928009, + "step": 54500 + }, + { + "epoch": 7.735982966643009, + "eval_accuracy": 0.9711960323011382, + "eval_loss": 0.09986808896064758, + "eval_runtime": 32.8864, + "eval_samples_per_second": 478.222, + "eval_steps_per_second": 14.961, + "step": 54500 + }, + { + "epoch": 7.737402413058907, + "grad_norm": 4.274623870849609, + "learning_rate": 9.226600425833926e-05, + "loss": 0.04567549228668213, + 
"step": 54510 + }, + { + "epoch": 7.738821859474805, + "grad_norm": 0.24907518923282623, + "learning_rate": 9.226458481192334e-05, + "loss": 0.08113893866539001, + "step": 54520 + }, + { + "epoch": 7.740241305890702, + "grad_norm": 0.5531359314918518, + "learning_rate": 9.226316536550745e-05, + "loss": 0.048743787407875064, + "step": 54530 + }, + { + "epoch": 7.741660752306601, + "grad_norm": 0.20647698640823364, + "learning_rate": 9.226174591909155e-05, + "loss": 0.05019644498825073, + "step": 54540 + }, + { + "epoch": 7.743080198722498, + "grad_norm": 0.23882421851158142, + "learning_rate": 9.226032647267566e-05, + "loss": 0.04471515417098999, + "step": 54550 + }, + { + "epoch": 7.744499645138396, + "grad_norm": 1.7147654294967651, + "learning_rate": 9.225890702625976e-05, + "loss": 0.04694445133209228, + "step": 54560 + }, + { + "epoch": 7.745919091554294, + "grad_norm": 3.365558624267578, + "learning_rate": 9.225748757984386e-05, + "loss": 0.02688348889350891, + "step": 54570 + }, + { + "epoch": 7.747338537970192, + "grad_norm": 0.3501296937465668, + "learning_rate": 9.225606813342797e-05, + "loss": 0.03100045919418335, + "step": 54580 + }, + { + "epoch": 7.748757984386089, + "grad_norm": 8.124733924865723, + "learning_rate": 9.225464868701207e-05, + "loss": 0.04513312876224518, + "step": 54590 + }, + { + "epoch": 7.750177430801987, + "grad_norm": 0.1646394431591034, + "learning_rate": 9.225322924059618e-05, + "loss": 0.01222996562719345, + "step": 54600 + }, + { + "epoch": 7.751596877217885, + "grad_norm": 7.725193500518799, + "learning_rate": 9.225180979418027e-05, + "loss": 0.023356731235980987, + "step": 54610 + }, + { + "epoch": 7.753016323633783, + "grad_norm": 0.9481512904167175, + "learning_rate": 9.225039034776439e-05, + "loss": 0.013700984418392181, + "step": 54620 + }, + { + "epoch": 7.754435770049681, + "grad_norm": 0.10453180968761444, + "learning_rate": 9.224897090134847e-05, + "loss": 0.058964455127716066, + "step": 54630 + }, + { + "epoch": 
7.755855216465578, + "grad_norm": 0.024097450077533722, + "learning_rate": 9.224755145493258e-05, + "loss": 0.014117154479026794, + "step": 54640 + }, + { + "epoch": 7.757274662881477, + "grad_norm": 5.321264266967773, + "learning_rate": 9.224613200851668e-05, + "loss": 0.03250417113304138, + "step": 54650 + }, + { + "epoch": 7.758694109297374, + "grad_norm": 0.7793516516685486, + "learning_rate": 9.224471256210079e-05, + "loss": 0.007282558083534241, + "step": 54660 + }, + { + "epoch": 7.760113555713271, + "grad_norm": 5.44902229309082, + "learning_rate": 9.224329311568489e-05, + "loss": 0.019059914350509643, + "step": 54670 + }, + { + "epoch": 7.76153300212917, + "grad_norm": 0.17302462458610535, + "learning_rate": 9.224187366926898e-05, + "loss": 0.013754206895828246, + "step": 54680 + }, + { + "epoch": 7.762952448545067, + "grad_norm": 3.7517333030700684, + "learning_rate": 9.22404542228531e-05, + "loss": 0.07426886558532715, + "step": 54690 + }, + { + "epoch": 7.7643718949609655, + "grad_norm": 1.1429849863052368, + "learning_rate": 9.223903477643719e-05, + "loss": 0.04882776141166687, + "step": 54700 + }, + { + "epoch": 7.765791341376863, + "grad_norm": 0.07773357629776001, + "learning_rate": 9.22376153300213e-05, + "loss": 0.08146860003471375, + "step": 54710 + }, + { + "epoch": 7.767210787792761, + "grad_norm": 0.2552967667579651, + "learning_rate": 9.22361958836054e-05, + "loss": 0.034405875205993655, + "step": 54720 + }, + { + "epoch": 7.768630234208659, + "grad_norm": 1.1035807132720947, + "learning_rate": 9.22347764371895e-05, + "loss": 0.035977023839950564, + "step": 54730 + }, + { + "epoch": 7.770049680624556, + "grad_norm": 8.450946807861328, + "learning_rate": 9.22333569907736e-05, + "loss": 0.034113773703575136, + "step": 54740 + }, + { + "epoch": 7.771469127040454, + "grad_norm": 4.452673435211182, + "learning_rate": 9.22319375443577e-05, + "loss": 0.06801862716674804, + "step": 54750 + }, + { + "epoch": 7.772888573456352, + "grad_norm": 
4.749868869781494, + "learning_rate": 9.22305180979418e-05, + "loss": 0.033065930008888245, + "step": 54760 + }, + { + "epoch": 7.77430801987225, + "grad_norm": 0.5759822726249695, + "learning_rate": 9.222909865152591e-05, + "loss": 0.01797463297843933, + "step": 54770 + }, + { + "epoch": 7.7757274662881475, + "grad_norm": 2.1026320457458496, + "learning_rate": 9.222767920511001e-05, + "loss": 0.061784428358078, + "step": 54780 + }, + { + "epoch": 7.777146912704046, + "grad_norm": 0.6788957118988037, + "learning_rate": 9.222625975869411e-05, + "loss": 0.008597303926944733, + "step": 54790 + }, + { + "epoch": 7.778566359119943, + "grad_norm": 4.156731128692627, + "learning_rate": 9.222484031227822e-05, + "loss": 0.03035602569580078, + "step": 54800 + }, + { + "epoch": 7.779985805535841, + "grad_norm": 0.14249205589294434, + "learning_rate": 9.222342086586232e-05, + "loss": 0.012423336505889893, + "step": 54810 + }, + { + "epoch": 7.781405251951739, + "grad_norm": 0.955420970916748, + "learning_rate": 9.222200141944643e-05, + "loss": 0.06543527245521545, + "step": 54820 + }, + { + "epoch": 7.782824698367636, + "grad_norm": 0.056475285440683365, + "learning_rate": 9.222058197303051e-05, + "loss": 0.02058243304491043, + "step": 54830 + }, + { + "epoch": 7.784244144783535, + "grad_norm": 0.41992461681365967, + "learning_rate": 9.221916252661462e-05, + "loss": 0.0476148247718811, + "step": 54840 + }, + { + "epoch": 7.785663591199432, + "grad_norm": 8.196581840515137, + "learning_rate": 9.221774308019872e-05, + "loss": 0.036193230748176576, + "step": 54850 + }, + { + "epoch": 7.78708303761533, + "grad_norm": 0.038563072681427, + "learning_rate": 9.221632363378283e-05, + "loss": 0.02522934377193451, + "step": 54860 + }, + { + "epoch": 7.788502484031228, + "grad_norm": 6.226294040679932, + "learning_rate": 9.221490418736693e-05, + "loss": 0.05935906767845154, + "step": 54870 + }, + { + "epoch": 7.789921930447125, + "grad_norm": 4.046054363250732, + "learning_rate": 
9.221348474095103e-05, + "loss": 0.06731109023094177, + "step": 54880 + }, + { + "epoch": 7.7913413768630235, + "grad_norm": 8.530564308166504, + "learning_rate": 9.221206529453514e-05, + "loss": 0.07162481546401978, + "step": 54890 + }, + { + "epoch": 7.792760823278921, + "grad_norm": 0.30639684200286865, + "learning_rate": 9.221064584811923e-05, + "loss": 0.04374118745326996, + "step": 54900 + }, + { + "epoch": 7.794180269694819, + "grad_norm": 3.7728283405303955, + "learning_rate": 9.220922640170334e-05, + "loss": 0.04018616378307342, + "step": 54910 + }, + { + "epoch": 7.795599716110717, + "grad_norm": 5.375783443450928, + "learning_rate": 9.220780695528744e-05, + "loss": 0.07194701433181763, + "step": 54920 + }, + { + "epoch": 7.797019162526615, + "grad_norm": 0.9897046089172363, + "learning_rate": 9.220638750887154e-05, + "loss": 0.012161526829004288, + "step": 54930 + }, + { + "epoch": 7.798438608942512, + "grad_norm": 8.78571891784668, + "learning_rate": 9.220496806245564e-05, + "loss": 0.04038854837417603, + "step": 54940 + }, + { + "epoch": 7.79985805535841, + "grad_norm": 6.383355617523193, + "learning_rate": 9.220354861603975e-05, + "loss": 0.043821310997009276, + "step": 54950 + }, + { + "epoch": 7.801277501774308, + "grad_norm": 0.5270785689353943, + "learning_rate": 9.220212916962385e-05, + "loss": 0.039912080764770506, + "step": 54960 + }, + { + "epoch": 7.8026969481902055, + "grad_norm": 1.264320969581604, + "learning_rate": 9.220070972320796e-05, + "loss": 0.03971914649009704, + "step": 54970 + }, + { + "epoch": 7.804116394606104, + "grad_norm": 5.6460862159729, + "learning_rate": 9.219929027679207e-05, + "loss": 0.04472689926624298, + "step": 54980 + }, + { + "epoch": 7.805535841022001, + "grad_norm": 4.065241813659668, + "learning_rate": 9.219787083037615e-05, + "loss": 0.08542245626449585, + "step": 54990 + }, + { + "epoch": 7.8069552874379, + "grad_norm": 1.2237682342529297, + "learning_rate": 9.219645138396026e-05, + "loss": 
0.040656208992004395, + "step": 55000 + }, + { + "epoch": 7.8069552874379, + "eval_accuracy": 0.9778088637375214, + "eval_loss": 0.07491476088762283, + "eval_runtime": 33.0238, + "eval_samples_per_second": 476.233, + "eval_steps_per_second": 14.898, + "step": 55000 + }, + { + "epoch": 7.808374733853797, + "grad_norm": 5.090839862823486, + "learning_rate": 9.219503193754436e-05, + "loss": 0.038296476006507874, + "step": 55010 + }, + { + "epoch": 7.809794180269694, + "grad_norm": 1.009082317352295, + "learning_rate": 9.219361249112847e-05, + "loss": 0.026539346575736998, + "step": 55020 + }, + { + "epoch": 7.811213626685593, + "grad_norm": 2.415933132171631, + "learning_rate": 9.219219304471257e-05, + "loss": 0.05707488656044006, + "step": 55030 + }, + { + "epoch": 7.81263307310149, + "grad_norm": 10.987373352050781, + "learning_rate": 9.219077359829666e-05, + "loss": 0.041009390354156496, + "step": 55040 + }, + { + "epoch": 7.814052519517388, + "grad_norm": 4.925211429595947, + "learning_rate": 9.218935415188076e-05, + "loss": 0.03250816464424133, + "step": 55050 + }, + { + "epoch": 7.815471965933286, + "grad_norm": 0.5724888443946838, + "learning_rate": 9.218793470546487e-05, + "loss": 0.024578140676021577, + "step": 55060 + }, + { + "epoch": 7.816891412349184, + "grad_norm": 5.185868263244629, + "learning_rate": 9.218651525904898e-05, + "loss": 0.11081494092941284, + "step": 55070 + }, + { + "epoch": 7.818310858765082, + "grad_norm": 0.41721123456954956, + "learning_rate": 9.218509581263308e-05, + "loss": 0.02687859833240509, + "step": 55080 + }, + { + "epoch": 7.819730305180979, + "grad_norm": 0.6981793642044067, + "learning_rate": 9.218367636621718e-05, + "loss": 0.043942618370056155, + "step": 55090 + }, + { + "epoch": 7.821149751596877, + "grad_norm": 0.15657079219818115, + "learning_rate": 9.218225691980128e-05, + "loss": 0.022310236096382143, + "step": 55100 + }, + { + "epoch": 7.822569198012775, + "grad_norm": 3.7001686096191406, + "learning_rate": 
9.218083747338539e-05, + "loss": 0.03169908821582794, + "step": 55110 + }, + { + "epoch": 7.823988644428673, + "grad_norm": 0.7200214266777039, + "learning_rate": 9.217941802696948e-05, + "loss": 0.03638424575328827, + "step": 55120 + }, + { + "epoch": 7.8254080908445705, + "grad_norm": 2.6627345085144043, + "learning_rate": 9.21779985805536e-05, + "loss": 0.055549895763397215, + "step": 55130 + }, + { + "epoch": 7.826827537260469, + "grad_norm": 0.5793723464012146, + "learning_rate": 9.217657913413768e-05, + "loss": 0.039405593276023866, + "step": 55140 + }, + { + "epoch": 7.828246983676366, + "grad_norm": 0.7050085067749023, + "learning_rate": 9.217515968772179e-05, + "loss": 0.017613281309604645, + "step": 55150 + }, + { + "epoch": 7.829666430092264, + "grad_norm": 0.11719467490911484, + "learning_rate": 9.21737402413059e-05, + "loss": 0.0698439598083496, + "step": 55160 + }, + { + "epoch": 7.831085876508162, + "grad_norm": 10.498146057128906, + "learning_rate": 9.217232079489e-05, + "loss": 0.046284270286560056, + "step": 55170 + }, + { + "epoch": 7.832505322924059, + "grad_norm": 4.107329368591309, + "learning_rate": 9.217090134847411e-05, + "loss": 0.07234654426574708, + "step": 55180 + }, + { + "epoch": 7.833924769339958, + "grad_norm": 7.7555742263793945, + "learning_rate": 9.21694819020582e-05, + "loss": 0.04864185750484466, + "step": 55190 + }, + { + "epoch": 7.835344215755855, + "grad_norm": 0.09962215274572372, + "learning_rate": 9.21680624556423e-05, + "loss": 0.06997425556182861, + "step": 55200 + }, + { + "epoch": 7.836763662171753, + "grad_norm": 0.10970594733953476, + "learning_rate": 9.21666430092264e-05, + "loss": 0.08814730048179627, + "step": 55210 + }, + { + "epoch": 7.838183108587651, + "grad_norm": 1.0084171295166016, + "learning_rate": 9.216522356281051e-05, + "loss": 0.05179111957550049, + "step": 55220 + }, + { + "epoch": 7.839602555003548, + "grad_norm": 0.7253168821334839, + "learning_rate": 9.216380411639461e-05, + "loss": 
0.0762027621269226, + "step": 55230 + }, + { + "epoch": 7.8410220014194465, + "grad_norm": 0.6522200107574463, + "learning_rate": 9.216238466997871e-05, + "loss": 0.09144155979156494, + "step": 55240 + }, + { + "epoch": 7.842441447835344, + "grad_norm": 0.2461749017238617, + "learning_rate": 9.216096522356282e-05, + "loss": 0.05044819116592407, + "step": 55250 + }, + { + "epoch": 7.843860894251242, + "grad_norm": 0.39150509238243103, + "learning_rate": 9.215954577714692e-05, + "loss": 0.05807398557662964, + "step": 55260 + }, + { + "epoch": 7.84528034066714, + "grad_norm": 0.10233612358570099, + "learning_rate": 9.215812633073103e-05, + "loss": 0.05883774161338806, + "step": 55270 + }, + { + "epoch": 7.846699787083038, + "grad_norm": 0.454196035861969, + "learning_rate": 9.215670688431512e-05, + "loss": 0.05965339541435242, + "step": 55280 + }, + { + "epoch": 7.848119233498935, + "grad_norm": 0.20604932308197021, + "learning_rate": 9.215528743789923e-05, + "loss": 0.03166616261005402, + "step": 55290 + }, + { + "epoch": 7.849538679914833, + "grad_norm": 1.6209155321121216, + "learning_rate": 9.215386799148332e-05, + "loss": 0.009179739654064179, + "step": 55300 + }, + { + "epoch": 7.850958126330731, + "grad_norm": 0.06912713497877121, + "learning_rate": 9.215244854506743e-05, + "loss": 0.014850091934204102, + "step": 55310 + }, + { + "epoch": 7.8523775727466285, + "grad_norm": 4.286614418029785, + "learning_rate": 9.215102909865153e-05, + "loss": 0.018069779872894286, + "step": 55320 + }, + { + "epoch": 7.853797019162527, + "grad_norm": 0.020820684731006622, + "learning_rate": 9.214960965223564e-05, + "loss": 0.017675217986106873, + "step": 55330 + }, + { + "epoch": 7.855216465578424, + "grad_norm": 0.03339977562427521, + "learning_rate": 9.214819020581974e-05, + "loss": 0.03491029143333435, + "step": 55340 + }, + { + "epoch": 7.8566359119943225, + "grad_norm": 1.0540772676467896, + "learning_rate": 9.214677075940383e-05, + "loss": 0.05180479884147644, + "step": 
55350 + }, + { + "epoch": 7.85805535841022, + "grad_norm": 6.752560615539551, + "learning_rate": 9.214535131298794e-05, + "loss": 0.0732165277004242, + "step": 55360 + }, + { + "epoch": 7.859474804826117, + "grad_norm": 6.2393341064453125, + "learning_rate": 9.214393186657204e-05, + "loss": 0.039051464200019835, + "step": 55370 + }, + { + "epoch": 7.860894251242016, + "grad_norm": 0.16125303506851196, + "learning_rate": 9.214251242015615e-05, + "loss": 0.06089925169944763, + "step": 55380 + }, + { + "epoch": 7.862313697657913, + "grad_norm": 0.10348144173622131, + "learning_rate": 9.214109297374025e-05, + "loss": 0.035461637377738955, + "step": 55390 + }, + { + "epoch": 7.863733144073811, + "grad_norm": 0.2502758800983429, + "learning_rate": 9.213967352732435e-05, + "loss": 0.030822911858558656, + "step": 55400 + }, + { + "epoch": 7.865152590489709, + "grad_norm": 0.2445206493139267, + "learning_rate": 9.213825408090844e-05, + "loss": 0.1120613694190979, + "step": 55410 + }, + { + "epoch": 7.866572036905607, + "grad_norm": 2.0001306533813477, + "learning_rate": 9.213683463449255e-05, + "loss": 0.027965742349624633, + "step": 55420 + }, + { + "epoch": 7.8679914833215046, + "grad_norm": 8.780298233032227, + "learning_rate": 9.213541518807665e-05, + "loss": 0.06586897969245911, + "step": 55430 + }, + { + "epoch": 7.869410929737402, + "grad_norm": 0.05981897935271263, + "learning_rate": 9.213399574166076e-05, + "loss": 0.03677443265914917, + "step": 55440 + }, + { + "epoch": 7.8708303761533, + "grad_norm": 6.485029220581055, + "learning_rate": 9.213257629524486e-05, + "loss": 0.05677640438079834, + "step": 55450 + }, + { + "epoch": 7.872249822569198, + "grad_norm": 0.5603395700454712, + "learning_rate": 9.213115684882896e-05, + "loss": 0.02789378762245178, + "step": 55460 + }, + { + "epoch": 7.873669268985096, + "grad_norm": 0.6483574509620667, + "learning_rate": 9.212973740241307e-05, + "loss": 0.03915688693523407, + "step": 55470 + }, + { + "epoch": 
7.875088715400993, + "grad_norm": 9.088214874267578, + "learning_rate": 9.212831795599717e-05, + "loss": 0.03713131546974182, + "step": 55480 + }, + { + "epoch": 7.876508161816892, + "grad_norm": 6.221549034118652, + "learning_rate": 9.212689850958128e-05, + "loss": 0.02524724304676056, + "step": 55490 + }, + { + "epoch": 7.877927608232789, + "grad_norm": 0.056300897151231766, + "learning_rate": 9.212547906316536e-05, + "loss": 0.039009875059127806, + "step": 55500 + }, + { + "epoch": 7.877927608232789, + "eval_accuracy": 0.9793349017613022, + "eval_loss": 0.067763552069664, + "eval_runtime": 32.7453, + "eval_samples_per_second": 480.282, + "eval_steps_per_second": 15.025, + "step": 55500 + }, + { + "epoch": 7.879347054648687, + "grad_norm": 12.098105430603027, + "learning_rate": 9.212405961674947e-05, + "loss": 0.0938036561012268, + "step": 55510 + }, + { + "epoch": 7.880766501064585, + "grad_norm": 4.351677894592285, + "learning_rate": 9.212264017033357e-05, + "loss": 0.1319635510444641, + "step": 55520 + }, + { + "epoch": 7.882185947480482, + "grad_norm": 13.525969505310059, + "learning_rate": 9.212122072391768e-05, + "loss": 0.10193095207214356, + "step": 55530 + }, + { + "epoch": 7.883605393896381, + "grad_norm": 3.3304812908172607, + "learning_rate": 9.211980127750178e-05, + "loss": 0.0709221601486206, + "step": 55540 + }, + { + "epoch": 7.885024840312278, + "grad_norm": 1.3782596588134766, + "learning_rate": 9.211838183108588e-05, + "loss": 0.0606769323348999, + "step": 55550 + }, + { + "epoch": 7.886444286728176, + "grad_norm": 0.3674951195716858, + "learning_rate": 9.211696238466999e-05, + "loss": 0.044303598999977115, + "step": 55560 + }, + { + "epoch": 7.887863733144074, + "grad_norm": 1.6610469818115234, + "learning_rate": 9.211554293825408e-05, + "loss": 0.04471073746681213, + "step": 55570 + }, + { + "epoch": 7.889283179559971, + "grad_norm": 0.8602482676506042, + "learning_rate": 9.21141234918382e-05, + "loss": 0.030886751413345338, + "step": 55580 + 
}, + { + "epoch": 7.8907026259758695, + "grad_norm": 2.7088918685913086, + "learning_rate": 9.211270404542229e-05, + "loss": 0.06906713843345642, + "step": 55590 + }, + { + "epoch": 7.892122072391767, + "grad_norm": 0.3230050504207611, + "learning_rate": 9.211128459900639e-05, + "loss": 0.053826934099197386, + "step": 55600 + }, + { + "epoch": 7.893541518807665, + "grad_norm": 0.27713441848754883, + "learning_rate": 9.210986515259049e-05, + "loss": 0.04268667995929718, + "step": 55610 + }, + { + "epoch": 7.894960965223563, + "grad_norm": 12.684019088745117, + "learning_rate": 9.21084457061746e-05, + "loss": 0.03835551738739014, + "step": 55620 + }, + { + "epoch": 7.896380411639461, + "grad_norm": 0.29225078225135803, + "learning_rate": 9.21070262597587e-05, + "loss": 0.08084606528282165, + "step": 55630 + }, + { + "epoch": 7.897799858055358, + "grad_norm": 0.03829476609826088, + "learning_rate": 9.21056068133428e-05, + "loss": 0.06641941666603088, + "step": 55640 + }, + { + "epoch": 7.899219304471256, + "grad_norm": 0.5478501915931702, + "learning_rate": 9.21041873669269e-05, + "loss": 0.06162059307098389, + "step": 55650 + }, + { + "epoch": 7.900638750887154, + "grad_norm": 2.589578628540039, + "learning_rate": 9.2102767920511e-05, + "loss": 0.025594592094421387, + "step": 55660 + }, + { + "epoch": 7.9020581973030515, + "grad_norm": 7.136966228485107, + "learning_rate": 9.210134847409511e-05, + "loss": 0.0472207635641098, + "step": 55670 + }, + { + "epoch": 7.90347764371895, + "grad_norm": 5.2671966552734375, + "learning_rate": 9.209992902767921e-05, + "loss": 0.06289007067680359, + "step": 55680 + }, + { + "epoch": 7.904897090134847, + "grad_norm": 1.7476320266723633, + "learning_rate": 9.209850958126332e-05, + "loss": 0.02860119938850403, + "step": 55690 + }, + { + "epoch": 7.9063165365507455, + "grad_norm": 1.2390192747116089, + "learning_rate": 9.209709013484742e-05, + "loss": 0.07715204358100891, + "step": 55700 + }, + { + "epoch": 7.907735982966643, + 
"grad_norm": 7.412806510925293, + "learning_rate": 9.209567068843151e-05, + "loss": 0.05441153049468994, + "step": 55710 + }, + { + "epoch": 7.90915542938254, + "grad_norm": 0.0410551056265831, + "learning_rate": 9.209425124201561e-05, + "loss": 0.03540098369121551, + "step": 55720 + }, + { + "epoch": 7.910574875798439, + "grad_norm": 0.848318874835968, + "learning_rate": 9.209283179559972e-05, + "loss": 0.04798963665962219, + "step": 55730 + }, + { + "epoch": 7.911994322214336, + "grad_norm": 0.9137446284294128, + "learning_rate": 9.209141234918382e-05, + "loss": 0.046340417861938474, + "step": 55740 + }, + { + "epoch": 7.913413768630234, + "grad_norm": 0.49675452709198, + "learning_rate": 9.208999290276793e-05, + "loss": 0.03758726119995117, + "step": 55750 + }, + { + "epoch": 7.914833215046132, + "grad_norm": 8.145282745361328, + "learning_rate": 9.208857345635203e-05, + "loss": 0.01985916793346405, + "step": 55760 + }, + { + "epoch": 7.91625266146203, + "grad_norm": 0.5327089428901672, + "learning_rate": 9.208715400993613e-05, + "loss": 0.0250131219625473, + "step": 55770 + }, + { + "epoch": 7.9176721078779275, + "grad_norm": 6.69525146484375, + "learning_rate": 9.208573456352024e-05, + "loss": 0.06276538968086243, + "step": 55780 + }, + { + "epoch": 7.919091554293825, + "grad_norm": 2.453524589538574, + "learning_rate": 9.208431511710433e-05, + "loss": 0.05585165619850159, + "step": 55790 + }, + { + "epoch": 7.920511000709723, + "grad_norm": 1.1722917556762695, + "learning_rate": 9.208289567068844e-05, + "loss": 0.03681559562683105, + "step": 55800 + }, + { + "epoch": 7.921930447125621, + "grad_norm": 2.173949718475342, + "learning_rate": 9.208147622427253e-05, + "loss": 0.0207523837685585, + "step": 55810 + }, + { + "epoch": 7.923349893541519, + "grad_norm": 1.8896828889846802, + "learning_rate": 9.208005677785664e-05, + "loss": 0.012036536633968354, + "step": 55820 + }, + { + "epoch": 7.924769339957416, + "grad_norm": 12.876811981201172, + "learning_rate": 
9.207863733144074e-05, + "loss": 0.06892849206924438, + "step": 55830 + }, + { + "epoch": 7.926188786373315, + "grad_norm": 8.169452667236328, + "learning_rate": 9.207721788502485e-05, + "loss": 0.019931772351264955, + "step": 55840 + }, + { + "epoch": 7.927608232789212, + "grad_norm": 9.522194862365723, + "learning_rate": 9.207594038325053e-05, + "loss": 0.07275225520133972, + "step": 55850 + }, + { + "epoch": 7.9290276792051095, + "grad_norm": 4.947075366973877, + "learning_rate": 9.207452093683464e-05, + "loss": 0.01999004781246185, + "step": 55860 + }, + { + "epoch": 7.930447125621008, + "grad_norm": 0.13232362270355225, + "learning_rate": 9.207310149041874e-05, + "loss": 0.03356763422489166, + "step": 55870 + }, + { + "epoch": 7.931866572036905, + "grad_norm": 7.417023658752441, + "learning_rate": 9.207168204400284e-05, + "loss": 0.023995618522167205, + "step": 55880 + }, + { + "epoch": 7.933286018452804, + "grad_norm": 0.10036950558423996, + "learning_rate": 9.207026259758694e-05, + "loss": 0.05645661950111389, + "step": 55890 + }, + { + "epoch": 7.934705464868701, + "grad_norm": 3.993074417114258, + "learning_rate": 9.206884315117105e-05, + "loss": 0.05878961682319641, + "step": 55900 + }, + { + "epoch": 7.936124911284599, + "grad_norm": 6.879668235778809, + "learning_rate": 9.206742370475516e-05, + "loss": 0.04947432279586792, + "step": 55910 + }, + { + "epoch": 7.937544357700497, + "grad_norm": 1.353629469871521, + "learning_rate": 9.206600425833926e-05, + "loss": 0.06450677514076233, + "step": 55920 + }, + { + "epoch": 7.938963804116394, + "grad_norm": 1.9450629949569702, + "learning_rate": 9.206458481192335e-05, + "loss": 0.08942593336105346, + "step": 55930 + }, + { + "epoch": 7.940383250532292, + "grad_norm": 13.481873512268066, + "learning_rate": 9.206316536550745e-05, + "loss": 0.04448253512382507, + "step": 55940 + }, + { + "epoch": 7.94180269694819, + "grad_norm": 5.543252468109131, + "learning_rate": 9.206174591909156e-05, + "loss": 
0.1036454200744629, + "step": 55950 + }, + { + "epoch": 7.943222143364088, + "grad_norm": 4.105419158935547, + "learning_rate": 9.206032647267566e-05, + "loss": 0.06337498426437378, + "step": 55960 + }, + { + "epoch": 7.944641589779986, + "grad_norm": 0.6640766263008118, + "learning_rate": 9.205890702625977e-05, + "loss": 0.03773975372314453, + "step": 55970 + }, + { + "epoch": 7.946061036195884, + "grad_norm": 2.163017988204956, + "learning_rate": 9.205748757984387e-05, + "loss": 0.09545602798461914, + "step": 55980 + }, + { + "epoch": 7.947480482611781, + "grad_norm": 6.420900821685791, + "learning_rate": 9.205606813342796e-05, + "loss": 0.05853666663169861, + "step": 55990 + }, + { + "epoch": 7.948899929027679, + "grad_norm": 6.487877368927002, + "learning_rate": 9.205464868701207e-05, + "loss": 0.057965916395187375, + "step": 56000 + }, + { + "epoch": 7.948899929027679, + "eval_accuracy": 0.9719590513130285, + "eval_loss": 0.09048442542552948, + "eval_runtime": 34.3429, + "eval_samples_per_second": 457.94, + "eval_steps_per_second": 14.326, + "step": 56000 + }, + { + "epoch": 7.950319375443577, + "grad_norm": 0.12126820534467697, + "learning_rate": 9.205322924059617e-05, + "loss": 0.04208188056945801, + "step": 56010 + }, + { + "epoch": 7.9517388218594744, + "grad_norm": 1.2649909257888794, + "learning_rate": 9.205180979418028e-05, + "loss": 0.041204127669334414, + "step": 56020 + }, + { + "epoch": 7.953158268275373, + "grad_norm": 0.5406652092933655, + "learning_rate": 9.205039034776438e-05, + "loss": 0.013593432307243348, + "step": 56030 + }, + { + "epoch": 7.95457771469127, + "grad_norm": 0.8398789763450623, + "learning_rate": 9.204897090134848e-05, + "loss": 0.024010343849658965, + "step": 56040 + }, + { + "epoch": 7.9559971611071685, + "grad_norm": 8.085817337036133, + "learning_rate": 9.204755145493258e-05, + "loss": 0.039969196915626524, + "step": 56050 + }, + { + "epoch": 7.957416607523066, + "grad_norm": 5.8064398765563965, + "learning_rate": 
9.204613200851669e-05, + "loss": 0.05945647358894348, + "step": 56060 + }, + { + "epoch": 7.958836053938963, + "grad_norm": 5.265396595001221, + "learning_rate": 9.204471256210078e-05, + "loss": 0.049289605021476744, + "step": 56070 + }, + { + "epoch": 7.960255500354862, + "grad_norm": 0.12839733064174652, + "learning_rate": 9.20432931156849e-05, + "loss": 0.05018383264541626, + "step": 56080 + }, + { + "epoch": 7.961674946770759, + "grad_norm": 2.0961010456085205, + "learning_rate": 9.204187366926899e-05, + "loss": 0.06823894381523132, + "step": 56090 + }, + { + "epoch": 7.963094393186657, + "grad_norm": 4.042204856872559, + "learning_rate": 9.204045422285309e-05, + "loss": 0.04348786175251007, + "step": 56100 + }, + { + "epoch": 7.964513839602555, + "grad_norm": 5.022154331207275, + "learning_rate": 9.20390347764372e-05, + "loss": 0.05021085143089295, + "step": 56110 + }, + { + "epoch": 7.965933286018453, + "grad_norm": 3.282322406768799, + "learning_rate": 9.20376153300213e-05, + "loss": 0.0450539231300354, + "step": 56120 + }, + { + "epoch": 7.9673527324343505, + "grad_norm": 7.373341083526611, + "learning_rate": 9.203619588360541e-05, + "loss": 0.044395309686660764, + "step": 56130 + }, + { + "epoch": 7.968772178850248, + "grad_norm": 5.55653715133667, + "learning_rate": 9.203477643718949e-05, + "loss": 0.034415480494499204, + "step": 56140 + }, + { + "epoch": 7.970191625266146, + "grad_norm": 4.097558975219727, + "learning_rate": 9.20333569907736e-05, + "loss": 0.08280274868011475, + "step": 56150 + }, + { + "epoch": 7.971611071682044, + "grad_norm": 0.11635271459817886, + "learning_rate": 9.20319375443577e-05, + "loss": 0.010257638990879059, + "step": 56160 + }, + { + "epoch": 7.973030518097942, + "grad_norm": 0.73923659324646, + "learning_rate": 9.203051809794181e-05, + "loss": 0.02687770426273346, + "step": 56170 + }, + { + "epoch": 7.974449964513839, + "grad_norm": 0.6702300310134888, + "learning_rate": 9.202909865152591e-05, + "loss": 
0.053887850046157836, + "step": 56180 + }, + { + "epoch": 7.975869410929738, + "grad_norm": 2.243748664855957, + "learning_rate": 9.202767920511e-05, + "loss": 0.03436025381088257, + "step": 56190 + }, + { + "epoch": 7.977288857345635, + "grad_norm": 0.31168264150619507, + "learning_rate": 9.202625975869412e-05, + "loss": 0.08324195742607117, + "step": 56200 + }, + { + "epoch": 7.9787083037615325, + "grad_norm": 0.38794392347335815, + "learning_rate": 9.202484031227821e-05, + "loss": 0.026874610781669618, + "step": 56210 + }, + { + "epoch": 7.980127750177431, + "grad_norm": 1.8175753355026245, + "learning_rate": 9.202342086586233e-05, + "loss": 0.020836614072322845, + "step": 56220 + }, + { + "epoch": 7.981547196593328, + "grad_norm": 0.1702578067779541, + "learning_rate": 9.202200141944642e-05, + "loss": 0.09062458276748657, + "step": 56230 + }, + { + "epoch": 7.9829666430092265, + "grad_norm": 4.012740612030029, + "learning_rate": 9.202058197303052e-05, + "loss": 0.054927331209182736, + "step": 56240 + }, + { + "epoch": 7.984386089425124, + "grad_norm": 1.2998656034469604, + "learning_rate": 9.201916252661462e-05, + "loss": 0.01595239043235779, + "step": 56250 + }, + { + "epoch": 7.985805535841022, + "grad_norm": 0.5248915553092957, + "learning_rate": 9.201774308019873e-05, + "loss": 0.054577767848968506, + "step": 56260 + }, + { + "epoch": 7.98722498225692, + "grad_norm": 8.47024154663086, + "learning_rate": 9.201632363378283e-05, + "loss": 0.07780020236968994, + "step": 56270 + }, + { + "epoch": 7.988644428672818, + "grad_norm": 0.10993131995201111, + "learning_rate": 9.201490418736694e-05, + "loss": 0.006740601360797882, + "step": 56280 + }, + { + "epoch": 7.990063875088715, + "grad_norm": 2.291124105453491, + "learning_rate": 9.201348474095103e-05, + "loss": 0.0112013079226017, + "step": 56290 + }, + { + "epoch": 7.991483321504613, + "grad_norm": 1.062869906425476, + "learning_rate": 9.201206529453513e-05, + "loss": 0.013367721438407898, + "step": 56300 + }, 
+ { + "epoch": 7.992902767920511, + "grad_norm": 0.40215814113616943, + "learning_rate": 9.201064584811924e-05, + "loss": 0.01611262857913971, + "step": 56310 + }, + { + "epoch": 7.9943222143364085, + "grad_norm": 2.8733534812927246, + "learning_rate": 9.200922640170334e-05, + "loss": 0.010476227104663848, + "step": 56320 + }, + { + "epoch": 7.995741660752307, + "grad_norm": 4.937908172607422, + "learning_rate": 9.200780695528745e-05, + "loss": 0.018619158864021303, + "step": 56330 + }, + { + "epoch": 7.997161107168204, + "grad_norm": 6.0632829666137695, + "learning_rate": 9.200638750887155e-05, + "loss": 0.024484434723854066, + "step": 56340 + }, + { + "epoch": 7.998580553584103, + "grad_norm": 0.44283390045166016, + "learning_rate": 9.200496806245565e-05, + "loss": 0.005785078555345535, + "step": 56350 + }, + { + "epoch": 8.0, + "grad_norm": 10.04196548461914, + "learning_rate": 9.200354861603974e-05, + "loss": 0.03202285170555115, + "step": 56360 + }, + { + "epoch": 8.001419446415898, + "grad_norm": 3.1271493434906006, + "learning_rate": 9.200212916962385e-05, + "loss": 0.00439850240945816, + "step": 56370 + }, + { + "epoch": 8.002838892831795, + "grad_norm": 6.92768669128418, + "learning_rate": 9.200070972320795e-05, + "loss": 0.04164240658283234, + "step": 56380 + }, + { + "epoch": 8.004258339247693, + "grad_norm": 1.7308955192565918, + "learning_rate": 9.199929027679206e-05, + "loss": 0.018328216671943665, + "step": 56390 + }, + { + "epoch": 8.005677785663591, + "grad_norm": 3.8155698776245117, + "learning_rate": 9.199787083037616e-05, + "loss": 0.05507233142852783, + "step": 56400 + }, + { + "epoch": 8.00709723207949, + "grad_norm": 1.0746243000030518, + "learning_rate": 9.199645138396026e-05, + "loss": 0.044785207509994505, + "step": 56410 + }, + { + "epoch": 8.008516678495386, + "grad_norm": 3.7036960124969482, + "learning_rate": 9.199503193754437e-05, + "loss": 0.08312212824821472, + "step": 56420 + }, + { + "epoch": 8.009936124911285, + "grad_norm": 
0.4116200804710388, + "learning_rate": 9.199361249112847e-05, + "loss": 0.06781967878341674, + "step": 56430 + }, + { + "epoch": 8.011355571327183, + "grad_norm": 1.766020655632019, + "learning_rate": 9.199219304471258e-05, + "loss": 0.05059321522712708, + "step": 56440 + }, + { + "epoch": 8.01277501774308, + "grad_norm": 8.156139373779297, + "learning_rate": 9.199077359829666e-05, + "loss": 0.042923194169998166, + "step": 56450 + }, + { + "epoch": 8.014194464158978, + "grad_norm": 7.318849086761475, + "learning_rate": 9.198935415188077e-05, + "loss": 0.022391645610332488, + "step": 56460 + }, + { + "epoch": 8.015613910574876, + "grad_norm": 7.450283527374268, + "learning_rate": 9.198793470546487e-05, + "loss": 0.0307634174823761, + "step": 56470 + }, + { + "epoch": 8.017033356990774, + "grad_norm": 0.19863024353981018, + "learning_rate": 9.198651525904898e-05, + "loss": 0.02928740680217743, + "step": 56480 + }, + { + "epoch": 8.01845280340667, + "grad_norm": 3.1025230884552, + "learning_rate": 9.198509581263308e-05, + "loss": 0.031206589937210084, + "step": 56490 + }, + { + "epoch": 8.01987224982257, + "grad_norm": 0.772976279258728, + "learning_rate": 9.198367636621717e-05, + "loss": 0.022878825664520264, + "step": 56500 + }, + { + "epoch": 8.01987224982257, + "eval_accuracy": 0.9783811279964393, + "eval_loss": 0.06529545783996582, + "eval_runtime": 32.6501, + "eval_samples_per_second": 481.682, + "eval_steps_per_second": 15.069, + "step": 56500 + }, + { + "epoch": 8.021291696238467, + "grad_norm": 0.32695847749710083, + "learning_rate": 9.198225691980129e-05, + "loss": 0.02905186414718628, + "step": 56510 + }, + { + "epoch": 8.022711142654364, + "grad_norm": 7.177985668182373, + "learning_rate": 9.198083747338538e-05, + "loss": 0.047767966985702515, + "step": 56520 + }, + { + "epoch": 8.024130589070262, + "grad_norm": 3.370894432067871, + "learning_rate": 9.19794180269695e-05, + "loss": 0.03640216886997223, + "step": 56530 + }, + { + "epoch": 8.02555003548616, + 
"grad_norm": 6.081117153167725, + "learning_rate": 9.197799858055359e-05, + "loss": 0.04813358187675476, + "step": 56540 + }, + { + "epoch": 8.026969481902059, + "grad_norm": 0.587363600730896, + "learning_rate": 9.197657913413769e-05, + "loss": 0.04699629247188568, + "step": 56550 + }, + { + "epoch": 8.028388928317955, + "grad_norm": 2.6055822372436523, + "learning_rate": 9.197515968772179e-05, + "loss": 0.012260462343692779, + "step": 56560 + }, + { + "epoch": 8.029808374733854, + "grad_norm": 1.192359447479248, + "learning_rate": 9.19737402413059e-05, + "loss": 0.08580980896949768, + "step": 56570 + }, + { + "epoch": 8.031227821149752, + "grad_norm": 2.338804006576538, + "learning_rate": 9.197232079489e-05, + "loss": 0.04305451214313507, + "step": 56580 + }, + { + "epoch": 8.032647267565649, + "grad_norm": 1.1648589372634888, + "learning_rate": 9.19709013484741e-05, + "loss": 0.027252352237701415, + "step": 56590 + }, + { + "epoch": 8.034066713981547, + "grad_norm": 0.39368653297424316, + "learning_rate": 9.19694819020582e-05, + "loss": 0.027997153997421264, + "step": 56600 + }, + { + "epoch": 8.035486160397445, + "grad_norm": 12.663429260253906, + "learning_rate": 9.19680624556423e-05, + "loss": 0.09810233116149902, + "step": 56610 + }, + { + "epoch": 8.036905606813344, + "grad_norm": 0.03624679520726204, + "learning_rate": 9.196664300922641e-05, + "loss": 0.05567052960395813, + "step": 56620 + }, + { + "epoch": 8.03832505322924, + "grad_norm": 2.5650811195373535, + "learning_rate": 9.196522356281051e-05, + "loss": 0.044278931617736814, + "step": 56630 + }, + { + "epoch": 8.039744499645138, + "grad_norm": 4.6383161544799805, + "learning_rate": 9.196380411639462e-05, + "loss": 0.05508902668952942, + "step": 56640 + }, + { + "epoch": 8.041163946061037, + "grad_norm": 0.8782206177711487, + "learning_rate": 9.19623846699787e-05, + "loss": 0.0375711590051651, + "step": 56650 + }, + { + "epoch": 8.042583392476933, + "grad_norm": 0.17253684997558594, + 
"learning_rate": 9.196096522356281e-05, + "loss": 0.02836502194404602, + "step": 56660 + }, + { + "epoch": 8.044002838892832, + "grad_norm": 0.18491551280021667, + "learning_rate": 9.195954577714691e-05, + "loss": 0.007738977670669556, + "step": 56670 + }, + { + "epoch": 8.04542228530873, + "grad_norm": 4.824859619140625, + "learning_rate": 9.195812633073102e-05, + "loss": 0.06535216569900512, + "step": 56680 + }, + { + "epoch": 8.046841731724628, + "grad_norm": 0.2667035758495331, + "learning_rate": 9.195670688431512e-05, + "loss": 0.051838308572769165, + "step": 56690 + }, + { + "epoch": 8.048261178140525, + "grad_norm": 1.4201091527938843, + "learning_rate": 9.195528743789923e-05, + "loss": 0.010924571752548217, + "step": 56700 + }, + { + "epoch": 8.049680624556423, + "grad_norm": 5.541697025299072, + "learning_rate": 9.195386799148333e-05, + "loss": 0.03459354043006897, + "step": 56710 + }, + { + "epoch": 8.051100070972321, + "grad_norm": 6.982559680938721, + "learning_rate": 9.195244854506742e-05, + "loss": 0.05080728530883789, + "step": 56720 + }, + { + "epoch": 8.052519517388218, + "grad_norm": 0.2359095960855484, + "learning_rate": 9.195102909865154e-05, + "loss": 0.004116867855191231, + "step": 56730 + }, + { + "epoch": 8.053938963804116, + "grad_norm": 4.762861251831055, + "learning_rate": 9.194960965223563e-05, + "loss": 0.030681657791137695, + "step": 56740 + }, + { + "epoch": 8.055358410220014, + "grad_norm": 0.20742924511432648, + "learning_rate": 9.194819020581974e-05, + "loss": 0.028750818967819215, + "step": 56750 + }, + { + "epoch": 8.056777856635913, + "grad_norm": 0.384204238653183, + "learning_rate": 9.194677075940383e-05, + "loss": 0.05377202033996582, + "step": 56760 + }, + { + "epoch": 8.05819730305181, + "grad_norm": 0.2828398644924164, + "learning_rate": 9.194535131298794e-05, + "loss": 0.053729516267776486, + "step": 56770 + }, + { + "epoch": 8.059616749467708, + "grad_norm": 2.284846305847168, + "learning_rate": 9.194393186657204e-05, + 
"loss": 0.029895415902137755, + "step": 56780 + }, + { + "epoch": 8.061036195883606, + "grad_norm": 2.943188190460205, + "learning_rate": 9.194251242015615e-05, + "loss": 0.04509938657283783, + "step": 56790 + }, + { + "epoch": 8.062455642299502, + "grad_norm": 0.010644367896020412, + "learning_rate": 9.194109297374024e-05, + "loss": 0.019495250284671785, + "step": 56800 + }, + { + "epoch": 8.0638750887154, + "grad_norm": 6.129455089569092, + "learning_rate": 9.193967352732434e-05, + "loss": 0.043457993865013124, + "step": 56810 + }, + { + "epoch": 8.065294535131299, + "grad_norm": 0.03838249295949936, + "learning_rate": 9.193825408090845e-05, + "loss": 0.02848859131336212, + "step": 56820 + }, + { + "epoch": 8.066713981547197, + "grad_norm": 1.1387866735458374, + "learning_rate": 9.193683463449255e-05, + "loss": 0.018293626606464386, + "step": 56830 + }, + { + "epoch": 8.068133427963094, + "grad_norm": 4.466863632202148, + "learning_rate": 9.193541518807666e-05, + "loss": 0.0418965220451355, + "step": 56840 + }, + { + "epoch": 8.069552874378992, + "grad_norm": 0.7588008642196655, + "learning_rate": 9.193399574166076e-05, + "loss": 0.03291020691394806, + "step": 56850 + }, + { + "epoch": 8.07097232079489, + "grad_norm": 11.674447059631348, + "learning_rate": 9.193257629524486e-05, + "loss": 0.035113191604614256, + "step": 56860 + }, + { + "epoch": 8.072391767210787, + "grad_norm": 4.501001358032227, + "learning_rate": 9.193115684882895e-05, + "loss": 0.0164683535695076, + "step": 56870 + }, + { + "epoch": 8.073811213626685, + "grad_norm": 0.5031645894050598, + "learning_rate": 9.192973740241306e-05, + "loss": 0.049401518702507016, + "step": 56880 + }, + { + "epoch": 8.075230660042584, + "grad_norm": 0.4161587655544281, + "learning_rate": 9.192831795599716e-05, + "loss": 0.041995969414710996, + "step": 56890 + }, + { + "epoch": 8.076650106458482, + "grad_norm": 1.415582299232483, + "learning_rate": 9.192689850958127e-05, + "loss": 0.007475908100605011, + "step": 
56900 + }, + { + "epoch": 8.078069552874378, + "grad_norm": 13.384356498718262, + "learning_rate": 9.192547906316537e-05, + "loss": 0.0641768991947174, + "step": 56910 + }, + { + "epoch": 8.079488999290277, + "grad_norm": 2.316368818283081, + "learning_rate": 9.192405961674947e-05, + "loss": 0.019245807826519013, + "step": 56920 + }, + { + "epoch": 8.080908445706175, + "grad_norm": 12.386263847351074, + "learning_rate": 9.192264017033358e-05, + "loss": 0.023176319897174835, + "step": 56930 + }, + { + "epoch": 8.082327892122072, + "grad_norm": 1.6595792770385742, + "learning_rate": 9.192122072391768e-05, + "loss": 0.056136542558670045, + "step": 56940 + }, + { + "epoch": 8.08374733853797, + "grad_norm": 8.082763671875, + "learning_rate": 9.191980127750179e-05, + "loss": 0.08039953112602234, + "step": 56950 + }, + { + "epoch": 8.085166784953868, + "grad_norm": 2.5632402896881104, + "learning_rate": 9.191838183108587e-05, + "loss": 0.03842626512050629, + "step": 56960 + }, + { + "epoch": 8.086586231369767, + "grad_norm": 1.1970226764678955, + "learning_rate": 9.191696238466998e-05, + "loss": 0.018404172360897066, + "step": 56970 + }, + { + "epoch": 8.088005677785663, + "grad_norm": 3.487342119216919, + "learning_rate": 9.191554293825408e-05, + "loss": 0.013470767438411713, + "step": 56980 + }, + { + "epoch": 8.089425124201561, + "grad_norm": 0.19747193157672882, + "learning_rate": 9.191412349183819e-05, + "loss": 0.03778769075870514, + "step": 56990 + }, + { + "epoch": 8.09084457061746, + "grad_norm": 0.46198809146881104, + "learning_rate": 9.191270404542229e-05, + "loss": 0.029198932647705077, + "step": 57000 + }, + { + "epoch": 8.09084457061746, + "eval_accuracy": 0.983849430914987, + "eval_loss": 0.050795987248420715, + "eval_runtime": 33.4179, + "eval_samples_per_second": 470.616, + "eval_steps_per_second": 14.723, + "step": 57000 + }, + { + "epoch": 8.092264017033356, + "grad_norm": 1.7470345497131348, + "learning_rate": 9.191128459900638e-05, + "loss": 
0.028783124685287476, + "step": 57010 + }, + { + "epoch": 8.093683463449254, + "grad_norm": 10.447305679321289, + "learning_rate": 9.19098651525905e-05, + "loss": 0.03660332858562469, + "step": 57020 + }, + { + "epoch": 8.095102909865153, + "grad_norm": 2.9325790405273438, + "learning_rate": 9.190844570617459e-05, + "loss": 0.006524309515953064, + "step": 57030 + }, + { + "epoch": 8.096522356281051, + "grad_norm": 0.6765064597129822, + "learning_rate": 9.19070262597587e-05, + "loss": 0.04815886616706848, + "step": 57040 + }, + { + "epoch": 8.097941802696948, + "grad_norm": 0.4403517544269562, + "learning_rate": 9.19056068133428e-05, + "loss": 0.009912458062171937, + "step": 57050 + }, + { + "epoch": 8.099361249112846, + "grad_norm": 0.5875139236450195, + "learning_rate": 9.190418736692691e-05, + "loss": 0.09076798558235169, + "step": 57060 + }, + { + "epoch": 8.100780695528744, + "grad_norm": 0.27696019411087036, + "learning_rate": 9.1902767920511e-05, + "loss": 0.022283504903316497, + "step": 57070 + }, + { + "epoch": 8.10220014194464, + "grad_norm": 0.488571435213089, + "learning_rate": 9.19013484740951e-05, + "loss": 0.01099751442670822, + "step": 57080 + }, + { + "epoch": 8.103619588360539, + "grad_norm": 0.2157941311597824, + "learning_rate": 9.18999290276792e-05, + "loss": 0.018986338376998903, + "step": 57090 + }, + { + "epoch": 8.105039034776437, + "grad_norm": 0.009966165758669376, + "learning_rate": 9.189850958126331e-05, + "loss": 0.03489084541797638, + "step": 57100 + }, + { + "epoch": 8.106458481192336, + "grad_norm": 0.18608416616916656, + "learning_rate": 9.189709013484741e-05, + "loss": 0.055800986289978025, + "step": 57110 + }, + { + "epoch": 8.107877927608232, + "grad_norm": 0.023776527494192123, + "learning_rate": 9.189567068843151e-05, + "loss": 0.050034058094024655, + "step": 57120 + }, + { + "epoch": 8.10929737402413, + "grad_norm": 1.362025260925293, + "learning_rate": 9.189425124201562e-05, + "loss": 0.018491455912590028, + "step": 57130 + 
}, + { + "epoch": 8.110716820440029, + "grad_norm": 15.343968391418457, + "learning_rate": 9.189283179559972e-05, + "loss": 0.10932686328887939, + "step": 57140 + }, + { + "epoch": 8.112136266855925, + "grad_norm": 0.018440308049321175, + "learning_rate": 9.189141234918383e-05, + "loss": 0.046657025814056396, + "step": 57150 + }, + { + "epoch": 8.113555713271824, + "grad_norm": 0.047869276255369186, + "learning_rate": 9.188999290276793e-05, + "loss": 0.03132939338684082, + "step": 57160 + }, + { + "epoch": 8.114975159687722, + "grad_norm": 0.28167399764060974, + "learning_rate": 9.188857345635202e-05, + "loss": 0.03517512679100036, + "step": 57170 + }, + { + "epoch": 8.11639460610362, + "grad_norm": 1.3338134288787842, + "learning_rate": 9.188715400993612e-05, + "loss": 0.057652842998504636, + "step": 57180 + }, + { + "epoch": 8.117814052519517, + "grad_norm": 0.7916436791419983, + "learning_rate": 9.188573456352023e-05, + "loss": 0.01187950223684311, + "step": 57190 + }, + { + "epoch": 8.119233498935415, + "grad_norm": 10.301247596740723, + "learning_rate": 9.188431511710433e-05, + "loss": 0.027448675036430357, + "step": 57200 + }, + { + "epoch": 8.120652945351313, + "grad_norm": 0.07494665682315826, + "learning_rate": 9.188289567068844e-05, + "loss": 0.0361156702041626, + "step": 57210 + }, + { + "epoch": 8.12207239176721, + "grad_norm": 0.010714360512793064, + "learning_rate": 9.188147622427254e-05, + "loss": 0.08930673599243164, + "step": 57220 + }, + { + "epoch": 8.123491838183108, + "grad_norm": 1.554410696029663, + "learning_rate": 9.188005677785663e-05, + "loss": 0.04430621564388275, + "step": 57230 + }, + { + "epoch": 8.124911284599007, + "grad_norm": 1.4681209325790405, + "learning_rate": 9.187863733144075e-05, + "loss": 0.02614140808582306, + "step": 57240 + }, + { + "epoch": 8.126330731014905, + "grad_norm": 0.13754108548164368, + "learning_rate": 9.187721788502484e-05, + "loss": 0.017473718523979186, + "step": 57250 + }, + { + "epoch": 
8.127750177430801, + "grad_norm": 7.498713493347168, + "learning_rate": 9.187579843860895e-05, + "loss": 0.06623184084892272, + "step": 57260 + }, + { + "epoch": 8.1291696238467, + "grad_norm": 4.109618186950684, + "learning_rate": 9.187437899219304e-05, + "loss": 0.016468723118305207, + "step": 57270 + }, + { + "epoch": 8.130589070262598, + "grad_norm": 0.10974710434675217, + "learning_rate": 9.187295954577715e-05, + "loss": 0.03356311023235321, + "step": 57280 + }, + { + "epoch": 8.132008516678495, + "grad_norm": 0.2429163157939911, + "learning_rate": 9.187154009936125e-05, + "loss": 0.011898426711559296, + "step": 57290 + }, + { + "epoch": 8.133427963094393, + "grad_norm": 1.5502437353134155, + "learning_rate": 9.187012065294536e-05, + "loss": 0.03775279223918915, + "step": 57300 + }, + { + "epoch": 8.134847409510291, + "grad_norm": 1.5370547771453857, + "learning_rate": 9.186870120652947e-05, + "loss": 0.07396373748779297, + "step": 57310 + }, + { + "epoch": 8.13626685592619, + "grad_norm": 0.12712019681930542, + "learning_rate": 9.186728176011355e-05, + "loss": 0.032404530048370364, + "step": 57320 + }, + { + "epoch": 8.137686302342086, + "grad_norm": 3.8964903354644775, + "learning_rate": 9.186586231369766e-05, + "loss": 0.01291709989309311, + "step": 57330 + }, + { + "epoch": 8.139105748757984, + "grad_norm": 2.898106575012207, + "learning_rate": 9.186444286728176e-05, + "loss": 0.027319177985191345, + "step": 57340 + }, + { + "epoch": 8.140525195173883, + "grad_norm": 6.190265655517578, + "learning_rate": 9.186302342086587e-05, + "loss": 0.022334299981594086, + "step": 57350 + }, + { + "epoch": 8.14194464158978, + "grad_norm": 6.5095038414001465, + "learning_rate": 9.186160397444997e-05, + "loss": 0.02422604411840439, + "step": 57360 + }, + { + "epoch": 8.143364088005677, + "grad_norm": 0.18886369466781616, + "learning_rate": 9.186018452803407e-05, + "loss": 0.08141739964485169, + "step": 57370 + }, + { + "epoch": 8.144783534421576, + "grad_norm": 
0.38172465562820435, + "learning_rate": 9.185876508161816e-05, + "loss": 0.027005189657211305, + "step": 57380 + }, + { + "epoch": 8.146202980837474, + "grad_norm": 3.8629918098449707, + "learning_rate": 9.185734563520227e-05, + "loss": 0.015311364829540253, + "step": 57390 + }, + { + "epoch": 8.14762242725337, + "grad_norm": 0.2354840785264969, + "learning_rate": 9.185592618878639e-05, + "loss": 0.06443232297897339, + "step": 57400 + }, + { + "epoch": 8.149041873669269, + "grad_norm": 8.87781047821045, + "learning_rate": 9.185450674237048e-05, + "loss": 0.02635810077190399, + "step": 57410 + }, + { + "epoch": 8.150461320085167, + "grad_norm": 0.2784138023853302, + "learning_rate": 9.18530872959546e-05, + "loss": 0.038064810633659366, + "step": 57420 + }, + { + "epoch": 8.151880766501064, + "grad_norm": 1.3603695631027222, + "learning_rate": 9.185166784953868e-05, + "loss": 0.009778007864952087, + "step": 57430 + }, + { + "epoch": 8.153300212916962, + "grad_norm": 1.439544916152954, + "learning_rate": 9.185024840312279e-05, + "loss": 0.03558608889579773, + "step": 57440 + }, + { + "epoch": 8.15471965933286, + "grad_norm": 0.5217808485031128, + "learning_rate": 9.184882895670689e-05, + "loss": 0.017676195502281188, + "step": 57450 + }, + { + "epoch": 8.156139105748759, + "grad_norm": 0.015781676396727562, + "learning_rate": 9.1847409510291e-05, + "loss": 0.013397561013698578, + "step": 57460 + }, + { + "epoch": 8.157558552164655, + "grad_norm": 0.17529694736003876, + "learning_rate": 9.18459900638751e-05, + "loss": 0.02492748200893402, + "step": 57470 + }, + { + "epoch": 8.158977998580554, + "grad_norm": 1.654995083808899, + "learning_rate": 9.184457061745919e-05, + "loss": 0.01247488558292389, + "step": 57480 + }, + { + "epoch": 8.160397444996452, + "grad_norm": 2.895176410675049, + "learning_rate": 9.18431511710433e-05, + "loss": 0.04395711421966553, + "step": 57490 + }, + { + "epoch": 8.161816891412348, + "grad_norm": 0.12417486310005188, + "learning_rate": 
9.18417317246274e-05, + "loss": 0.018846186995506286, + "step": 57500 + }, + { + "epoch": 8.161816891412348, + "eval_accuracy": 0.9755833916195078, + "eval_loss": 0.08238392323255539, + "eval_runtime": 32.0699, + "eval_samples_per_second": 490.398, + "eval_steps_per_second": 15.342, + "step": 57500 + }, + { + "epoch": 8.163236337828247, + "grad_norm": 4.528477668762207, + "learning_rate": 9.184031227821151e-05, + "loss": 0.0625986099243164, + "step": 57510 + }, + { + "epoch": 8.164655784244145, + "grad_norm": 1.188218355178833, + "learning_rate": 9.183889283179561e-05, + "loss": 0.024391371011734008, + "step": 57520 + }, + { + "epoch": 8.166075230660043, + "grad_norm": 0.12762023508548737, + "learning_rate": 9.18374733853797e-05, + "loss": 0.058549624681472776, + "step": 57530 + }, + { + "epoch": 8.16749467707594, + "grad_norm": 7.117990016937256, + "learning_rate": 9.18360539389638e-05, + "loss": 0.05282506942749023, + "step": 57540 + }, + { + "epoch": 8.168914123491838, + "grad_norm": 1.1791610717773438, + "learning_rate": 9.183463449254791e-05, + "loss": 0.016684700548648835, + "step": 57550 + }, + { + "epoch": 8.170333569907736, + "grad_norm": 0.29182952642440796, + "learning_rate": 9.183321504613201e-05, + "loss": 0.03156082630157471, + "step": 57560 + }, + { + "epoch": 8.171753016323633, + "grad_norm": 0.5603981018066406, + "learning_rate": 9.183179559971612e-05, + "loss": 0.045778483152389526, + "step": 57570 + }, + { + "epoch": 8.173172462739531, + "grad_norm": 10.272153854370117, + "learning_rate": 9.183037615330022e-05, + "loss": 0.0619217574596405, + "step": 57580 + }, + { + "epoch": 8.17459190915543, + "grad_norm": 0.1799510270357132, + "learning_rate": 9.182895670688432e-05, + "loss": 0.03966604471206665, + "step": 57590 + }, + { + "epoch": 8.176011355571328, + "grad_norm": 0.05715738981962204, + "learning_rate": 9.182753726046843e-05, + "loss": 0.02208338528871536, + "step": 57600 + }, + { + "epoch": 8.177430801987224, + "grad_norm": 
1.5309598445892334, + "learning_rate": 9.182611781405252e-05, + "loss": 0.00791141539812088, + "step": 57610 + }, + { + "epoch": 8.178850248403123, + "grad_norm": 4.317909240722656, + "learning_rate": 9.182469836763664e-05, + "loss": 0.01654169410467148, + "step": 57620 + }, + { + "epoch": 8.180269694819021, + "grad_norm": 3.71759033203125, + "learning_rate": 9.182327892122072e-05, + "loss": 0.0275895893573761, + "step": 57630 + }, + { + "epoch": 8.181689141234918, + "grad_norm": 6.036670684814453, + "learning_rate": 9.182185947480483e-05, + "loss": 0.05108698606491089, + "step": 57640 + }, + { + "epoch": 8.183108587650816, + "grad_norm": 3.292255163192749, + "learning_rate": 9.182044002838893e-05, + "loss": 0.023200985789299012, + "step": 57650 + }, + { + "epoch": 8.184528034066714, + "grad_norm": 1.7355892658233643, + "learning_rate": 9.181902058197304e-05, + "loss": 0.046505868434906006, + "step": 57660 + }, + { + "epoch": 8.185947480482612, + "grad_norm": 2.1724367141723633, + "learning_rate": 9.181760113555714e-05, + "loss": 0.03132735192775726, + "step": 57670 + }, + { + "epoch": 8.187366926898509, + "grad_norm": 8.026420593261719, + "learning_rate": 9.181618168914123e-05, + "loss": 0.04148833453655243, + "step": 57680 + }, + { + "epoch": 8.188786373314407, + "grad_norm": 4.966795921325684, + "learning_rate": 9.181476224272534e-05, + "loss": 0.05792571902275086, + "step": 57690 + }, + { + "epoch": 8.190205819730306, + "grad_norm": 0.702034056186676, + "learning_rate": 9.181334279630944e-05, + "loss": 0.021608872711658476, + "step": 57700 + }, + { + "epoch": 8.191625266146202, + "grad_norm": 0.0565681979060173, + "learning_rate": 9.181192334989355e-05, + "loss": 0.009969682991504669, + "step": 57710 + }, + { + "epoch": 8.1930447125621, + "grad_norm": 0.2924593985080719, + "learning_rate": 9.181050390347765e-05, + "loss": 0.04140026867389679, + "step": 57720 + }, + { + "epoch": 8.194464158977999, + "grad_norm": 7.954497337341309, + "learning_rate": 
9.180908445706175e-05, + "loss": 0.06450521945953369, + "step": 57730 + }, + { + "epoch": 8.195883605393897, + "grad_norm": 0.15417218208312988, + "learning_rate": 9.180766501064584e-05, + "loss": 0.02865954041481018, + "step": 57740 + }, + { + "epoch": 8.197303051809794, + "grad_norm": 1.5217573642730713, + "learning_rate": 9.180624556422996e-05, + "loss": 0.05589728355407715, + "step": 57750 + }, + { + "epoch": 8.198722498225692, + "grad_norm": 11.092070579528809, + "learning_rate": 9.180482611781405e-05, + "loss": 0.06653887033462524, + "step": 57760 + }, + { + "epoch": 8.20014194464159, + "grad_norm": 1.9384713172912598, + "learning_rate": 9.180340667139816e-05, + "loss": 0.015503853559494019, + "step": 57770 + }, + { + "epoch": 8.201561391057487, + "grad_norm": 3.561843156814575, + "learning_rate": 9.180198722498226e-05, + "loss": 0.06690990924835205, + "step": 57780 + }, + { + "epoch": 8.202980837473385, + "grad_norm": 0.6527613997459412, + "learning_rate": 9.180056777856636e-05, + "loss": 0.02870522141456604, + "step": 57790 + }, + { + "epoch": 8.204400283889283, + "grad_norm": 0.692225992679596, + "learning_rate": 9.179914833215047e-05, + "loss": 0.047651296854019164, + "step": 57800 + }, + { + "epoch": 8.205819730305182, + "grad_norm": 0.11748456209897995, + "learning_rate": 9.179772888573457e-05, + "loss": 0.026576727628707886, + "step": 57810 + }, + { + "epoch": 8.207239176721078, + "grad_norm": 5.99944543838501, + "learning_rate": 9.179630943931868e-05, + "loss": 0.061805450916290285, + "step": 57820 + }, + { + "epoch": 8.208658623136976, + "grad_norm": 3.832113265991211, + "learning_rate": 9.179488999290278e-05, + "loss": 0.01249971017241478, + "step": 57830 + }, + { + "epoch": 8.210078069552875, + "grad_norm": 0.19495727121829987, + "learning_rate": 9.179347054648687e-05, + "loss": 0.05017414689064026, + "step": 57840 + }, + { + "epoch": 8.211497515968771, + "grad_norm": 0.5911961197853088, + "learning_rate": 9.179205110007097e-05, + "loss": 
0.025897520780563354, + "step": 57850 + }, + { + "epoch": 8.21291696238467, + "grad_norm": 5.365500450134277, + "learning_rate": 9.179063165365508e-05, + "loss": 0.037281885743141174, + "step": 57860 + }, + { + "epoch": 8.214336408800568, + "grad_norm": 1.0354958772659302, + "learning_rate": 9.178921220723918e-05, + "loss": 0.028408104181289674, + "step": 57870 + }, + { + "epoch": 8.215755855216466, + "grad_norm": 6.38060188293457, + "learning_rate": 9.178779276082329e-05, + "loss": 0.019732609391212463, + "step": 57880 + }, + { + "epoch": 8.217175301632363, + "grad_norm": 0.3738914728164673, + "learning_rate": 9.178637331440739e-05, + "loss": 0.020280544459819794, + "step": 57890 + }, + { + "epoch": 8.218594748048261, + "grad_norm": 7.738147258758545, + "learning_rate": 9.178495386799148e-05, + "loss": 0.06968256831169128, + "step": 57900 + }, + { + "epoch": 8.22001419446416, + "grad_norm": 3.8040566444396973, + "learning_rate": 9.17835344215756e-05, + "loss": 0.05860614776611328, + "step": 57910 + }, + { + "epoch": 8.221433640880056, + "grad_norm": 0.19749833643436432, + "learning_rate": 9.178211497515969e-05, + "loss": 0.05739356875419617, + "step": 57920 + }, + { + "epoch": 8.222853087295954, + "grad_norm": 0.8916294574737549, + "learning_rate": 9.17806955287438e-05, + "loss": 0.027829304337501526, + "step": 57930 + }, + { + "epoch": 8.224272533711853, + "grad_norm": 2.294523239135742, + "learning_rate": 9.177927608232789e-05, + "loss": 0.024026399850845336, + "step": 57940 + }, + { + "epoch": 8.22569198012775, + "grad_norm": 1.959633469581604, + "learning_rate": 9.1777856635912e-05, + "loss": 0.057163572311401366, + "step": 57950 + }, + { + "epoch": 8.227111426543647, + "grad_norm": 5.235497951507568, + "learning_rate": 9.17764371894961e-05, + "loss": 0.11002181768417359, + "step": 57960 + }, + { + "epoch": 8.228530872959546, + "grad_norm": 5.246006965637207, + "learning_rate": 9.17750177430802e-05, + "loss": 0.036422187089920045, + "step": 57970 + }, + { + 
"epoch": 8.229950319375444, + "grad_norm": 3.0154199600219727, + "learning_rate": 9.17735982966643e-05, + "loss": 0.016849853098392487, + "step": 57980 + }, + { + "epoch": 8.231369765791342, + "grad_norm": 3.4750466346740723, + "learning_rate": 9.17721788502484e-05, + "loss": 0.03442394733428955, + "step": 57990 + }, + { + "epoch": 8.232789212207239, + "grad_norm": 8.968158721923828, + "learning_rate": 9.177075940383251e-05, + "loss": 0.043063384294509885, + "step": 58000 + }, + { + "epoch": 8.232789212207239, + "eval_accuracy": 0.9691613149360971, + "eval_loss": 0.10660364478826523, + "eval_runtime": 32.1198, + "eval_samples_per_second": 489.635, + "eval_steps_per_second": 15.318, + "step": 58000 + }, + { + "epoch": 8.234208658623137, + "grad_norm": 0.9778158068656921, + "learning_rate": 9.176933995741661e-05, + "loss": 0.030573081970214844, + "step": 58010 + }, + { + "epoch": 8.235628105039035, + "grad_norm": 0.6507449150085449, + "learning_rate": 9.176792051100072e-05, + "loss": 0.014274489879608155, + "step": 58020 + }, + { + "epoch": 8.237047551454932, + "grad_norm": 3.5025880336761475, + "learning_rate": 9.176650106458482e-05, + "loss": 0.020151573419570922, + "step": 58030 + }, + { + "epoch": 8.23846699787083, + "grad_norm": 1.5362058877944946, + "learning_rate": 9.176508161816892e-05, + "loss": 0.06194206476211548, + "step": 58040 + }, + { + "epoch": 8.239886444286729, + "grad_norm": 2.623915672302246, + "learning_rate": 9.176366217175301e-05, + "loss": 0.023121093213558198, + "step": 58050 + }, + { + "epoch": 8.241305890702627, + "grad_norm": 0.05486688017845154, + "learning_rate": 9.176224272533712e-05, + "loss": 0.02971988618373871, + "step": 58060 + }, + { + "epoch": 8.242725337118523, + "grad_norm": 0.1611616164445877, + "learning_rate": 9.176082327892122e-05, + "loss": 0.04156226217746735, + "step": 58070 + }, + { + "epoch": 8.244144783534422, + "grad_norm": 3.102126359939575, + "learning_rate": 9.175940383250533e-05, + "loss": 0.018605512380599976, + 
"step": 58080 + }, + { + "epoch": 8.24556422995032, + "grad_norm": 1.3434499502182007, + "learning_rate": 9.175812633073102e-05, + "loss": 0.04484846293926239, + "step": 58090 + }, + { + "epoch": 8.246983676366217, + "grad_norm": 0.9203706383705139, + "learning_rate": 9.175670688431513e-05, + "loss": 0.020824790000915527, + "step": 58100 + }, + { + "epoch": 8.248403122782115, + "grad_norm": 0.8287354111671448, + "learning_rate": 9.175528743789923e-05, + "loss": 0.03844572603702545, + "step": 58110 + }, + { + "epoch": 8.249822569198013, + "grad_norm": 11.324418067932129, + "learning_rate": 9.175386799148332e-05, + "loss": 0.04563019275665283, + "step": 58120 + }, + { + "epoch": 8.251242015613911, + "grad_norm": 0.2999362051486969, + "learning_rate": 9.175244854506742e-05, + "loss": 0.039099177718162535, + "step": 58130 + }, + { + "epoch": 8.252661462029808, + "grad_norm": 1.6591378450393677, + "learning_rate": 9.175102909865153e-05, + "loss": 0.03310187757015228, + "step": 58140 + }, + { + "epoch": 8.254080908445706, + "grad_norm": 6.494851112365723, + "learning_rate": 9.174960965223564e-05, + "loss": 0.061228638887405394, + "step": 58150 + }, + { + "epoch": 8.255500354861605, + "grad_norm": 1.309449315071106, + "learning_rate": 9.174819020581974e-05, + "loss": 0.015565997362136841, + "step": 58160 + }, + { + "epoch": 8.256919801277501, + "grad_norm": 0.35763129591941833, + "learning_rate": 9.174677075940384e-05, + "loss": 0.07266973853111267, + "step": 58170 + }, + { + "epoch": 8.2583392476934, + "grad_norm": 2.6130881309509277, + "learning_rate": 9.174535131298793e-05, + "loss": 0.03264107704162598, + "step": 58180 + }, + { + "epoch": 8.259758694109298, + "grad_norm": 0.6050202250480652, + "learning_rate": 9.174393186657204e-05, + "loss": 0.015243317186832427, + "step": 58190 + }, + { + "epoch": 8.261178140525196, + "grad_norm": 1.1150106191635132, + "learning_rate": 9.174251242015614e-05, + "loss": 0.05223976969718933, + "step": 58200 + }, + { + "epoch": 
8.262597586941093, + "grad_norm": 6.839016437530518, + "learning_rate": 9.174109297374025e-05, + "loss": 0.0411196768283844, + "step": 58210 + }, + { + "epoch": 8.264017033356991, + "grad_norm": 6.911043167114258, + "learning_rate": 9.173967352732434e-05, + "loss": 0.04330936968326569, + "step": 58220 + }, + { + "epoch": 8.26543647977289, + "grad_norm": 1.0361573696136475, + "learning_rate": 9.173825408090845e-05, + "loss": 0.025824397802352905, + "step": 58230 + }, + { + "epoch": 8.266855926188786, + "grad_norm": 12.14588451385498, + "learning_rate": 9.173683463449256e-05, + "loss": 0.02549113631248474, + "step": 58240 + }, + { + "epoch": 8.268275372604684, + "grad_norm": 0.05422484129667282, + "learning_rate": 9.173541518807666e-05, + "loss": 0.032814472913742065, + "step": 58250 + }, + { + "epoch": 8.269694819020582, + "grad_norm": 3.4622371196746826, + "learning_rate": 9.173399574166077e-05, + "loss": 0.029658371210098268, + "step": 58260 + }, + { + "epoch": 8.27111426543648, + "grad_norm": 0.06342365592718124, + "learning_rate": 9.173257629524485e-05, + "loss": 0.02727014124393463, + "step": 58270 + }, + { + "epoch": 8.272533711852377, + "grad_norm": 7.434948921203613, + "learning_rate": 9.173115684882896e-05, + "loss": 0.04313863217830658, + "step": 58280 + }, + { + "epoch": 8.273953158268275, + "grad_norm": 2.478610038757324, + "learning_rate": 9.172973740241306e-05, + "loss": 0.021670131385326384, + "step": 58290 + }, + { + "epoch": 8.275372604684174, + "grad_norm": 6.441994667053223, + "learning_rate": 9.172831795599717e-05, + "loss": 0.04854116439819336, + "step": 58300 + }, + { + "epoch": 8.27679205110007, + "grad_norm": 4.104756832122803, + "learning_rate": 9.172689850958127e-05, + "loss": 0.03957696259021759, + "step": 58310 + }, + { + "epoch": 8.278211497515969, + "grad_norm": 0.4648134112358093, + "learning_rate": 9.172547906316536e-05, + "loss": 0.03666155338287354, + "step": 58320 + }, + { + "epoch": 8.279630943931867, + "grad_norm": 
5.658158779144287, + "learning_rate": 9.172405961674948e-05, + "loss": 0.05015560984611511, + "step": 58330 + }, + { + "epoch": 8.281050390347765, + "grad_norm": 0.17183201014995575, + "learning_rate": 9.172264017033357e-05, + "loss": 0.05419689416885376, + "step": 58340 + }, + { + "epoch": 8.282469836763662, + "grad_norm": 0.5585965514183044, + "learning_rate": 9.172122072391768e-05, + "loss": 0.019291809201240538, + "step": 58350 + }, + { + "epoch": 8.28388928317956, + "grad_norm": 0.5460109114646912, + "learning_rate": 9.171980127750178e-05, + "loss": 0.024260058999061584, + "step": 58360 + }, + { + "epoch": 8.285308729595458, + "grad_norm": 4.628096103668213, + "learning_rate": 9.171838183108588e-05, + "loss": 0.06682268977165222, + "step": 58370 + }, + { + "epoch": 8.286728176011355, + "grad_norm": 0.07002594321966171, + "learning_rate": 9.171696238466998e-05, + "loss": 0.06908130049705505, + "step": 58380 + }, + { + "epoch": 8.288147622427253, + "grad_norm": 5.662642478942871, + "learning_rate": 9.171554293825409e-05, + "loss": 0.0431951105594635, + "step": 58390 + }, + { + "epoch": 8.289567068843152, + "grad_norm": 3.9657466411590576, + "learning_rate": 9.171412349183818e-05, + "loss": 0.023533882200717927, + "step": 58400 + }, + { + "epoch": 8.29098651525905, + "grad_norm": 0.025615880265831947, + "learning_rate": 9.17127040454223e-05, + "loss": 0.01455162763595581, + "step": 58410 + }, + { + "epoch": 8.292405961674946, + "grad_norm": 5.503678321838379, + "learning_rate": 9.171128459900639e-05, + "loss": 0.0331308126449585, + "step": 58420 + }, + { + "epoch": 8.293825408090845, + "grad_norm": 4.88674259185791, + "learning_rate": 9.170986515259049e-05, + "loss": 0.028948378562927247, + "step": 58430 + }, + { + "epoch": 8.295244854506743, + "grad_norm": 0.2964005470275879, + "learning_rate": 9.17084457061746e-05, + "loss": 0.04497859477996826, + "step": 58440 + }, + { + "epoch": 8.29666430092264, + "grad_norm": 1.4039242267608643, + "learning_rate": 
9.17070262597587e-05, + "loss": 0.03003017008304596, + "step": 58450 + }, + { + "epoch": 8.298083747338538, + "grad_norm": 0.22310487926006317, + "learning_rate": 9.170560681334281e-05, + "loss": 0.034260991215705874, + "step": 58460 + }, + { + "epoch": 8.299503193754436, + "grad_norm": 3.774014472961426, + "learning_rate": 9.170418736692691e-05, + "loss": 0.05678040981292724, + "step": 58470 + }, + { + "epoch": 8.300922640170334, + "grad_norm": 4.474982261657715, + "learning_rate": 9.1702767920511e-05, + "loss": 0.05777202248573303, + "step": 58480 + }, + { + "epoch": 8.302342086586231, + "grad_norm": 0.0562448650598526, + "learning_rate": 9.17013484740951e-05, + "loss": 0.04792693853378296, + "step": 58490 + }, + { + "epoch": 8.30376153300213, + "grad_norm": 1.0092743635177612, + "learning_rate": 9.169992902767921e-05, + "loss": 0.033463281393051145, + "step": 58500 + }, + { + "epoch": 8.30376153300213, + "eval_accuracy": 0.9818782984676034, + "eval_loss": 0.056806765496730804, + "eval_runtime": 34.6905, + "eval_samples_per_second": 453.351, + "eval_steps_per_second": 14.183, + "step": 58500 + }, + { + "epoch": 8.305180979418028, + "grad_norm": 0.3739173412322998, + "learning_rate": 9.169850958126331e-05, + "loss": 0.04968686699867249, + "step": 58510 + }, + { + "epoch": 8.306600425833924, + "grad_norm": 4.6436767578125, + "learning_rate": 9.169709013484742e-05, + "loss": 0.05652905702590942, + "step": 58520 + }, + { + "epoch": 8.308019872249822, + "grad_norm": 1.215824842453003, + "learning_rate": 9.169567068843152e-05, + "loss": 0.08560553789138795, + "step": 58530 + }, + { + "epoch": 8.30943931866572, + "grad_norm": 2.3930976390838623, + "learning_rate": 9.169425124201562e-05, + "loss": 0.047614786028862, + "step": 58540 + }, + { + "epoch": 8.310858765081619, + "grad_norm": 0.21128606796264648, + "learning_rate": 9.169283179559973e-05, + "loss": 0.041520559787750246, + "step": 58550 + }, + { + "epoch": 8.312278211497516, + "grad_norm": 0.11912447959184647, + 
"learning_rate": 9.169141234918382e-05, + "loss": 0.04494628310203552, + "step": 58560 + }, + { + "epoch": 8.313697657913414, + "grad_norm": 3.504589557647705, + "learning_rate": 9.168999290276793e-05, + "loss": 0.05546298623085022, + "step": 58570 + }, + { + "epoch": 8.315117104329312, + "grad_norm": 0.06115560233592987, + "learning_rate": 9.168857345635202e-05, + "loss": 0.020033690333366393, + "step": 58580 + }, + { + "epoch": 8.316536550745209, + "grad_norm": 0.08107715845108032, + "learning_rate": 9.168715400993613e-05, + "loss": 0.025489938259124757, + "step": 58590 + }, + { + "epoch": 8.317955997161107, + "grad_norm": 2.1215507984161377, + "learning_rate": 9.168573456352023e-05, + "loss": 0.05170263051986694, + "step": 58600 + }, + { + "epoch": 8.319375443577005, + "grad_norm": 3.4608006477355957, + "learning_rate": 9.168431511710434e-05, + "loss": 0.029605063796043395, + "step": 58610 + }, + { + "epoch": 8.320794889992904, + "grad_norm": 0.22780439257621765, + "learning_rate": 9.168289567068844e-05, + "loss": 0.039026209712028505, + "step": 58620 + }, + { + "epoch": 8.3222143364088, + "grad_norm": 6.255256175994873, + "learning_rate": 9.168147622427253e-05, + "loss": 0.06340646147727966, + "step": 58630 + }, + { + "epoch": 8.323633782824698, + "grad_norm": 0.20648564398288727, + "learning_rate": 9.168005677785664e-05, + "loss": 0.04623824059963226, + "step": 58640 + }, + { + "epoch": 8.325053229240597, + "grad_norm": 2.573063373565674, + "learning_rate": 9.167863733144074e-05, + "loss": 0.019829289615154268, + "step": 58650 + }, + { + "epoch": 8.326472675656493, + "grad_norm": 1.750908374786377, + "learning_rate": 9.167721788502485e-05, + "loss": 0.050472980737686156, + "step": 58660 + }, + { + "epoch": 8.327892122072392, + "grad_norm": 0.2912693917751312, + "learning_rate": 9.167579843860895e-05, + "loss": 0.02320457398891449, + "step": 58670 + }, + { + "epoch": 8.32931156848829, + "grad_norm": 1.3368074893951416, + "learning_rate": 9.167437899219305e-05, 
+ "loss": 0.03441727757453918, + "step": 58680 + }, + { + "epoch": 8.330731014904188, + "grad_norm": 0.020405098795890808, + "learning_rate": 9.167295954577714e-05, + "loss": 0.030085077881813048, + "step": 58690 + }, + { + "epoch": 8.332150461320085, + "grad_norm": 2.079420804977417, + "learning_rate": 9.167154009936125e-05, + "loss": 0.03475523889064789, + "step": 58700 + }, + { + "epoch": 8.333569907735983, + "grad_norm": 6.957132816314697, + "learning_rate": 9.167012065294535e-05, + "loss": 0.04805854558944702, + "step": 58710 + }, + { + "epoch": 8.334989354151881, + "grad_norm": 0.8798670768737793, + "learning_rate": 9.166870120652946e-05, + "loss": 0.03421752452850342, + "step": 58720 + }, + { + "epoch": 8.336408800567778, + "grad_norm": 4.264864921569824, + "learning_rate": 9.166728176011356e-05, + "loss": 0.04698627889156341, + "step": 58730 + }, + { + "epoch": 8.337828246983676, + "grad_norm": 8.51839828491211, + "learning_rate": 9.166586231369766e-05, + "loss": 0.024400044977664948, + "step": 58740 + }, + { + "epoch": 8.339247693399575, + "grad_norm": 2.2272045612335205, + "learning_rate": 9.166444286728177e-05, + "loss": 0.0337568461894989, + "step": 58750 + }, + { + "epoch": 8.340667139815473, + "grad_norm": 0.3619248569011688, + "learning_rate": 9.166302342086587e-05, + "loss": 0.06187044978141785, + "step": 58760 + }, + { + "epoch": 8.34208658623137, + "grad_norm": 0.6603171229362488, + "learning_rate": 9.166160397444998e-05, + "loss": 0.02449636459350586, + "step": 58770 + }, + { + "epoch": 8.343506032647268, + "grad_norm": 2.9686007499694824, + "learning_rate": 9.166018452803407e-05, + "loss": 0.03370268642902374, + "step": 58780 + }, + { + "epoch": 8.344925479063166, + "grad_norm": 0.051271889358758926, + "learning_rate": 9.165876508161817e-05, + "loss": 0.01729312539100647, + "step": 58790 + }, + { + "epoch": 8.346344925479062, + "grad_norm": 0.945408046245575, + "learning_rate": 9.165734563520227e-05, + "loss": 0.01863696575164795, + "step": 
58800 + }, + { + "epoch": 8.34776437189496, + "grad_norm": 8.780373573303223, + "learning_rate": 9.165592618878638e-05, + "loss": 0.04798442721366882, + "step": 58810 + }, + { + "epoch": 8.349183818310859, + "grad_norm": 2.2021658420562744, + "learning_rate": 9.165450674237048e-05, + "loss": 0.04746388792991638, + "step": 58820 + }, + { + "epoch": 8.350603264726757, + "grad_norm": 2.684445381164551, + "learning_rate": 9.165308729595459e-05, + "loss": 0.045510712265968326, + "step": 58830 + }, + { + "epoch": 8.352022711142654, + "grad_norm": 0.4354016184806824, + "learning_rate": 9.165166784953869e-05, + "loss": 0.061435526609420775, + "step": 58840 + }, + { + "epoch": 8.353442157558552, + "grad_norm": 9.262724876403809, + "learning_rate": 9.165024840312278e-05, + "loss": 0.09312753081321716, + "step": 58850 + }, + { + "epoch": 8.35486160397445, + "grad_norm": 0.48700007796287537, + "learning_rate": 9.16488289567069e-05, + "loss": 0.049555063247680664, + "step": 58860 + }, + { + "epoch": 8.356281050390347, + "grad_norm": 0.06040867790579796, + "learning_rate": 9.164740951029099e-05, + "loss": 0.046083787083625795, + "step": 58870 + }, + { + "epoch": 8.357700496806245, + "grad_norm": 5.327953338623047, + "learning_rate": 9.16459900638751e-05, + "loss": 0.04298398494720459, + "step": 58880 + }, + { + "epoch": 8.359119943222144, + "grad_norm": 6.591610431671143, + "learning_rate": 9.164457061745919e-05, + "loss": 0.10864330530166626, + "step": 58890 + }, + { + "epoch": 8.360539389638042, + "grad_norm": 17.488178253173828, + "learning_rate": 9.16431511710433e-05, + "loss": 0.07181705236434936, + "step": 58900 + }, + { + "epoch": 8.361958836053939, + "grad_norm": 8.191031455993652, + "learning_rate": 9.16417317246274e-05, + "loss": 0.10081918239593506, + "step": 58910 + }, + { + "epoch": 8.363378282469837, + "grad_norm": 0.9254047274589539, + "learning_rate": 9.16403122782115e-05, + "loss": 0.03317170143127442, + "step": 58920 + }, + { + "epoch": 8.364797728885735, + 
"grad_norm": 15.292092323303223, + "learning_rate": 9.16388928317956e-05, + "loss": 0.11680049896240234, + "step": 58930 + }, + { + "epoch": 8.366217175301632, + "grad_norm": 3.000251531600952, + "learning_rate": 9.16374733853797e-05, + "loss": 0.050798237323760986, + "step": 58940 + }, + { + "epoch": 8.36763662171753, + "grad_norm": 3.4350600242614746, + "learning_rate": 9.163605393896381e-05, + "loss": 0.019013065099716186, + "step": 58950 + }, + { + "epoch": 8.369056068133428, + "grad_norm": 0.20841482281684875, + "learning_rate": 9.163463449254791e-05, + "loss": 0.06327688694000244, + "step": 58960 + }, + { + "epoch": 8.370475514549327, + "grad_norm": 10.657003402709961, + "learning_rate": 9.163321504613202e-05, + "loss": 0.05808635950088501, + "step": 58970 + }, + { + "epoch": 8.371894960965223, + "grad_norm": 2.521904706954956, + "learning_rate": 9.163179559971612e-05, + "loss": 0.03354381322860718, + "step": 58980 + }, + { + "epoch": 8.373314407381121, + "grad_norm": 9.845879554748535, + "learning_rate": 9.163037615330021e-05, + "loss": 0.05215628147125244, + "step": 58990 + }, + { + "epoch": 8.37473385379702, + "grad_norm": 6.998453617095947, + "learning_rate": 9.162895670688431e-05, + "loss": 0.04951879382133484, + "step": 59000 + }, + { + "epoch": 8.37473385379702, + "eval_accuracy": 0.977045844725631, + "eval_loss": 0.07441914826631546, + "eval_runtime": 33.0904, + "eval_samples_per_second": 475.274, + "eval_steps_per_second": 14.868, + "step": 59000 + }, + { + "epoch": 8.376153300212916, + "grad_norm": 4.48925256729126, + "learning_rate": 9.162753726046842e-05, + "loss": 0.03418838381767273, + "step": 59010 + }, + { + "epoch": 8.377572746628815, + "grad_norm": 0.2785710096359253, + "learning_rate": 9.162611781405252e-05, + "loss": 0.026520654559135437, + "step": 59020 + }, + { + "epoch": 8.378992193044713, + "grad_norm": 0.04143285006284714, + "learning_rate": 9.162469836763663e-05, + "loss": 0.03496352732181549, + "step": 59030 + }, + { + "epoch": 
8.380411639460611, + "grad_norm": 0.034225188195705414, + "learning_rate": 9.162327892122073e-05, + "loss": 0.020401456952095033, + "step": 59040 + }, + { + "epoch": 8.381831085876508, + "grad_norm": 0.8951627612113953, + "learning_rate": 9.162185947480483e-05, + "loss": 0.025759845972061157, + "step": 59050 + }, + { + "epoch": 8.383250532292406, + "grad_norm": 4.353384971618652, + "learning_rate": 9.162044002838894e-05, + "loss": 0.04796704351902008, + "step": 59060 + }, + { + "epoch": 8.384669978708304, + "grad_norm": 6.735307216644287, + "learning_rate": 9.161902058197303e-05, + "loss": 0.03449790477752686, + "step": 59070 + }, + { + "epoch": 8.3860894251242, + "grad_norm": 0.14655238389968872, + "learning_rate": 9.161760113555714e-05, + "loss": 0.010301701724529266, + "step": 59080 + }, + { + "epoch": 8.3875088715401, + "grad_norm": 10.556654930114746, + "learning_rate": 9.161618168914123e-05, + "loss": 0.05006436705589294, + "step": 59090 + }, + { + "epoch": 8.388928317955997, + "grad_norm": 0.158490389585495, + "learning_rate": 9.161476224272534e-05, + "loss": 0.04050736129283905, + "step": 59100 + }, + { + "epoch": 8.390347764371896, + "grad_norm": 0.09657344222068787, + "learning_rate": 9.161334279630944e-05, + "loss": 0.017902058362960816, + "step": 59110 + }, + { + "epoch": 8.391767210787792, + "grad_norm": 0.02483726106584072, + "learning_rate": 9.161192334989355e-05, + "loss": 0.017482933402061463, + "step": 59120 + }, + { + "epoch": 8.39318665720369, + "grad_norm": 7.812098026275635, + "learning_rate": 9.161050390347765e-05, + "loss": 0.018153285980224608, + "step": 59130 + }, + { + "epoch": 8.394606103619589, + "grad_norm": 0.03699856996536255, + "learning_rate": 9.160908445706176e-05, + "loss": 0.019261515140533446, + "step": 59140 + }, + { + "epoch": 8.396025550035485, + "grad_norm": 8.018878936767578, + "learning_rate": 9.160766501064585e-05, + "loss": 0.05388169288635254, + "step": 59150 + }, + { + "epoch": 8.397444996451384, + "grad_norm": 
0.09866965562105179, + "learning_rate": 9.160624556422995e-05, + "loss": 0.027665621042251586, + "step": 59160 + }, + { + "epoch": 8.398864442867282, + "grad_norm": 18.310009002685547, + "learning_rate": 9.160482611781406e-05, + "loss": 0.0914535403251648, + "step": 59170 + }, + { + "epoch": 8.40028388928318, + "grad_norm": 0.0723535567522049, + "learning_rate": 9.160340667139816e-05, + "loss": 0.029044130444526674, + "step": 59180 + }, + { + "epoch": 8.401703335699077, + "grad_norm": 0.02246226742863655, + "learning_rate": 9.160198722498227e-05, + "loss": 0.016526098549365997, + "step": 59190 + }, + { + "epoch": 8.403122782114975, + "grad_norm": 0.16880199313163757, + "learning_rate": 9.160056777856635e-05, + "loss": 0.014294581115245819, + "step": 59200 + }, + { + "epoch": 8.404542228530874, + "grad_norm": 4.8066864013671875, + "learning_rate": 9.159914833215047e-05, + "loss": 0.017830350995063783, + "step": 59210 + }, + { + "epoch": 8.40596167494677, + "grad_norm": 6.521173000335693, + "learning_rate": 9.159772888573456e-05, + "loss": 0.028379026055335998, + "step": 59220 + }, + { + "epoch": 8.407381121362668, + "grad_norm": 2.4245357513427734, + "learning_rate": 9.159630943931867e-05, + "loss": 0.023089191317558287, + "step": 59230 + }, + { + "epoch": 8.408800567778567, + "grad_norm": 5.615114688873291, + "learning_rate": 9.159488999290277e-05, + "loss": 0.0288492351770401, + "step": 59240 + }, + { + "epoch": 8.410220014194465, + "grad_norm": 6.010306358337402, + "learning_rate": 9.159347054648687e-05, + "loss": 0.043870294094085695, + "step": 59250 + }, + { + "epoch": 8.411639460610361, + "grad_norm": 0.6510958671569824, + "learning_rate": 9.159205110007098e-05, + "loss": 0.03329501152038574, + "step": 59260 + }, + { + "epoch": 8.41305890702626, + "grad_norm": 0.7864037752151489, + "learning_rate": 9.159063165365508e-05, + "loss": 0.04816370904445648, + "step": 59270 + }, + { + "epoch": 8.414478353442158, + "grad_norm": 6.080152988433838, + "learning_rate": 
9.158921220723919e-05, + "loss": 0.04071834087371826, + "step": 59280 + }, + { + "epoch": 8.415897799858055, + "grad_norm": 0.08487723022699356, + "learning_rate": 9.158779276082328e-05, + "loss": 0.021995453536510466, + "step": 59290 + }, + { + "epoch": 8.417317246273953, + "grad_norm": 1.2968878746032715, + "learning_rate": 9.158637331440738e-05, + "loss": 0.03177845478057861, + "step": 59300 + }, + { + "epoch": 8.418736692689851, + "grad_norm": 4.66085147857666, + "learning_rate": 9.158495386799148e-05, + "loss": 0.02630176246166229, + "step": 59310 + }, + { + "epoch": 8.42015613910575, + "grad_norm": 11.192215919494629, + "learning_rate": 9.158353442157559e-05, + "loss": 0.05713456869125366, + "step": 59320 + }, + { + "epoch": 8.421575585521646, + "grad_norm": 6.2787628173828125, + "learning_rate": 9.158211497515969e-05, + "loss": 0.056651723384857175, + "step": 59330 + }, + { + "epoch": 8.422995031937544, + "grad_norm": 1.7867172956466675, + "learning_rate": 9.15806955287438e-05, + "loss": 0.062083113193511966, + "step": 59340 + }, + { + "epoch": 8.424414478353443, + "grad_norm": 6.2273783683776855, + "learning_rate": 9.15792760823279e-05, + "loss": 0.1109097957611084, + "step": 59350 + }, + { + "epoch": 8.42583392476934, + "grad_norm": 0.06000044196844101, + "learning_rate": 9.1577856635912e-05, + "loss": 0.032784104347229004, + "step": 59360 + }, + { + "epoch": 8.427253371185238, + "grad_norm": 5.235750198364258, + "learning_rate": 9.15764371894961e-05, + "loss": 0.027906310558319092, + "step": 59370 + }, + { + "epoch": 8.428672817601136, + "grad_norm": 1.2695759534835815, + "learning_rate": 9.15750177430802e-05, + "loss": 0.01285722553730011, + "step": 59380 + }, + { + "epoch": 8.430092264017034, + "grad_norm": 2.5559439659118652, + "learning_rate": 9.157359829666431e-05, + "loss": 0.01039145290851593, + "step": 59390 + }, + { + "epoch": 8.43151171043293, + "grad_norm": 8.719844818115234, + "learning_rate": 9.15721788502484e-05, + "loss": 
0.06004953384399414, + "step": 59400 + }, + { + "epoch": 8.432931156848829, + "grad_norm": 0.029175806790590286, + "learning_rate": 9.157075940383251e-05, + "loss": 0.031366673111915586, + "step": 59410 + }, + { + "epoch": 8.434350603264727, + "grad_norm": 0.4012078046798706, + "learning_rate": 9.15693399574166e-05, + "loss": 0.029990941286087036, + "step": 59420 + }, + { + "epoch": 8.435770049680624, + "grad_norm": 3.6554975509643555, + "learning_rate": 9.156792051100072e-05, + "loss": 0.011965757608413697, + "step": 59430 + }, + { + "epoch": 8.437189496096522, + "grad_norm": 1.1532933712005615, + "learning_rate": 9.156650106458481e-05, + "loss": 0.03454259634017944, + "step": 59440 + }, + { + "epoch": 8.43860894251242, + "grad_norm": 5.955157279968262, + "learning_rate": 9.156508161816891e-05, + "loss": 0.011077064275741576, + "step": 59450 + }, + { + "epoch": 8.440028388928319, + "grad_norm": 0.1530897617340088, + "learning_rate": 9.156366217175302e-05, + "loss": 0.026709139347076416, + "step": 59460 + }, + { + "epoch": 8.441447835344215, + "grad_norm": 1.0983542203903198, + "learning_rate": 9.156224272533712e-05, + "loss": 0.023664931952953338, + "step": 59470 + }, + { + "epoch": 8.442867281760114, + "grad_norm": 0.36292317509651184, + "learning_rate": 9.156082327892123e-05, + "loss": 0.048028239607810976, + "step": 59480 + }, + { + "epoch": 8.444286728176012, + "grad_norm": 0.14995090663433075, + "learning_rate": 9.155940383250533e-05, + "loss": 0.015602460503578186, + "step": 59490 + }, + { + "epoch": 8.445706174591908, + "grad_norm": 0.9158965945243835, + "learning_rate": 9.155798438608944e-05, + "loss": 0.026582181453704834, + "step": 59500 + }, + { + "epoch": 8.445706174591908, + "eval_accuracy": 0.9734850893368093, + "eval_loss": 0.08671265840530396, + "eval_runtime": 33.331, + "eval_samples_per_second": 471.843, + "eval_steps_per_second": 14.761, + "step": 59500 + }, + { + "epoch": 8.447125621007807, + "grad_norm": 0.5541223287582397, + "learning_rate": 
9.155656493967352e-05, + "loss": 0.040118956565856935, + "step": 59510 + }, + { + "epoch": 8.448545067423705, + "grad_norm": 1.745306372642517, + "learning_rate": 9.155514549325763e-05, + "loss": 0.024646060168743135, + "step": 59520 + }, + { + "epoch": 8.449964513839603, + "grad_norm": 0.9366075992584229, + "learning_rate": 9.155372604684173e-05, + "loss": 0.09292943477630615, + "step": 59530 + }, + { + "epoch": 8.4513839602555, + "grad_norm": 0.06262222677469254, + "learning_rate": 9.155230660042584e-05, + "loss": 0.04963191449642181, + "step": 59540 + }, + { + "epoch": 8.452803406671398, + "grad_norm": 0.08809320628643036, + "learning_rate": 9.155088715400995e-05, + "loss": 0.017690953612327576, + "step": 59550 + }, + { + "epoch": 8.454222853087296, + "grad_norm": 1.2314077615737915, + "learning_rate": 9.154946770759404e-05, + "loss": 0.0151987686753273, + "step": 59560 + }, + { + "epoch": 8.455642299503193, + "grad_norm": 0.8722369074821472, + "learning_rate": 9.154804826117815e-05, + "loss": 0.0057766992598772045, + "step": 59570 + }, + { + "epoch": 8.457061745919091, + "grad_norm": 0.21585650742053986, + "learning_rate": 9.154662881476224e-05, + "loss": 0.031658861041069034, + "step": 59580 + }, + { + "epoch": 8.45848119233499, + "grad_norm": 7.521589756011963, + "learning_rate": 9.154520936834636e-05, + "loss": 0.03252851366996765, + "step": 59590 + }, + { + "epoch": 8.459900638750888, + "grad_norm": 10.103790283203125, + "learning_rate": 9.154378992193045e-05, + "loss": 0.029417049884796143, + "step": 59600 + }, + { + "epoch": 8.461320085166784, + "grad_norm": 11.28209400177002, + "learning_rate": 9.154237047551455e-05, + "loss": 0.0774505078792572, + "step": 59610 + }, + { + "epoch": 8.462739531582683, + "grad_norm": 13.403277397155762, + "learning_rate": 9.154095102909865e-05, + "loss": 0.10143941640853882, + "step": 59620 + }, + { + "epoch": 8.464158977998581, + "grad_norm": 0.5809075832366943, + "learning_rate": 9.153953158268276e-05, + "loss": 
0.07998875379562378, + "step": 59630 + }, + { + "epoch": 8.465578424414478, + "grad_norm": 4.999480724334717, + "learning_rate": 9.153811213626687e-05, + "loss": 0.03397766649723053, + "step": 59640 + }, + { + "epoch": 8.466997870830376, + "grad_norm": 0.23321618139743805, + "learning_rate": 9.153669268985097e-05, + "loss": 0.034213504195213316, + "step": 59650 + }, + { + "epoch": 8.468417317246274, + "grad_norm": 5.279557228088379, + "learning_rate": 9.153527324343506e-05, + "loss": 0.016608065366744994, + "step": 59660 + }, + { + "epoch": 8.469836763662173, + "grad_norm": 3.1837151050567627, + "learning_rate": 9.153385379701916e-05, + "loss": 0.011586660146713256, + "step": 59670 + }, + { + "epoch": 8.471256210078069, + "grad_norm": 0.05866161733865738, + "learning_rate": 9.153243435060327e-05, + "loss": 0.03473564088344574, + "step": 59680 + }, + { + "epoch": 8.472675656493967, + "grad_norm": 1.3842390775680542, + "learning_rate": 9.153101490418737e-05, + "loss": 0.009509885311126709, + "step": 59690 + }, + { + "epoch": 8.474095102909866, + "grad_norm": 0.1759142428636551, + "learning_rate": 9.152959545777148e-05, + "loss": 0.009905293583869934, + "step": 59700 + }, + { + "epoch": 8.475514549325762, + "grad_norm": 0.848075270652771, + "learning_rate": 9.152817601135556e-05, + "loss": 0.0336533784866333, + "step": 59710 + }, + { + "epoch": 8.47693399574166, + "grad_norm": 0.3343120813369751, + "learning_rate": 9.152675656493968e-05, + "loss": 0.017269288003444672, + "step": 59720 + }, + { + "epoch": 8.478353442157559, + "grad_norm": 6.539244651794434, + "learning_rate": 9.152533711852379e-05, + "loss": 0.04893214702606201, + "step": 59730 + }, + { + "epoch": 8.479772888573457, + "grad_norm": 7.863105773925781, + "learning_rate": 9.152391767210788e-05, + "loss": 0.03542499840259552, + "step": 59740 + }, + { + "epoch": 8.481192334989354, + "grad_norm": 1.095321774482727, + "learning_rate": 9.1522498225692e-05, + "loss": 0.019741693139076234, + "step": 59750 + }, + 
{ + "epoch": 8.482611781405252, + "grad_norm": 0.1375374048948288, + "learning_rate": 9.152107877927608e-05, + "loss": 0.026455044746398926, + "step": 59760 + }, + { + "epoch": 8.48403122782115, + "grad_norm": 5.922633647918701, + "learning_rate": 9.151965933286019e-05, + "loss": 0.06508615016937255, + "step": 59770 + }, + { + "epoch": 8.485450674237047, + "grad_norm": 0.24430686235427856, + "learning_rate": 9.151823988644429e-05, + "loss": 0.06395163536071777, + "step": 59780 + }, + { + "epoch": 8.486870120652945, + "grad_norm": 3.22963285446167, + "learning_rate": 9.15168204400284e-05, + "loss": 0.046134963631629944, + "step": 59790 + }, + { + "epoch": 8.488289567068843, + "grad_norm": 0.17032699286937714, + "learning_rate": 9.15154009936125e-05, + "loss": 0.011191642284393311, + "step": 59800 + }, + { + "epoch": 8.489709013484742, + "grad_norm": 2.9801077842712402, + "learning_rate": 9.151398154719659e-05, + "loss": 0.03180201649665833, + "step": 59810 + }, + { + "epoch": 8.491128459900638, + "grad_norm": 4.111161708831787, + "learning_rate": 9.15125621007807e-05, + "loss": 0.0533446192741394, + "step": 59820 + }, + { + "epoch": 8.492547906316537, + "grad_norm": 4.968252182006836, + "learning_rate": 9.15111426543648e-05, + "loss": 0.04567363262176514, + "step": 59830 + }, + { + "epoch": 8.493967352732435, + "grad_norm": 7.942319393157959, + "learning_rate": 9.150972320794891e-05, + "loss": 0.0842665195465088, + "step": 59840 + }, + { + "epoch": 8.495386799148331, + "grad_norm": 7.745817184448242, + "learning_rate": 9.150830376153301e-05, + "loss": 0.05556324124336243, + "step": 59850 + }, + { + "epoch": 8.49680624556423, + "grad_norm": 1.72450590133667, + "learning_rate": 9.150688431511712e-05, + "loss": 0.028340262174606324, + "step": 59860 + }, + { + "epoch": 8.498225691980128, + "grad_norm": 5.714693546295166, + "learning_rate": 9.15054648687012e-05, + "loss": 0.06224752068519592, + "step": 59870 + }, + { + "epoch": 8.499645138396026, + "grad_norm": 
7.615067958831787, + "learning_rate": 9.150404542228531e-05, + "loss": 0.08627032041549683, + "step": 59880 + }, + { + "epoch": 8.501064584811923, + "grad_norm": 0.1859230101108551, + "learning_rate": 9.150262597586941e-05, + "loss": 0.01632542759180069, + "step": 59890 + }, + { + "epoch": 8.502484031227821, + "grad_norm": 1.340531826019287, + "learning_rate": 9.150120652945352e-05, + "loss": 0.028228405117988586, + "step": 59900 + }, + { + "epoch": 8.50390347764372, + "grad_norm": 8.235560417175293, + "learning_rate": 9.149978708303762e-05, + "loss": 0.04519372284412384, + "step": 59910 + }, + { + "epoch": 8.505322924059616, + "grad_norm": 9.629444122314453, + "learning_rate": 9.149836763662172e-05, + "loss": 0.02357942909002304, + "step": 59920 + }, + { + "epoch": 8.506742370475514, + "grad_norm": 0.11500538885593414, + "learning_rate": 9.149694819020583e-05, + "loss": 0.004986101761460304, + "step": 59930 + }, + { + "epoch": 8.508161816891413, + "grad_norm": 0.19352419674396515, + "learning_rate": 9.149552874378993e-05, + "loss": 0.028413400053977966, + "step": 59940 + }, + { + "epoch": 8.509581263307311, + "grad_norm": 2.6463675498962402, + "learning_rate": 9.149410929737404e-05, + "loss": 0.05246865153312683, + "step": 59950 + }, + { + "epoch": 8.511000709723207, + "grad_norm": 0.2576519548892975, + "learning_rate": 9.149268985095813e-05, + "loss": 0.023058263957500456, + "step": 59960 + }, + { + "epoch": 8.512420156139106, + "grad_norm": 0.25054165720939636, + "learning_rate": 9.149127040454223e-05, + "loss": 0.045983174443244935, + "step": 59970 + }, + { + "epoch": 8.513839602555004, + "grad_norm": 3.5932352542877197, + "learning_rate": 9.148985095812633e-05, + "loss": 0.05506667494773865, + "step": 59980 + }, + { + "epoch": 8.5152590489709, + "grad_norm": 0.24045352637767792, + "learning_rate": 9.148843151171044e-05, + "loss": 0.03867987096309662, + "step": 59990 + }, + { + "epoch": 8.516678495386799, + "grad_norm": 8.400899887084961, + "learning_rate": 
9.148701206529454e-05, + "loss": 0.08673510551452637, + "step": 60000 + }, + { + "epoch": 8.516678495386799, + "eval_accuracy": 0.9781267883258091, + "eval_loss": 0.07569287717342377, + "eval_runtime": 32.1624, + "eval_samples_per_second": 488.987, + "eval_steps_per_second": 15.297, + "step": 60000 + }, + { + "epoch": 8.518097941802697, + "grad_norm": 4.49797248840332, + "learning_rate": 9.148559261887865e-05, + "loss": 0.04673793017864227, + "step": 60010 + }, + { + "epoch": 8.519517388218595, + "grad_norm": 0.5943466424942017, + "learning_rate": 9.148417317246275e-05, + "loss": 0.019504909217357636, + "step": 60020 + }, + { + "epoch": 8.520936834634492, + "grad_norm": 0.06766325235366821, + "learning_rate": 9.148275372604684e-05, + "loss": 0.009393461048603058, + "step": 60030 + }, + { + "epoch": 8.52235628105039, + "grad_norm": 0.09707861393690109, + "learning_rate": 9.148133427963095e-05, + "loss": 0.044020998477935794, + "step": 60040 + }, + { + "epoch": 8.523775727466289, + "grad_norm": 10.096122741699219, + "learning_rate": 9.147991483321505e-05, + "loss": 0.06480343341827392, + "step": 60050 + }, + { + "epoch": 8.525195173882185, + "grad_norm": 0.09844963252544403, + "learning_rate": 9.147849538679916e-05, + "loss": 0.03513259589672089, + "step": 60060 + }, + { + "epoch": 8.526614620298083, + "grad_norm": 0.8048601150512695, + "learning_rate": 9.147707594038325e-05, + "loss": 0.07613663077354431, + "step": 60070 + }, + { + "epoch": 8.528034066713982, + "grad_norm": 0.13436748087406158, + "learning_rate": 9.147565649396736e-05, + "loss": 0.02750980854034424, + "step": 60080 + }, + { + "epoch": 8.52945351312988, + "grad_norm": 1.0650267601013184, + "learning_rate": 9.147423704755145e-05, + "loss": 0.04318079948425293, + "step": 60090 + }, + { + "epoch": 8.530872959545777, + "grad_norm": 0.5778047442436218, + "learning_rate": 9.147281760113557e-05, + "loss": 0.04353642165660858, + "step": 60100 + }, + { + "epoch": 8.532292405961675, + "grad_norm": 
0.41429048776626587, + "learning_rate": 9.147139815471966e-05, + "loss": 0.02352541536092758, + "step": 60110 + }, + { + "epoch": 8.533711852377573, + "grad_norm": 1.7214840650558472, + "learning_rate": 9.146997870830376e-05, + "loss": 0.02559060454368591, + "step": 60120 + }, + { + "epoch": 8.53513129879347, + "grad_norm": 0.04450737684965134, + "learning_rate": 9.146870120652946e-05, + "loss": 0.030207446217536925, + "step": 60130 + }, + { + "epoch": 8.536550745209368, + "grad_norm": 7.381211280822754, + "learning_rate": 9.146728176011357e-05, + "loss": 0.06880269050598145, + "step": 60140 + }, + { + "epoch": 8.537970191625266, + "grad_norm": 0.11311613768339157, + "learning_rate": 9.146586231369765e-05, + "loss": 0.017041406035423277, + "step": 60150 + }, + { + "epoch": 8.539389638041165, + "grad_norm": 0.23204351961612701, + "learning_rate": 9.146444286728176e-05, + "loss": 0.03590482473373413, + "step": 60160 + }, + { + "epoch": 8.540809084457061, + "grad_norm": 0.2040676772594452, + "learning_rate": 9.146302342086586e-05, + "loss": 0.025861364603042603, + "step": 60170 + }, + { + "epoch": 8.54222853087296, + "grad_norm": 0.7327109575271606, + "learning_rate": 9.146160397444997e-05, + "loss": 0.06635326743125916, + "step": 60180 + }, + { + "epoch": 8.543647977288858, + "grad_norm": 0.03353278711438179, + "learning_rate": 9.146018452803407e-05, + "loss": 0.03246139287948609, + "step": 60190 + }, + { + "epoch": 8.545067423704754, + "grad_norm": 3.9755334854125977, + "learning_rate": 9.145876508161817e-05, + "loss": 0.055750757455825806, + "step": 60200 + }, + { + "epoch": 8.546486870120653, + "grad_norm": 0.18508094549179077, + "learning_rate": 9.145734563520228e-05, + "loss": 0.042376190423965454, + "step": 60210 + }, + { + "epoch": 8.547906316536551, + "grad_norm": 0.05352622643113136, + "learning_rate": 9.145592618878638e-05, + "loss": 0.021348334848880768, + "step": 60220 + }, + { + "epoch": 8.54932576295245, + "grad_norm": 1.2180757522583008, + 
"learning_rate": 9.145450674237049e-05, + "loss": 0.021335867047309876, + "step": 60230 + }, + { + "epoch": 8.550745209368346, + "grad_norm": 3.486859083175659, + "learning_rate": 9.145308729595458e-05, + "loss": 0.03150171637535095, + "step": 60240 + }, + { + "epoch": 8.552164655784244, + "grad_norm": 9.03134822845459, + "learning_rate": 9.145166784953868e-05, + "loss": 0.05840170979499817, + "step": 60250 + }, + { + "epoch": 8.553584102200142, + "grad_norm": 2.2280516624450684, + "learning_rate": 9.145024840312278e-05, + "loss": 0.007564665377140045, + "step": 60260 + }, + { + "epoch": 8.555003548616039, + "grad_norm": 0.3790028691291809, + "learning_rate": 9.144882895670689e-05, + "loss": 0.026224061846733093, + "step": 60270 + }, + { + "epoch": 8.556422995031937, + "grad_norm": 3.474783182144165, + "learning_rate": 9.144740951029099e-05, + "loss": 0.015592548251152038, + "step": 60280 + }, + { + "epoch": 8.557842441447836, + "grad_norm": 7.350448131561279, + "learning_rate": 9.14459900638751e-05, + "loss": 0.05067678689956665, + "step": 60290 + }, + { + "epoch": 8.559261887863734, + "grad_norm": 7.781404972076416, + "learning_rate": 9.14445706174592e-05, + "loss": 0.06740889549255372, + "step": 60300 + }, + { + "epoch": 8.56068133427963, + "grad_norm": 0.024709496647119522, + "learning_rate": 9.144315117104329e-05, + "loss": 0.03684686422348023, + "step": 60310 + }, + { + "epoch": 8.562100780695529, + "grad_norm": 0.09711408615112305, + "learning_rate": 9.14417317246274e-05, + "loss": 0.016073787212371828, + "step": 60320 + }, + { + "epoch": 8.563520227111427, + "grad_norm": 1.8171788454055786, + "learning_rate": 9.14403122782115e-05, + "loss": 0.0676068663597107, + "step": 60330 + }, + { + "epoch": 8.564939673527324, + "grad_norm": 7.71488094329834, + "learning_rate": 9.143889283179561e-05, + "loss": 0.08123016357421875, + "step": 60340 + }, + { + "epoch": 8.566359119943222, + "grad_norm": 7.40566873550415, + "learning_rate": 9.14374733853797e-05, + "loss": 
0.03387263715267182, + "step": 60350 + }, + { + "epoch": 8.56777856635912, + "grad_norm": 0.3361523449420929, + "learning_rate": 9.14360539389638e-05, + "loss": 0.0076973557472229, + "step": 60360 + }, + { + "epoch": 8.569198012775018, + "grad_norm": 0.7388091683387756, + "learning_rate": 9.14346344925479e-05, + "loss": 0.023807825148105623, + "step": 60370 + }, + { + "epoch": 8.570617459190915, + "grad_norm": 9.611340522766113, + "learning_rate": 9.143321504613201e-05, + "loss": 0.016511398553848266, + "step": 60380 + }, + { + "epoch": 8.572036905606813, + "grad_norm": 0.3703271746635437, + "learning_rate": 9.143179559971613e-05, + "loss": 0.03306230902671814, + "step": 60390 + }, + { + "epoch": 8.573456352022712, + "grad_norm": 0.2659400999546051, + "learning_rate": 9.143037615330021e-05, + "loss": 0.02672841548919678, + "step": 60400 + }, + { + "epoch": 8.574875798438608, + "grad_norm": 0.2967086136341095, + "learning_rate": 9.142895670688432e-05, + "loss": 0.027674263715744017, + "step": 60410 + }, + { + "epoch": 8.576295244854506, + "grad_norm": 0.7257947325706482, + "learning_rate": 9.142753726046842e-05, + "loss": 0.04073569178581238, + "step": 60420 + }, + { + "epoch": 8.577714691270405, + "grad_norm": 0.048524580895900726, + "learning_rate": 9.142611781405253e-05, + "loss": 0.056384187936782834, + "step": 60430 + }, + { + "epoch": 8.579134137686303, + "grad_norm": 7.515527725219727, + "learning_rate": 9.142469836763663e-05, + "loss": 0.024915623664855956, + "step": 60440 + }, + { + "epoch": 8.5805535841022, + "grad_norm": 3.3118956089019775, + "learning_rate": 9.142327892122072e-05, + "loss": 0.03472829461097717, + "step": 60450 + }, + { + "epoch": 8.581973030518098, + "grad_norm": 4.994110584259033, + "learning_rate": 9.142185947480482e-05, + "loss": 0.04696405827999115, + "step": 60460 + }, + { + "epoch": 8.583392476933996, + "grad_norm": 6.274696350097656, + "learning_rate": 9.142044002838893e-05, + "loss": 0.05722887516021728, + "step": 60470 + }, + { 
+ "epoch": 8.584811923349893, + "grad_norm": 2.1877236366271973, + "learning_rate": 9.141902058197304e-05, + "loss": 0.07386709451675415, + "step": 60480 + }, + { + "epoch": 8.586231369765791, + "grad_norm": 3.222991704940796, + "learning_rate": 9.141760113555714e-05, + "loss": 0.06314558982849121, + "step": 60490 + }, + { + "epoch": 8.58765081618169, + "grad_norm": 0.050618402659893036, + "learning_rate": 9.141618168914125e-05, + "loss": 0.04251847565174103, + "step": 60500 + }, + { + "epoch": 8.58765081618169, + "eval_accuracy": 0.9722133909836587, + "eval_loss": 0.09201868623495102, + "eval_runtime": 33.2517, + "eval_samples_per_second": 472.969, + "eval_steps_per_second": 14.796, + "step": 60500 + }, + { + "epoch": 8.589070262597588, + "grad_norm": 6.501929759979248, + "learning_rate": 9.141476224272533e-05, + "loss": 0.025920677185058593, + "step": 60510 + }, + { + "epoch": 8.590489709013484, + "grad_norm": 1.283429741859436, + "learning_rate": 9.141334279630945e-05, + "loss": 0.04298240840435028, + "step": 60520 + }, + { + "epoch": 8.591909155429382, + "grad_norm": 0.33241111040115356, + "learning_rate": 9.141192334989354e-05, + "loss": 0.07493058443069459, + "step": 60530 + }, + { + "epoch": 8.59332860184528, + "grad_norm": 5.392263412475586, + "learning_rate": 9.141050390347765e-05, + "loss": 0.057287472486495974, + "step": 60540 + }, + { + "epoch": 8.594748048261177, + "grad_norm": 2.195725679397583, + "learning_rate": 9.140908445706175e-05, + "loss": 0.05433647036552429, + "step": 60550 + }, + { + "epoch": 8.596167494677076, + "grad_norm": 4.504883766174316, + "learning_rate": 9.140766501064585e-05, + "loss": 0.0373142272233963, + "step": 60560 + }, + { + "epoch": 8.597586941092974, + "grad_norm": 5.910608768463135, + "learning_rate": 9.140624556422996e-05, + "loss": 0.06574462652206421, + "step": 60570 + }, + { + "epoch": 8.599006387508872, + "grad_norm": 0.08602695912122726, + "learning_rate": 9.140482611781406e-05, + "loss": 0.1206291913986206, + 
"step": 60580 + }, + { + "epoch": 8.600425833924769, + "grad_norm": 4.0139875411987305, + "learning_rate": 9.140340667139817e-05, + "loss": 0.046517929434776305, + "step": 60590 + }, + { + "epoch": 8.601845280340667, + "grad_norm": 1.3188396692276, + "learning_rate": 9.140198722498227e-05, + "loss": 0.02969059944152832, + "step": 60600 + }, + { + "epoch": 8.603264726756565, + "grad_norm": 0.9926785826683044, + "learning_rate": 9.140056777856636e-05, + "loss": 0.03127261996269226, + "step": 60610 + }, + { + "epoch": 8.604684173172462, + "grad_norm": 2.0638065338134766, + "learning_rate": 9.139914833215046e-05, + "loss": 0.07006618976593018, + "step": 60620 + }, + { + "epoch": 8.60610361958836, + "grad_norm": 0.8679757118225098, + "learning_rate": 9.139772888573457e-05, + "loss": 0.022166214883327484, + "step": 60630 + }, + { + "epoch": 8.607523066004259, + "grad_norm": 0.7310284376144409, + "learning_rate": 9.139630943931867e-05, + "loss": 0.027963387966156005, + "step": 60640 + }, + { + "epoch": 8.608942512420157, + "grad_norm": 0.1053808331489563, + "learning_rate": 9.139488999290278e-05, + "loss": 0.0790288507938385, + "step": 60650 + }, + { + "epoch": 8.610361958836053, + "grad_norm": 3.352461338043213, + "learning_rate": 9.139347054648688e-05, + "loss": 0.05328698754310608, + "step": 60660 + }, + { + "epoch": 8.611781405251952, + "grad_norm": 0.1270398646593094, + "learning_rate": 9.139205110007097e-05, + "loss": 0.027239418029785155, + "step": 60670 + }, + { + "epoch": 8.61320085166785, + "grad_norm": 0.03895813971757889, + "learning_rate": 9.139063165365509e-05, + "loss": 0.02223038524389267, + "step": 60680 + }, + { + "epoch": 8.614620298083747, + "grad_norm": 0.4873507618904114, + "learning_rate": 9.138921220723918e-05, + "loss": 0.063723224401474, + "step": 60690 + }, + { + "epoch": 8.616039744499645, + "grad_norm": 0.9177292585372925, + "learning_rate": 9.13877927608233e-05, + "loss": 0.006845385581254959, + "step": 60700 + }, + { + "epoch": 
8.617459190915543, + "grad_norm": 0.0890699028968811, + "learning_rate": 9.138637331440738e-05, + "loss": 0.04476172029972077, + "step": 60710 + }, + { + "epoch": 8.618878637331441, + "grad_norm": 0.1363246887922287, + "learning_rate": 9.138495386799149e-05, + "loss": 0.06530548930168152, + "step": 60720 + }, + { + "epoch": 8.620298083747338, + "grad_norm": 4.468372821807861, + "learning_rate": 9.138353442157559e-05, + "loss": 0.05170977711677551, + "step": 60730 + }, + { + "epoch": 8.621717530163236, + "grad_norm": 12.12729549407959, + "learning_rate": 9.13821149751597e-05, + "loss": 0.06713943481445313, + "step": 60740 + }, + { + "epoch": 8.623136976579135, + "grad_norm": 0.13509313762187958, + "learning_rate": 9.13806955287438e-05, + "loss": 0.0637391209602356, + "step": 60750 + }, + { + "epoch": 8.624556422995031, + "grad_norm": 0.43068927526474, + "learning_rate": 9.137927608232789e-05, + "loss": 0.03880758285522461, + "step": 60760 + }, + { + "epoch": 8.62597586941093, + "grad_norm": 2.7275197505950928, + "learning_rate": 9.1377856635912e-05, + "loss": 0.02224656194448471, + "step": 60770 + }, + { + "epoch": 8.627395315826828, + "grad_norm": 1.675925850868225, + "learning_rate": 9.13764371894961e-05, + "loss": 0.01180570274591446, + "step": 60780 + }, + { + "epoch": 8.628814762242726, + "grad_norm": 0.25846660137176514, + "learning_rate": 9.137501774308021e-05, + "loss": 0.02627829313278198, + "step": 60790 + }, + { + "epoch": 8.630234208658623, + "grad_norm": 7.447933673858643, + "learning_rate": 9.137359829666431e-05, + "loss": 0.0631419837474823, + "step": 60800 + }, + { + "epoch": 8.63165365507452, + "grad_norm": 0.4778624176979065, + "learning_rate": 9.13721788502484e-05, + "loss": 0.05850786566734314, + "step": 60810 + }, + { + "epoch": 8.63307310149042, + "grad_norm": 0.9950453042984009, + "learning_rate": 9.13707594038325e-05, + "loss": 0.02292805016040802, + "step": 60820 + }, + { + "epoch": 8.634492547906316, + "grad_norm": 0.47801464796066284, + 
"learning_rate": 9.136933995741661e-05, + "loss": 0.01733073443174362, + "step": 60830 + }, + { + "epoch": 8.635911994322214, + "grad_norm": 11.81622314453125, + "learning_rate": 9.136792051100071e-05, + "loss": 0.03141593337059021, + "step": 60840 + }, + { + "epoch": 8.637331440738112, + "grad_norm": 0.011311270296573639, + "learning_rate": 9.136650106458482e-05, + "loss": 0.008127608895301819, + "step": 60850 + }, + { + "epoch": 8.63875088715401, + "grad_norm": 8.19806957244873, + "learning_rate": 9.136508161816892e-05, + "loss": 0.03081195056438446, + "step": 60860 + }, + { + "epoch": 8.640170333569907, + "grad_norm": 0.19872935116291046, + "learning_rate": 9.136366217175302e-05, + "loss": 0.03248392045497894, + "step": 60870 + }, + { + "epoch": 8.641589779985805, + "grad_norm": 4.823294162750244, + "learning_rate": 9.136224272533713e-05, + "loss": 0.029790616035461424, + "step": 60880 + }, + { + "epoch": 8.643009226401704, + "grad_norm": 1.5712690353393555, + "learning_rate": 9.136082327892122e-05, + "loss": 0.03219999372959137, + "step": 60890 + }, + { + "epoch": 8.6444286728176, + "grad_norm": 2.975715160369873, + "learning_rate": 9.135940383250534e-05, + "loss": 0.05302545428276062, + "step": 60900 + }, + { + "epoch": 8.645848119233499, + "grad_norm": 1.2663664817810059, + "learning_rate": 9.135798438608943e-05, + "loss": 0.035529720783233645, + "step": 60910 + }, + { + "epoch": 8.647267565649397, + "grad_norm": 2.0028679370880127, + "learning_rate": 9.135656493967353e-05, + "loss": 0.040467509627342226, + "step": 60920 + }, + { + "epoch": 8.648687012065295, + "grad_norm": 0.7642605304718018, + "learning_rate": 9.135514549325763e-05, + "loss": 0.0312233567237854, + "step": 60930 + }, + { + "epoch": 8.650106458481192, + "grad_norm": 2.8044888973236084, + "learning_rate": 9.135372604684174e-05, + "loss": 0.045302554965019226, + "step": 60940 + }, + { + "epoch": 8.65152590489709, + "grad_norm": 11.276861190795898, + "learning_rate": 9.135230660042584e-05, + 
"loss": 0.02674412727355957, + "step": 60950 + }, + { + "epoch": 8.652945351312988, + "grad_norm": 5.281027317047119, + "learning_rate": 9.135088715400995e-05, + "loss": 0.044106674194335935, + "step": 60960 + }, + { + "epoch": 8.654364797728885, + "grad_norm": 7.230155944824219, + "learning_rate": 9.134946770759404e-05, + "loss": 0.025055408477783203, + "step": 60970 + }, + { + "epoch": 8.655784244144783, + "grad_norm": 0.44767463207244873, + "learning_rate": 9.134804826117814e-05, + "loss": 0.023888878524303436, + "step": 60980 + }, + { + "epoch": 8.657203690560682, + "grad_norm": 0.1281173974275589, + "learning_rate": 9.134662881476225e-05, + "loss": 0.02549477815628052, + "step": 60990 + }, + { + "epoch": 8.65862313697658, + "grad_norm": 0.732596755027771, + "learning_rate": 9.134520936834635e-05, + "loss": 0.05355830192565918, + "step": 61000 + }, + { + "epoch": 8.65862313697658, + "eval_accuracy": 0.9806066001144529, + "eval_loss": 0.06500012427568436, + "eval_runtime": 34.837, + "eval_samples_per_second": 451.445, + "eval_steps_per_second": 14.123, + "step": 61000 + }, + { + "epoch": 8.660042583392476, + "grad_norm": 0.9538720846176147, + "learning_rate": 9.134378992193046e-05, + "loss": 0.08146008253097534, + "step": 61010 + }, + { + "epoch": 8.661462029808375, + "grad_norm": 0.6794492602348328, + "learning_rate": 9.134237047551454e-05, + "loss": 0.013472935557365418, + "step": 61020 + }, + { + "epoch": 8.662881476224273, + "grad_norm": 0.09928761422634125, + "learning_rate": 9.134095102909866e-05, + "loss": 0.023411236703395844, + "step": 61030 + }, + { + "epoch": 8.66430092264017, + "grad_norm": 0.618294358253479, + "learning_rate": 9.133953158268275e-05, + "loss": 0.033431851863861085, + "step": 61040 + }, + { + "epoch": 8.665720369056068, + "grad_norm": 0.7553914189338684, + "learning_rate": 9.133811213626686e-05, + "loss": 0.06925633549690247, + "step": 61050 + }, + { + "epoch": 8.667139815471966, + "grad_norm": 0.39856305718421936, + "learning_rate": 
9.133669268985096e-05, + "loss": 0.072854083776474, + "step": 61060 + }, + { + "epoch": 8.668559261887864, + "grad_norm": 4.453004837036133, + "learning_rate": 9.133527324343506e-05, + "loss": 0.02382792830467224, + "step": 61070 + }, + { + "epoch": 8.669978708303761, + "grad_norm": 0.14222095906734467, + "learning_rate": 9.133385379701917e-05, + "loss": 0.044448480010032654, + "step": 61080 + }, + { + "epoch": 8.67139815471966, + "grad_norm": 0.8726344704627991, + "learning_rate": 9.133243435060327e-05, + "loss": 0.04786675274372101, + "step": 61090 + }, + { + "epoch": 8.672817601135558, + "grad_norm": 0.4254017174243927, + "learning_rate": 9.133101490418738e-05, + "loss": 0.042910799384117126, + "step": 61100 + }, + { + "epoch": 8.674237047551454, + "grad_norm": 0.20297612249851227, + "learning_rate": 9.132959545777148e-05, + "loss": 0.0331714004278183, + "step": 61110 + }, + { + "epoch": 8.675656493967352, + "grad_norm": 0.44168731570243835, + "learning_rate": 9.132817601135557e-05, + "loss": 0.06532721519470215, + "step": 61120 + }, + { + "epoch": 8.67707594038325, + "grad_norm": 0.5480031967163086, + "learning_rate": 9.132675656493967e-05, + "loss": 0.03251819014549255, + "step": 61130 + }, + { + "epoch": 8.678495386799149, + "grad_norm": 0.6780239939689636, + "learning_rate": 9.132533711852378e-05, + "loss": 0.029584136605262757, + "step": 61140 + }, + { + "epoch": 8.679914833215046, + "grad_norm": 7.8614630699157715, + "learning_rate": 9.132391767210788e-05, + "loss": 0.032931667566299436, + "step": 61150 + }, + { + "epoch": 8.681334279630944, + "grad_norm": 0.2410171777009964, + "learning_rate": 9.132249822569199e-05, + "loss": 0.0210477352142334, + "step": 61160 + }, + { + "epoch": 8.682753726046842, + "grad_norm": 0.20945946872234344, + "learning_rate": 9.132107877927609e-05, + "loss": 0.06597599387168884, + "step": 61170 + }, + { + "epoch": 8.684173172462739, + "grad_norm": 0.11647208780050278, + "learning_rate": 9.131965933286018e-05, + "loss": 
0.020939578115940095, + "step": 61180 + }, + { + "epoch": 8.685592618878637, + "grad_norm": 0.26495155692100525, + "learning_rate": 9.13182398864443e-05, + "loss": 0.036896157264709475, + "step": 61190 + }, + { + "epoch": 8.687012065294535, + "grad_norm": 0.42610204219818115, + "learning_rate": 9.131682044002839e-05, + "loss": 0.03127295076847077, + "step": 61200 + }, + { + "epoch": 8.688431511710434, + "grad_norm": 0.9225974678993225, + "learning_rate": 9.13154009936125e-05, + "loss": 0.03571774959564209, + "step": 61210 + }, + { + "epoch": 8.68985095812633, + "grad_norm": 6.106173992156982, + "learning_rate": 9.13139815471966e-05, + "loss": 0.05818299055099487, + "step": 61220 + }, + { + "epoch": 8.691270404542228, + "grad_norm": 0.33078286051750183, + "learning_rate": 9.13125621007807e-05, + "loss": 0.05615794658660889, + "step": 61230 + }, + { + "epoch": 8.692689850958127, + "grad_norm": 2.8934178352355957, + "learning_rate": 9.13111426543648e-05, + "loss": 0.05344209671020508, + "step": 61240 + }, + { + "epoch": 8.694109297374023, + "grad_norm": 0.8880506157875061, + "learning_rate": 9.13097232079489e-05, + "loss": 0.025896552205085754, + "step": 61250 + }, + { + "epoch": 8.695528743789922, + "grad_norm": 0.15858466923236847, + "learning_rate": 9.1308303761533e-05, + "loss": 0.05899338126182556, + "step": 61260 + }, + { + "epoch": 8.69694819020582, + "grad_norm": 0.5504029989242554, + "learning_rate": 9.130688431511711e-05, + "loss": 0.00966411828994751, + "step": 61270 + }, + { + "epoch": 8.698367636621718, + "grad_norm": 12.35024642944336, + "learning_rate": 9.130546486870121e-05, + "loss": 0.024048765003681184, + "step": 61280 + }, + { + "epoch": 8.699787083037615, + "grad_norm": 0.01407754234969616, + "learning_rate": 9.130404542228531e-05, + "loss": 0.012420719116926193, + "step": 61290 + }, + { + "epoch": 8.701206529453513, + "grad_norm": 5.070805072784424, + "learning_rate": 9.130262597586942e-05, + "loss": 0.05343193411827087, + "step": 61300 + }, + { 
+ "epoch": 8.702625975869411, + "grad_norm": 0.9735597372055054, + "learning_rate": 9.130120652945352e-05, + "loss": 0.05009015202522278, + "step": 61310 + }, + { + "epoch": 8.704045422285308, + "grad_norm": 0.05359075218439102, + "learning_rate": 9.129978708303763e-05, + "loss": 0.041198867559432986, + "step": 61320 + }, + { + "epoch": 8.705464868701206, + "grad_norm": 0.19079919159412384, + "learning_rate": 9.129836763662171e-05, + "loss": 0.11265591382980347, + "step": 61330 + }, + { + "epoch": 8.706884315117104, + "grad_norm": 0.39164605736732483, + "learning_rate": 9.129694819020582e-05, + "loss": 0.09336072206497192, + "step": 61340 + }, + { + "epoch": 8.708303761533003, + "grad_norm": 0.1398046314716339, + "learning_rate": 9.129552874378992e-05, + "loss": 0.02090988904237747, + "step": 61350 + }, + { + "epoch": 8.7097232079489, + "grad_norm": 3.872218370437622, + "learning_rate": 9.129410929737403e-05, + "loss": 0.014080584049224854, + "step": 61360 + }, + { + "epoch": 8.711142654364798, + "grad_norm": 2.848639726638794, + "learning_rate": 9.129268985095813e-05, + "loss": 0.028994157910346985, + "step": 61370 + }, + { + "epoch": 8.712562100780696, + "grad_norm": 1.4137672185897827, + "learning_rate": 9.129127040454223e-05, + "loss": 0.02338533103466034, + "step": 61380 + }, + { + "epoch": 8.713981547196592, + "grad_norm": 5.973514080047607, + "learning_rate": 9.128985095812634e-05, + "loss": 0.06741483807563782, + "step": 61390 + }, + { + "epoch": 8.71540099361249, + "grad_norm": 0.1293167769908905, + "learning_rate": 9.128843151171043e-05, + "loss": 0.04931153953075409, + "step": 61400 + }, + { + "epoch": 8.716820440028389, + "grad_norm": 4.29320764541626, + "learning_rate": 9.128701206529455e-05, + "loss": 0.0691063404083252, + "step": 61410 + }, + { + "epoch": 8.718239886444287, + "grad_norm": 3.396770477294922, + "learning_rate": 9.128559261887864e-05, + "loss": 0.03781647980213165, + "step": 61420 + }, + { + "epoch": 8.719659332860184, + "grad_norm": 
11.031145095825195, + "learning_rate": 9.128417317246274e-05, + "loss": 0.015141361951828003, + "step": 61430 + }, + { + "epoch": 8.721078779276082, + "grad_norm": 1.7965271472930908, + "learning_rate": 9.128275372604684e-05, + "loss": 0.01115918904542923, + "step": 61440 + }, + { + "epoch": 8.72249822569198, + "grad_norm": 0.16115064918994904, + "learning_rate": 9.128133427963095e-05, + "loss": 0.03681345283985138, + "step": 61450 + }, + { + "epoch": 8.723917672107877, + "grad_norm": 0.4038357436656952, + "learning_rate": 9.127991483321505e-05, + "loss": 0.05916250944137573, + "step": 61460 + }, + { + "epoch": 8.725337118523775, + "grad_norm": 0.09256431460380554, + "learning_rate": 9.127849538679916e-05, + "loss": 0.01472775936126709, + "step": 61470 + }, + { + "epoch": 8.726756564939674, + "grad_norm": 10.232614517211914, + "learning_rate": 9.127707594038325e-05, + "loss": 0.07860915660858155, + "step": 61480 + }, + { + "epoch": 8.728176011355572, + "grad_norm": 0.18803325295448303, + "learning_rate": 9.127565649396735e-05, + "loss": 0.03942298591136932, + "step": 61490 + }, + { + "epoch": 8.729595457771469, + "grad_norm": 1.1620979309082031, + "learning_rate": 9.127423704755146e-05, + "loss": 0.016197699308395385, + "step": 61500 + }, + { + "epoch": 8.729595457771469, + "eval_accuracy": 0.9816875437146309, + "eval_loss": 0.059635695070028305, + "eval_runtime": 33.2881, + "eval_samples_per_second": 472.451, + "eval_steps_per_second": 14.78, + "step": 61500 + }, + { + "epoch": 8.731014904187367, + "grad_norm": 1.1357897520065308, + "learning_rate": 9.127281760113556e-05, + "loss": 0.03907504975795746, + "step": 61510 + }, + { + "epoch": 8.732434350603265, + "grad_norm": 0.44886356592178345, + "learning_rate": 9.127139815471967e-05, + "loss": 0.031098437309265137, + "step": 61520 + }, + { + "epoch": 8.733853797019162, + "grad_norm": 0.46833470463752747, + "learning_rate": 9.126997870830376e-05, + "loss": 0.05453131794929504, + "step": 61530 + }, + { + "epoch": 
8.73527324343506, + "grad_norm": 6.623453140258789, + "learning_rate": 9.126855926188787e-05, + "loss": 0.026876044273376466, + "step": 61540 + }, + { + "epoch": 8.736692689850958, + "grad_norm": 2.4892704486846924, + "learning_rate": 9.126713981547196e-05, + "loss": 0.03759989440441132, + "step": 61550 + }, + { + "epoch": 8.738112136266857, + "grad_norm": 5.195185661315918, + "learning_rate": 9.126572036905607e-05, + "loss": 0.014632970094680786, + "step": 61560 + }, + { + "epoch": 8.739531582682753, + "grad_norm": 1.108044981956482, + "learning_rate": 9.126430092264017e-05, + "loss": 0.045201820135116574, + "step": 61570 + }, + { + "epoch": 8.740951029098651, + "grad_norm": 5.98253870010376, + "learning_rate": 9.126288147622428e-05, + "loss": 0.017468076944351197, + "step": 61580 + }, + { + "epoch": 8.74237047551455, + "grad_norm": 3.968717336654663, + "learning_rate": 9.126146202980838e-05, + "loss": 0.03230096399784088, + "step": 61590 + }, + { + "epoch": 8.743789921930446, + "grad_norm": 0.03290829062461853, + "learning_rate": 9.126004258339248e-05, + "loss": 0.0632810652256012, + "step": 61600 + }, + { + "epoch": 8.745209368346345, + "grad_norm": 11.86566162109375, + "learning_rate": 9.125862313697659e-05, + "loss": 0.05286848545074463, + "step": 61610 + }, + { + "epoch": 8.746628814762243, + "grad_norm": 2.780843734741211, + "learning_rate": 9.125720369056069e-05, + "loss": 0.06637284755706788, + "step": 61620 + }, + { + "epoch": 8.748048261178141, + "grad_norm": 1.4611948728561401, + "learning_rate": 9.12557842441448e-05, + "loss": 0.06998544335365295, + "step": 61630 + }, + { + "epoch": 8.749467707594038, + "grad_norm": 13.01425838470459, + "learning_rate": 9.125436479772888e-05, + "loss": 0.038280272483825685, + "step": 61640 + }, + { + "epoch": 8.750887154009936, + "grad_norm": 0.677115797996521, + "learning_rate": 9.125294535131299e-05, + "loss": 0.02972142696380615, + "step": 61650 + }, + { + "epoch": 8.752306600425834, + "grad_norm": 
6.362784385681152, + "learning_rate": 9.125152590489709e-05, + "loss": 0.059755778312683104, + "step": 61660 + }, + { + "epoch": 8.75372604684173, + "grad_norm": 1.1002269983291626, + "learning_rate": 9.12501064584812e-05, + "loss": 0.03250996172428131, + "step": 61670 + }, + { + "epoch": 8.75514549325763, + "grad_norm": 0.5199268460273743, + "learning_rate": 9.12486870120653e-05, + "loss": 0.01646912842988968, + "step": 61680 + }, + { + "epoch": 8.756564939673527, + "grad_norm": 4.037311553955078, + "learning_rate": 9.12472675656494e-05, + "loss": 0.018910709023475646, + "step": 61690 + }, + { + "epoch": 8.757984386089426, + "grad_norm": 5.73612117767334, + "learning_rate": 9.12458481192335e-05, + "loss": 0.0628777265548706, + "step": 61700 + }, + { + "epoch": 8.759403832505322, + "grad_norm": 0.10442260652780533, + "learning_rate": 9.12444286728176e-05, + "loss": 0.013911408185958863, + "step": 61710 + }, + { + "epoch": 8.76082327892122, + "grad_norm": 0.38511592149734497, + "learning_rate": 9.124300922640171e-05, + "loss": 0.015263143181800842, + "step": 61720 + }, + { + "epoch": 8.762242725337119, + "grad_norm": 1.906474232673645, + "learning_rate": 9.124158977998581e-05, + "loss": 0.01780613958835602, + "step": 61730 + }, + { + "epoch": 8.763662171753015, + "grad_norm": 8.139535903930664, + "learning_rate": 9.124017033356991e-05, + "loss": 0.037744688987731936, + "step": 61740 + }, + { + "epoch": 8.765081618168914, + "grad_norm": 10.752461433410645, + "learning_rate": 9.1238750887154e-05, + "loss": 0.03349531888961792, + "step": 61750 + }, + { + "epoch": 8.766501064584812, + "grad_norm": 1.434235692024231, + "learning_rate": 9.123733144073812e-05, + "loss": 0.02006615549325943, + "step": 61760 + }, + { + "epoch": 8.76792051100071, + "grad_norm": 2.3497636318206787, + "learning_rate": 9.123591199432221e-05, + "loss": 0.042415973544120786, + "step": 61770 + }, + { + "epoch": 8.769339957416607, + "grad_norm": 12.238152503967285, + "learning_rate": 
9.123449254790632e-05, + "loss": 0.06715450882911682, + "step": 61780 + }, + { + "epoch": 8.770759403832505, + "grad_norm": 4.415995121002197, + "learning_rate": 9.123307310149042e-05, + "loss": 0.03555461466312408, + "step": 61790 + }, + { + "epoch": 8.772178850248403, + "grad_norm": 0.2075185477733612, + "learning_rate": 9.123165365507452e-05, + "loss": 0.048576629161834715, + "step": 61800 + }, + { + "epoch": 8.7735982966643, + "grad_norm": 0.8062130808830261, + "learning_rate": 9.123023420865863e-05, + "loss": 0.0283250629901886, + "step": 61810 + }, + { + "epoch": 8.775017743080198, + "grad_norm": 0.5037146210670471, + "learning_rate": 9.122881476224273e-05, + "loss": 0.006582640111446381, + "step": 61820 + }, + { + "epoch": 8.776437189496097, + "grad_norm": 7.7386932373046875, + "learning_rate": 9.122739531582684e-05, + "loss": 0.058837884664535524, + "step": 61830 + }, + { + "epoch": 8.777856635911995, + "grad_norm": 9.621651649475098, + "learning_rate": 9.122597586941092e-05, + "loss": 0.019973933696746826, + "step": 61840 + }, + { + "epoch": 8.779276082327891, + "grad_norm": 1.2101565599441528, + "learning_rate": 9.122455642299503e-05, + "loss": 0.029185032844543456, + "step": 61850 + }, + { + "epoch": 8.78069552874379, + "grad_norm": 4.3968706130981445, + "learning_rate": 9.122313697657913e-05, + "loss": 0.06037144064903259, + "step": 61860 + }, + { + "epoch": 8.782114975159688, + "grad_norm": 0.4677600860595703, + "learning_rate": 9.122171753016324e-05, + "loss": 0.03795049488544464, + "step": 61870 + }, + { + "epoch": 8.783534421575585, + "grad_norm": 0.128379687666893, + "learning_rate": 9.122029808374735e-05, + "loss": 0.07420201301574707, + "step": 61880 + }, + { + "epoch": 8.784953867991483, + "grad_norm": 0.24324475228786469, + "learning_rate": 9.121887863733144e-05, + "loss": 0.013025546073913574, + "step": 61890 + }, + { + "epoch": 8.786373314407381, + "grad_norm": 0.14116889238357544, + "learning_rate": 9.121745919091555e-05, + "loss": 
0.017498120665550232, + "step": 61900 + }, + { + "epoch": 8.78779276082328, + "grad_norm": 0.6224940419197083, + "learning_rate": 9.121603974449965e-05, + "loss": 0.028740760684013367, + "step": 61910 + }, + { + "epoch": 8.789212207239176, + "grad_norm": 0.3105393350124359, + "learning_rate": 9.121462029808376e-05, + "loss": 0.02539215385913849, + "step": 61920 + }, + { + "epoch": 8.790631653655074, + "grad_norm": 0.5749163627624512, + "learning_rate": 9.121320085166785e-05, + "loss": 0.030741649866104125, + "step": 61930 + }, + { + "epoch": 8.792051100070973, + "grad_norm": 7.524410247802734, + "learning_rate": 9.121178140525196e-05, + "loss": 0.031424522399902344, + "step": 61940 + }, + { + "epoch": 8.79347054648687, + "grad_norm": 4.195517539978027, + "learning_rate": 9.121036195883605e-05, + "loss": 0.02137700915336609, + "step": 61950 + }, + { + "epoch": 8.794889992902768, + "grad_norm": 8.744084358215332, + "learning_rate": 9.120894251242016e-05, + "loss": 0.11366484165191651, + "step": 61960 + }, + { + "epoch": 8.796309439318666, + "grad_norm": 0.06890244036912918, + "learning_rate": 9.120752306600427e-05, + "loss": 0.03876006603240967, + "step": 61970 + }, + { + "epoch": 8.797728885734564, + "grad_norm": 0.25812605023384094, + "learning_rate": 9.120610361958837e-05, + "loss": 0.06622171998023987, + "step": 61980 + }, + { + "epoch": 8.79914833215046, + "grad_norm": 0.4101802408695221, + "learning_rate": 9.120468417317248e-05, + "loss": 0.0641588568687439, + "step": 61990 + }, + { + "epoch": 8.800567778566359, + "grad_norm": 0.04798253998160362, + "learning_rate": 9.120326472675656e-05, + "loss": 0.03991687297821045, + "step": 62000 + }, + { + "epoch": 8.800567778566359, + "eval_accuracy": 0.9803522604438227, + "eval_loss": 0.06005360186100006, + "eval_runtime": 32.7621, + "eval_samples_per_second": 480.036, + "eval_steps_per_second": 15.017, + "step": 62000 + }, + { + "epoch": 8.801987224982257, + "grad_norm": 0.8698391318321228, + "learning_rate": 
9.120184528034067e-05, + "loss": 0.05466843247413635, + "step": 62010 + }, + { + "epoch": 8.803406671398154, + "grad_norm": 0.9279050230979919, + "learning_rate": 9.120042583392477e-05, + "loss": 0.02461112439632416, + "step": 62020 + }, + { + "epoch": 8.804826117814052, + "grad_norm": 0.347493976354599, + "learning_rate": 9.119900638750888e-05, + "loss": 0.04751765727996826, + "step": 62030 + }, + { + "epoch": 8.80624556422995, + "grad_norm": 0.16119427978992462, + "learning_rate": 9.119758694109298e-05, + "loss": 0.04457222819328308, + "step": 62040 + }, + { + "epoch": 8.807665010645849, + "grad_norm": 0.16925406455993652, + "learning_rate": 9.119616749467708e-05, + "loss": 0.013092260062694549, + "step": 62050 + }, + { + "epoch": 8.809084457061745, + "grad_norm": 0.44262468814849854, + "learning_rate": 9.119474804826119e-05, + "loss": 0.030343493819236754, + "step": 62060 + }, + { + "epoch": 8.810503903477644, + "grad_norm": 2.201153516769409, + "learning_rate": 9.119332860184528e-05, + "loss": 0.052234995365142825, + "step": 62070 + }, + { + "epoch": 8.811923349893542, + "grad_norm": 0.5974806547164917, + "learning_rate": 9.11919091554294e-05, + "loss": 0.027004152536392212, + "step": 62080 + }, + { + "epoch": 8.813342796309438, + "grad_norm": 1.2625153064727783, + "learning_rate": 9.119048970901349e-05, + "loss": 0.04115771949291229, + "step": 62090 + }, + { + "epoch": 8.814762242725337, + "grad_norm": 5.913887977600098, + "learning_rate": 9.118907026259759e-05, + "loss": 0.034026145935058594, + "step": 62100 + }, + { + "epoch": 8.816181689141235, + "grad_norm": 1.1757041215896606, + "learning_rate": 9.118765081618169e-05, + "loss": 0.07286719679832458, + "step": 62110 + }, + { + "epoch": 8.817601135557133, + "grad_norm": 0.8042858839035034, + "learning_rate": 9.11862313697658e-05, + "loss": 0.04174315929412842, + "step": 62120 + }, + { + "epoch": 8.81902058197303, + "grad_norm": 0.46218520402908325, + "learning_rate": 9.11848119233499e-05, + "loss": 
0.019040848314762115, + "step": 62130 + }, + { + "epoch": 8.820440028388928, + "grad_norm": 2.684968948364258, + "learning_rate": 9.1183392476934e-05, + "loss": 0.06549656391143799, + "step": 62140 + }, + { + "epoch": 8.821859474804826, + "grad_norm": 12.88553524017334, + "learning_rate": 9.11819730305181e-05, + "loss": 0.07638974785804749, + "step": 62150 + }, + { + "epoch": 8.823278921220723, + "grad_norm": 4.401943683624268, + "learning_rate": 9.11805535841022e-05, + "loss": 0.04739658534526825, + "step": 62160 + }, + { + "epoch": 8.824698367636621, + "grad_norm": 2.7277910709381104, + "learning_rate": 9.117913413768631e-05, + "loss": 0.04343520998954773, + "step": 62170 + }, + { + "epoch": 8.82611781405252, + "grad_norm": 0.1369256228208542, + "learning_rate": 9.117771469127041e-05, + "loss": 0.01796092838048935, + "step": 62180 + }, + { + "epoch": 8.827537260468418, + "grad_norm": 4.254459857940674, + "learning_rate": 9.117629524485452e-05, + "loss": 0.018354178965091707, + "step": 62190 + }, + { + "epoch": 8.828956706884314, + "grad_norm": 3.2595226764678955, + "learning_rate": 9.11748757984386e-05, + "loss": 0.03273018896579742, + "step": 62200 + }, + { + "epoch": 8.830376153300213, + "grad_norm": 2.787855625152588, + "learning_rate": 9.117345635202272e-05, + "loss": 0.08703206777572632, + "step": 62210 + }, + { + "epoch": 8.831795599716111, + "grad_norm": 1.6568021774291992, + "learning_rate": 9.117203690560681e-05, + "loss": 0.03767965137958527, + "step": 62220 + }, + { + "epoch": 8.833215046132008, + "grad_norm": 0.16803374886512756, + "learning_rate": 9.117061745919092e-05, + "loss": 0.015233394503593446, + "step": 62230 + }, + { + "epoch": 8.834634492547906, + "grad_norm": 5.4319024085998535, + "learning_rate": 9.116919801277502e-05, + "loss": 0.034356406331062316, + "step": 62240 + }, + { + "epoch": 8.836053938963804, + "grad_norm": 0.8553333878517151, + "learning_rate": 9.116777856635912e-05, + "loss": 0.033635425567626956, + "step": 62250 + }, + { + 
"epoch": 8.837473385379703, + "grad_norm": 11.308504104614258, + "learning_rate": 9.116635911994323e-05, + "loss": 0.033365875482559204, + "step": 62260 + }, + { + "epoch": 8.838892831795599, + "grad_norm": 2.8172361850738525, + "learning_rate": 9.116493967352733e-05, + "loss": 0.021276645362377167, + "step": 62270 + }, + { + "epoch": 8.840312278211497, + "grad_norm": 1.633204698562622, + "learning_rate": 9.116352022711144e-05, + "loss": 0.03485658168792725, + "step": 62280 + }, + { + "epoch": 8.841731724627396, + "grad_norm": 1.5068836212158203, + "learning_rate": 9.116210078069554e-05, + "loss": 0.03303306102752686, + "step": 62290 + }, + { + "epoch": 8.843151171043292, + "grad_norm": 2.799574613571167, + "learning_rate": 9.116068133427965e-05, + "loss": 0.04048182368278504, + "step": 62300 + }, + { + "epoch": 8.84457061745919, + "grad_norm": 5.927804946899414, + "learning_rate": 9.115926188786373e-05, + "loss": 0.03790929913520813, + "step": 62310 + }, + { + "epoch": 8.845990063875089, + "grad_norm": 0.35564693808555603, + "learning_rate": 9.115784244144784e-05, + "loss": 0.019379837810993193, + "step": 62320 + }, + { + "epoch": 8.847409510290987, + "grad_norm": 12.212952613830566, + "learning_rate": 9.115642299503194e-05, + "loss": 0.0662179708480835, + "step": 62330 + }, + { + "epoch": 8.848828956706884, + "grad_norm": 9.45881175994873, + "learning_rate": 9.115500354861605e-05, + "loss": 0.06972289681434632, + "step": 62340 + }, + { + "epoch": 8.850248403122782, + "grad_norm": 0.12846651673316956, + "learning_rate": 9.115358410220015e-05, + "loss": 0.030337435007095338, + "step": 62350 + }, + { + "epoch": 8.85166784953868, + "grad_norm": 0.08279696851968765, + "learning_rate": 9.115216465578424e-05, + "loss": 0.029838696122169495, + "step": 62360 + }, + { + "epoch": 8.853087295954577, + "grad_norm": 4.972554683685303, + "learning_rate": 9.115074520936835e-05, + "loss": 0.015542306005954742, + "step": 62370 + }, + { + "epoch": 8.854506742370475, + "grad_norm": 
2.105909824371338, + "learning_rate": 9.114932576295245e-05, + "loss": 0.013818117976188659, + "step": 62380 + }, + { + "epoch": 8.855926188786373, + "grad_norm": 0.3148862421512604, + "learning_rate": 9.114790631653656e-05, + "loss": 0.054498547315597536, + "step": 62390 + }, + { + "epoch": 8.857345635202272, + "grad_norm": 3.0089094638824463, + "learning_rate": 9.114648687012066e-05, + "loss": 0.03047267496585846, + "step": 62400 + }, + { + "epoch": 8.858765081618168, + "grad_norm": 0.2803598642349243, + "learning_rate": 9.114506742370476e-05, + "loss": 0.01550055593252182, + "step": 62410 + }, + { + "epoch": 8.860184528034067, + "grad_norm": 0.1116773933172226, + "learning_rate": 9.114364797728886e-05, + "loss": 0.02507326602935791, + "step": 62420 + }, + { + "epoch": 8.861603974449965, + "grad_norm": 6.727297782897949, + "learning_rate": 9.114222853087297e-05, + "loss": 0.06734004020690917, + "step": 62430 + }, + { + "epoch": 8.863023420865863, + "grad_norm": 0.10710111260414124, + "learning_rate": 9.114080908445706e-05, + "loss": 0.030666384100914, + "step": 62440 + }, + { + "epoch": 8.86444286728176, + "grad_norm": 5.24462890625, + "learning_rate": 9.113938963804117e-05, + "loss": 0.056045109033584596, + "step": 62450 + }, + { + "epoch": 8.865862313697658, + "grad_norm": 0.958200216293335, + "learning_rate": 9.113797019162527e-05, + "loss": 0.024746541678905488, + "step": 62460 + }, + { + "epoch": 8.867281760113556, + "grad_norm": 0.7351173758506775, + "learning_rate": 9.113655074520937e-05, + "loss": 0.05286588668823242, + "step": 62470 + }, + { + "epoch": 8.868701206529453, + "grad_norm": 9.957810401916504, + "learning_rate": 9.113513129879348e-05, + "loss": 0.035145890712738034, + "step": 62480 + }, + { + "epoch": 8.870120652945351, + "grad_norm": 0.04390246421098709, + "learning_rate": 9.113371185237758e-05, + "loss": 0.019046881794929506, + "step": 62490 + }, + { + "epoch": 8.87154009936125, + "grad_norm": 0.08628535270690918, + "learning_rate": 
9.113229240596169e-05, + "loss": 0.05443682670593262, + "step": 62500 + }, + { + "epoch": 8.87154009936125, + "eval_accuracy": 0.9782539581611241, + "eval_loss": 0.07185545563697815, + "eval_runtime": 32.8112, + "eval_samples_per_second": 479.317, + "eval_steps_per_second": 14.995, + "step": 62500 + }, + { + "epoch": 8.872959545777148, + "grad_norm": 1.975903034210205, + "learning_rate": 9.113087295954577e-05, + "loss": 0.049159616231918335, + "step": 62510 + }, + { + "epoch": 8.874378992193044, + "grad_norm": 0.22022274136543274, + "learning_rate": 9.112945351312988e-05, + "loss": 0.015442782640457153, + "step": 62520 + }, + { + "epoch": 8.875798438608943, + "grad_norm": 0.8125544786453247, + "learning_rate": 9.112803406671398e-05, + "loss": 0.02303740233182907, + "step": 62530 + }, + { + "epoch": 8.87721788502484, + "grad_norm": 1.2327035665512085, + "learning_rate": 9.112661462029809e-05, + "loss": 0.034694111347198485, + "step": 62540 + }, + { + "epoch": 8.878637331440737, + "grad_norm": 4.138776779174805, + "learning_rate": 9.112519517388219e-05, + "loss": 0.07216629385948181, + "step": 62550 + }, + { + "epoch": 8.880056777856636, + "grad_norm": 5.914824485778809, + "learning_rate": 9.112377572746629e-05, + "loss": 0.0611856997013092, + "step": 62560 + }, + { + "epoch": 8.881476224272534, + "grad_norm": 1.282106637954712, + "learning_rate": 9.11223562810504e-05, + "loss": 0.021294346451759337, + "step": 62570 + }, + { + "epoch": 8.882895670688432, + "grad_norm": 7.624048709869385, + "learning_rate": 9.11209368346345e-05, + "loss": 0.040926402807235716, + "step": 62580 + }, + { + "epoch": 8.884315117104329, + "grad_norm": 3.448227882385254, + "learning_rate": 9.11195173882186e-05, + "loss": 0.00841144025325775, + "step": 62590 + }, + { + "epoch": 8.885734563520227, + "grad_norm": 10.487248420715332, + "learning_rate": 9.11180979418027e-05, + "loss": 0.10912501811981201, + "step": 62600 + }, + { + "epoch": 8.887154009936125, + "grad_norm": 0.008511621505022049, 
+ "learning_rate": 9.11166784953868e-05, + "loss": 0.07005314826965332, + "step": 62610 + }, + { + "epoch": 8.888573456352022, + "grad_norm": 0.409939169883728, + "learning_rate": 9.11152590489709e-05, + "loss": 0.05328459739685058, + "step": 62620 + }, + { + "epoch": 8.88999290276792, + "grad_norm": 8.29695987701416, + "learning_rate": 9.111383960255501e-05, + "loss": 0.04727603197097778, + "step": 62630 + }, + { + "epoch": 8.891412349183819, + "grad_norm": 3.3352630138397217, + "learning_rate": 9.11124201561391e-05, + "loss": 0.0870002806186676, + "step": 62640 + }, + { + "epoch": 8.892831795599717, + "grad_norm": 11.782904624938965, + "learning_rate": 9.111100070972322e-05, + "loss": 0.04238658845424652, + "step": 62650 + }, + { + "epoch": 8.894251242015613, + "grad_norm": 2.9484095573425293, + "learning_rate": 9.110958126330731e-05, + "loss": 0.029105091094970705, + "step": 62660 + }, + { + "epoch": 8.895670688431512, + "grad_norm": 0.40413135290145874, + "learning_rate": 9.110816181689141e-05, + "loss": 0.037084218859672544, + "step": 62670 + }, + { + "epoch": 8.89709013484741, + "grad_norm": 0.24385766685009003, + "learning_rate": 9.110674237047552e-05, + "loss": 0.01750268042087555, + "step": 62680 + }, + { + "epoch": 8.898509581263307, + "grad_norm": 0.06714774668216705, + "learning_rate": 9.110532292405962e-05, + "loss": 0.03886613845825195, + "step": 62690 + }, + { + "epoch": 8.899929027679205, + "grad_norm": 0.9303773045539856, + "learning_rate": 9.110390347764373e-05, + "loss": 0.045955890417099, + "step": 62700 + }, + { + "epoch": 8.901348474095103, + "grad_norm": 2.0016958713531494, + "learning_rate": 9.110248403122783e-05, + "loss": 0.04948480129241943, + "step": 62710 + }, + { + "epoch": 8.902767920511002, + "grad_norm": 6.1627655029296875, + "learning_rate": 9.110106458481193e-05, + "loss": 0.03391303420066834, + "step": 62720 + }, + { + "epoch": 8.904187366926898, + "grad_norm": 0.391539990901947, + "learning_rate": 9.109964513839602e-05, + 
"loss": 0.04015987515449524, + "step": 62730 + }, + { + "epoch": 8.905606813342796, + "grad_norm": 2.768480062484741, + "learning_rate": 9.109822569198013e-05, + "loss": 0.0364282488822937, + "step": 62740 + }, + { + "epoch": 8.907026259758695, + "grad_norm": 4.698720932006836, + "learning_rate": 9.109680624556423e-05, + "loss": 0.019236212968826293, + "step": 62750 + }, + { + "epoch": 8.908445706174591, + "grad_norm": 8.612178802490234, + "learning_rate": 9.109538679914834e-05, + "loss": 0.03959521353244781, + "step": 62760 + }, + { + "epoch": 8.90986515259049, + "grad_norm": 7.235753536224365, + "learning_rate": 9.109396735273244e-05, + "loss": 0.025758838653564452, + "step": 62770 + }, + { + "epoch": 8.911284599006388, + "grad_norm": 1.243888258934021, + "learning_rate": 9.109254790631654e-05, + "loss": 0.06227513551712036, + "step": 62780 + }, + { + "epoch": 8.912704045422286, + "grad_norm": 0.15928207337856293, + "learning_rate": 9.109112845990065e-05, + "loss": 0.047986623644828794, + "step": 62790 + }, + { + "epoch": 8.914123491838183, + "grad_norm": 0.026005161926150322, + "learning_rate": 9.108970901348475e-05, + "loss": 0.061690449714660645, + "step": 62800 + }, + { + "epoch": 8.915542938254081, + "grad_norm": 1.2915374040603638, + "learning_rate": 9.108828956706886e-05, + "loss": 0.0475294291973114, + "step": 62810 + }, + { + "epoch": 8.91696238466998, + "grad_norm": 3.0353357791900635, + "learning_rate": 9.108687012065294e-05, + "loss": 0.039467260241508484, + "step": 62820 + }, + { + "epoch": 8.918381831085876, + "grad_norm": 3.1496834754943848, + "learning_rate": 9.108545067423705e-05, + "loss": 0.036444342136383055, + "step": 62830 + }, + { + "epoch": 8.919801277501774, + "grad_norm": 5.958468914031982, + "learning_rate": 9.108403122782115e-05, + "loss": 0.02855021059513092, + "step": 62840 + }, + { + "epoch": 8.921220723917672, + "grad_norm": 1.679870843887329, + "learning_rate": 9.108261178140526e-05, + "loss": 0.03625530004501343, + "step": 62850 
+ }, + { + "epoch": 8.92264017033357, + "grad_norm": 1.037192463874817, + "learning_rate": 9.108119233498936e-05, + "loss": 0.029511517286300658, + "step": 62860 + }, + { + "epoch": 8.924059616749467, + "grad_norm": 4.464804649353027, + "learning_rate": 9.107977288857345e-05, + "loss": 0.0660893201828003, + "step": 62870 + }, + { + "epoch": 8.925479063165366, + "grad_norm": 2.770401954650879, + "learning_rate": 9.107849538679915e-05, + "loss": 0.06292575597763062, + "step": 62880 + }, + { + "epoch": 8.926898509581264, + "grad_norm": 6.6392011642456055, + "learning_rate": 9.107707594038325e-05, + "loss": 0.05748374462127685, + "step": 62890 + }, + { + "epoch": 8.92831795599716, + "grad_norm": 5.904137134552002, + "learning_rate": 9.107565649396736e-05, + "loss": 0.011138977110385894, + "step": 62900 + }, + { + "epoch": 8.929737402413059, + "grad_norm": 3.686323404312134, + "learning_rate": 9.107423704755146e-05, + "loss": 0.02662035822868347, + "step": 62910 + }, + { + "epoch": 8.931156848828957, + "grad_norm": 4.201900482177734, + "learning_rate": 9.107281760113557e-05, + "loss": 0.07875522971153259, + "step": 62920 + }, + { + "epoch": 8.932576295244855, + "grad_norm": 0.29425719380378723, + "learning_rate": 9.107139815471967e-05, + "loss": 0.02993255853652954, + "step": 62930 + }, + { + "epoch": 8.933995741660752, + "grad_norm": 2.1735360622406006, + "learning_rate": 9.106997870830378e-05, + "loss": 0.03278636932373047, + "step": 62940 + }, + { + "epoch": 8.93541518807665, + "grad_norm": 1.2639693021774292, + "learning_rate": 9.106855926188786e-05, + "loss": 0.13020997047424315, + "step": 62950 + }, + { + "epoch": 8.936834634492548, + "grad_norm": 3.955355167388916, + "learning_rate": 9.106713981547197e-05, + "loss": 0.030237650871276854, + "step": 62960 + }, + { + "epoch": 8.938254080908445, + "grad_norm": 1.5435599088668823, + "learning_rate": 9.106572036905607e-05, + "loss": 0.031678777933120725, + "step": 62970 + }, + { + "epoch": 8.939673527324343, + 
"grad_norm": 1.9179738759994507, + "learning_rate": 9.106430092264018e-05, + "loss": 0.07520507574081421, + "step": 62980 + }, + { + "epoch": 8.941092973740242, + "grad_norm": 3.3584091663360596, + "learning_rate": 9.106288147622428e-05, + "loss": 0.043991255760192874, + "step": 62990 + }, + { + "epoch": 8.94251242015614, + "grad_norm": 0.2111026793718338, + "learning_rate": 9.106146202980838e-05, + "loss": 0.016981948912143708, + "step": 63000 + }, + { + "epoch": 8.94251242015614, + "eval_accuracy": 0.9835950912443568, + "eval_loss": 0.053931817412376404, + "eval_runtime": 33.1446, + "eval_samples_per_second": 474.497, + "eval_steps_per_second": 14.844, + "step": 63000 + }, + { + "epoch": 8.943931866572036, + "grad_norm": 2.602844715118408, + "learning_rate": 9.106004258339249e-05, + "loss": 0.026281210780143737, + "step": 63010 + }, + { + "epoch": 8.945351312987935, + "grad_norm": 4.518967628479004, + "learning_rate": 9.105862313697658e-05, + "loss": 0.021604365110397337, + "step": 63020 + }, + { + "epoch": 8.946770759403833, + "grad_norm": 2.052237033843994, + "learning_rate": 9.10572036905607e-05, + "loss": 0.05609427690505982, + "step": 63030 + }, + { + "epoch": 8.94819020581973, + "grad_norm": 16.218595504760742, + "learning_rate": 9.105578424414479e-05, + "loss": 0.06002562642097473, + "step": 63040 + }, + { + "epoch": 8.949609652235628, + "grad_norm": 15.071115493774414, + "learning_rate": 9.105436479772889e-05, + "loss": 0.05712893605232239, + "step": 63050 + }, + { + "epoch": 8.951029098651526, + "grad_norm": 0.1248808354139328, + "learning_rate": 9.105294535131299e-05, + "loss": 0.04929445683956146, + "step": 63060 + }, + { + "epoch": 8.952448545067424, + "grad_norm": 9.456350326538086, + "learning_rate": 9.10515259048971e-05, + "loss": 0.020241251587867735, + "step": 63070 + }, + { + "epoch": 8.953867991483321, + "grad_norm": 5.2534403800964355, + "learning_rate": 9.10501064584812e-05, + "loss": 0.1037605881690979, + "step": 63080 + }, + { + "epoch": 
8.95528743789922, + "grad_norm": 0.07144279032945633, + "learning_rate": 9.10486870120653e-05, + "loss": 0.010947969555854798, + "step": 63090 + }, + { + "epoch": 8.956706884315118, + "grad_norm": 0.0726260244846344, + "learning_rate": 9.10472675656494e-05, + "loss": 0.02054280638694763, + "step": 63100 + }, + { + "epoch": 8.958126330731014, + "grad_norm": 3.0107414722442627, + "learning_rate": 9.10458481192335e-05, + "loss": 0.0355594128370285, + "step": 63110 + }, + { + "epoch": 8.959545777146912, + "grad_norm": 0.8008242845535278, + "learning_rate": 9.104442867281761e-05, + "loss": 0.020664720237255095, + "step": 63120 + }, + { + "epoch": 8.96096522356281, + "grad_norm": 0.20576325058937073, + "learning_rate": 9.104300922640171e-05, + "loss": 0.03208284378051758, + "step": 63130 + }, + { + "epoch": 8.962384669978709, + "grad_norm": 0.6825739145278931, + "learning_rate": 9.104158977998582e-05, + "loss": 0.014281252026557922, + "step": 63140 + }, + { + "epoch": 8.963804116394606, + "grad_norm": 3.512455463409424, + "learning_rate": 9.10401703335699e-05, + "loss": 0.005695473775267601, + "step": 63150 + }, + { + "epoch": 8.965223562810504, + "grad_norm": 0.5392917990684509, + "learning_rate": 9.103875088715401e-05, + "loss": 0.03396806418895722, + "step": 63160 + }, + { + "epoch": 8.966643009226402, + "grad_norm": 0.07298357039690018, + "learning_rate": 9.103733144073811e-05, + "loss": 0.011821673810482025, + "step": 63170 + }, + { + "epoch": 8.968062455642299, + "grad_norm": 0.03901802748441696, + "learning_rate": 9.103591199432222e-05, + "loss": 0.040955165028572084, + "step": 63180 + }, + { + "epoch": 8.969481902058197, + "grad_norm": 2.884178400039673, + "learning_rate": 9.103449254790632e-05, + "loss": 0.059520548582077025, + "step": 63190 + }, + { + "epoch": 8.970901348474095, + "grad_norm": 6.718408584594727, + "learning_rate": 9.103307310149042e-05, + "loss": 0.09160915017127991, + "step": 63200 + }, + { + "epoch": 8.972320794889994, + "grad_norm": 
0.259737491607666, + "learning_rate": 9.103165365507453e-05, + "loss": 0.018729987740516662, + "step": 63210 + }, + { + "epoch": 8.97374024130589, + "grad_norm": 0.10244779288768768, + "learning_rate": 9.103023420865863e-05, + "loss": 0.060778087377548216, + "step": 63220 + }, + { + "epoch": 8.975159687721789, + "grad_norm": 2.154569625854492, + "learning_rate": 9.102881476224274e-05, + "loss": 0.03838706910610199, + "step": 63230 + }, + { + "epoch": 8.976579134137687, + "grad_norm": 10.858172416687012, + "learning_rate": 9.102739531582683e-05, + "loss": 0.05365590453147888, + "step": 63240 + }, + { + "epoch": 8.977998580553583, + "grad_norm": 0.09644179791212082, + "learning_rate": 9.102597586941093e-05, + "loss": 0.030293729901313782, + "step": 63250 + }, + { + "epoch": 8.979418026969482, + "grad_norm": 0.23783816397190094, + "learning_rate": 9.102455642299503e-05, + "loss": 0.04220397770404816, + "step": 63260 + }, + { + "epoch": 8.98083747338538, + "grad_norm": 7.483023166656494, + "learning_rate": 9.102313697657914e-05, + "loss": 0.04700967967510224, + "step": 63270 + }, + { + "epoch": 8.982256919801278, + "grad_norm": 0.4879629611968994, + "learning_rate": 9.102171753016324e-05, + "loss": 0.04679511487483978, + "step": 63280 + }, + { + "epoch": 8.983676366217175, + "grad_norm": 1.037855625152588, + "learning_rate": 9.102029808374735e-05, + "loss": 0.05557551383972168, + "step": 63290 + }, + { + "epoch": 8.985095812633073, + "grad_norm": 0.9630271792411804, + "learning_rate": 9.101887863733145e-05, + "loss": 0.045986443758010864, + "step": 63300 + }, + { + "epoch": 8.986515259048971, + "grad_norm": 4.669297218322754, + "learning_rate": 9.101745919091554e-05, + "loss": 0.028100493550300597, + "step": 63310 + }, + { + "epoch": 8.987934705464868, + "grad_norm": 0.6383737325668335, + "learning_rate": 9.101603974449965e-05, + "loss": 0.029157137870788573, + "step": 63320 + }, + { + "epoch": 8.989354151880766, + "grad_norm": 7.461113929748535, + "learning_rate": 
9.101462029808375e-05, + "loss": 0.03222631812095642, + "step": 63330 + }, + { + "epoch": 8.990773598296665, + "grad_norm": 2.4280130863189697, + "learning_rate": 9.101320085166786e-05, + "loss": 0.019230978190898897, + "step": 63340 + }, + { + "epoch": 8.992193044712563, + "grad_norm": 1.3081002235412598, + "learning_rate": 9.101178140525196e-05, + "loss": 0.0344824880361557, + "step": 63350 + }, + { + "epoch": 8.99361249112846, + "grad_norm": 0.041129060089588165, + "learning_rate": 9.101036195883606e-05, + "loss": 0.05528616905212402, + "step": 63360 + }, + { + "epoch": 8.995031937544358, + "grad_norm": 8.482027053833008, + "learning_rate": 9.100894251242015e-05, + "loss": 0.02113038897514343, + "step": 63370 + }, + { + "epoch": 8.996451383960256, + "grad_norm": 1.0515233278274536, + "learning_rate": 9.100752306600427e-05, + "loss": 0.08732074499130249, + "step": 63380 + }, + { + "epoch": 8.997870830376153, + "grad_norm": 11.335579872131348, + "learning_rate": 9.100610361958836e-05, + "loss": 0.05290312767028808, + "step": 63390 + }, + { + "epoch": 8.99929027679205, + "grad_norm": 3.011958360671997, + "learning_rate": 9.100468417317247e-05, + "loss": 0.028017181158065795, + "step": 63400 + }, + { + "epoch": 9.00070972320795, + "grad_norm": 2.5004894733428955, + "learning_rate": 9.100326472675657e-05, + "loss": 0.007114443182945252, + "step": 63410 + }, + { + "epoch": 9.002129169623847, + "grad_norm": 0.9608315229415894, + "learning_rate": 9.100184528034067e-05, + "loss": 0.03280950784683227, + "step": 63420 + }, + { + "epoch": 9.003548616039744, + "grad_norm": 0.11002276837825775, + "learning_rate": 9.100042583392478e-05, + "loss": 0.052053457498550414, + "step": 63430 + }, + { + "epoch": 9.004968062455642, + "grad_norm": 3.397446870803833, + "learning_rate": 9.099900638750888e-05, + "loss": 0.028442218899726868, + "step": 63440 + }, + { + "epoch": 9.00638750887154, + "grad_norm": 6.959239959716797, + "learning_rate": 9.099758694109299e-05, + "loss": 
0.02767481803894043, + "step": 63450 + }, + { + "epoch": 9.007806955287437, + "grad_norm": 1.9296232461929321, + "learning_rate": 9.099616749467707e-05, + "loss": 0.01916002035140991, + "step": 63460 + }, + { + "epoch": 9.009226401703335, + "grad_norm": 0.399951696395874, + "learning_rate": 9.099474804826118e-05, + "loss": 0.02451731264591217, + "step": 63470 + }, + { + "epoch": 9.010645848119234, + "grad_norm": 4.6797308921813965, + "learning_rate": 9.099332860184528e-05, + "loss": 0.011301268637180329, + "step": 63480 + }, + { + "epoch": 9.012065294535132, + "grad_norm": 10.480035781860352, + "learning_rate": 9.099190915542939e-05, + "loss": 0.019284191727638244, + "step": 63490 + }, + { + "epoch": 9.013484740951029, + "grad_norm": 1.5630741119384766, + "learning_rate": 9.099048970901349e-05, + "loss": 0.03886908292770386, + "step": 63500 + }, + { + "epoch": 9.013484740951029, + "eval_accuracy": 0.9846124499268774, + "eval_loss": 0.047866348177194595, + "eval_runtime": 32.9807, + "eval_samples_per_second": 476.855, + "eval_steps_per_second": 14.918, + "step": 63500 + }, + { + "epoch": 9.014904187366927, + "grad_norm": 2.2032713890075684, + "learning_rate": 9.098907026259759e-05, + "loss": 0.024095374345779418, + "step": 63510 + }, + { + "epoch": 9.016323633782825, + "grad_norm": 0.2933710515499115, + "learning_rate": 9.09876508161817e-05, + "loss": 0.03843706250190735, + "step": 63520 + }, + { + "epoch": 9.017743080198722, + "grad_norm": 8.106107711791992, + "learning_rate": 9.09862313697658e-05, + "loss": 0.03381499648094177, + "step": 63530 + }, + { + "epoch": 9.01916252661462, + "grad_norm": 4.045937538146973, + "learning_rate": 9.09848119233499e-05, + "loss": 0.009852905571460725, + "step": 63540 + }, + { + "epoch": 9.020581973030518, + "grad_norm": 9.243306159973145, + "learning_rate": 9.0983392476934e-05, + "loss": 0.06235023140907288, + "step": 63550 + }, + { + "epoch": 9.022001419446417, + "grad_norm": 10.42448902130127, + "learning_rate": 
9.09819730305181e-05, + "loss": 0.030120083689689638, + "step": 63560 + }, + { + "epoch": 9.023420865862313, + "grad_norm": 0.11517821252346039, + "learning_rate": 9.09805535841022e-05, + "loss": 0.022688122093677522, + "step": 63570 + }, + { + "epoch": 9.024840312278211, + "grad_norm": 11.302186012268066, + "learning_rate": 9.097913413768631e-05, + "loss": 0.0606619656085968, + "step": 63580 + }, + { + "epoch": 9.02625975869411, + "grad_norm": 5.819149971008301, + "learning_rate": 9.09777146912704e-05, + "loss": 0.020539863407611846, + "step": 63590 + }, + { + "epoch": 9.027679205110006, + "grad_norm": 7.555016994476318, + "learning_rate": 9.097629524485452e-05, + "loss": 0.05828157663345337, + "step": 63600 + }, + { + "epoch": 9.029098651525905, + "grad_norm": 4.1367411613464355, + "learning_rate": 9.097487579843861e-05, + "loss": 0.06016632318496704, + "step": 63610 + }, + { + "epoch": 9.030518097941803, + "grad_norm": 1.3763189315795898, + "learning_rate": 9.097345635202271e-05, + "loss": 0.03897145688533783, + "step": 63620 + }, + { + "epoch": 9.031937544357701, + "grad_norm": 1.6536431312561035, + "learning_rate": 9.097203690560682e-05, + "loss": 0.018550589680671692, + "step": 63630 + }, + { + "epoch": 9.033356990773598, + "grad_norm": 0.18937966227531433, + "learning_rate": 9.097061745919092e-05, + "loss": 0.048836135864257814, + "step": 63640 + }, + { + "epoch": 9.034776437189496, + "grad_norm": 0.9972341060638428, + "learning_rate": 9.096919801277503e-05, + "loss": 0.018428274989128114, + "step": 63650 + }, + { + "epoch": 9.036195883605394, + "grad_norm": 0.13892175257205963, + "learning_rate": 9.096777856635913e-05, + "loss": 0.020253479480743408, + "step": 63660 + }, + { + "epoch": 9.037615330021291, + "grad_norm": 0.30342087149620056, + "learning_rate": 9.096635911994322e-05, + "loss": 0.029193535447120667, + "step": 63670 + }, + { + "epoch": 9.03903477643719, + "grad_norm": 0.5949711203575134, + "learning_rate": 9.096493967352732e-05, + "loss": 
0.041388329863548276, + "step": 63680 + }, + { + "epoch": 9.040454222853088, + "grad_norm": 6.298411846160889, + "learning_rate": 9.096352022711143e-05, + "loss": 0.05257458686828613, + "step": 63690 + }, + { + "epoch": 9.041873669268986, + "grad_norm": 3.842451572418213, + "learning_rate": 9.096210078069553e-05, + "loss": 0.04259455502033234, + "step": 63700 + }, + { + "epoch": 9.043293115684882, + "grad_norm": 6.479927062988281, + "learning_rate": 9.096068133427964e-05, + "loss": 0.053349781036376956, + "step": 63710 + }, + { + "epoch": 9.04471256210078, + "grad_norm": 8.164941787719727, + "learning_rate": 9.095926188786374e-05, + "loss": 0.03254518806934357, + "step": 63720 + }, + { + "epoch": 9.046132008516679, + "grad_norm": 0.7975434064865112, + "learning_rate": 9.095784244144784e-05, + "loss": 0.013299444317817688, + "step": 63730 + }, + { + "epoch": 9.047551454932576, + "grad_norm": 1.7391208410263062, + "learning_rate": 9.095642299503195e-05, + "loss": 0.007834933698177338, + "step": 63740 + }, + { + "epoch": 9.048970901348474, + "grad_norm": 0.11377626657485962, + "learning_rate": 9.095500354861604e-05, + "loss": 0.09526675939559937, + "step": 63750 + }, + { + "epoch": 9.050390347764372, + "grad_norm": 8.344837188720703, + "learning_rate": 9.095358410220016e-05, + "loss": 0.031209063529968262, + "step": 63760 + }, + { + "epoch": 9.05180979418027, + "grad_norm": 2.4288384914398193, + "learning_rate": 9.095216465578424e-05, + "loss": 0.041136741638183594, + "step": 63770 + }, + { + "epoch": 9.053229240596167, + "grad_norm": 0.02463367208838463, + "learning_rate": 9.095074520936835e-05, + "loss": 0.018258750438690186, + "step": 63780 + }, + { + "epoch": 9.054648687012065, + "grad_norm": 0.7001953125, + "learning_rate": 9.094932576295245e-05, + "loss": 0.033777013421058655, + "step": 63790 + }, + { + "epoch": 9.056068133427964, + "grad_norm": 0.08243271708488464, + "learning_rate": 9.094790631653656e-05, + "loss": 0.05407797694206238, + "step": 63800 + }, + { 
+ "epoch": 9.05748757984386, + "grad_norm": 6.220699310302734, + "learning_rate": 9.094648687012066e-05, + "loss": 0.019056543707847595, + "step": 63810 + }, + { + "epoch": 9.058907026259758, + "grad_norm": 1.73167085647583, + "learning_rate": 9.094506742370475e-05, + "loss": 0.024083656072616578, + "step": 63820 + }, + { + "epoch": 9.060326472675657, + "grad_norm": 0.7688294649124146, + "learning_rate": 9.094364797728886e-05, + "loss": 0.015182797610759736, + "step": 63830 + }, + { + "epoch": 9.061745919091555, + "grad_norm": 4.387524127960205, + "learning_rate": 9.094222853087296e-05, + "loss": 0.0528663158416748, + "step": 63840 + }, + { + "epoch": 9.063165365507452, + "grad_norm": 0.5699704885482788, + "learning_rate": 9.094080908445707e-05, + "loss": 0.010815832018852233, + "step": 63850 + }, + { + "epoch": 9.06458481192335, + "grad_norm": 0.7277228236198425, + "learning_rate": 9.093938963804117e-05, + "loss": 0.004810039326548577, + "step": 63860 + }, + { + "epoch": 9.066004258339248, + "grad_norm": 0.21413087844848633, + "learning_rate": 9.093797019162527e-05, + "loss": 0.018760937452316283, + "step": 63870 + }, + { + "epoch": 9.067423704755145, + "grad_norm": 0.2676238417625427, + "learning_rate": 9.093655074520936e-05, + "loss": 0.013498370349407197, + "step": 63880 + }, + { + "epoch": 9.068843151171043, + "grad_norm": 0.12514221668243408, + "learning_rate": 9.093513129879348e-05, + "loss": 0.009908372163772583, + "step": 63890 + }, + { + "epoch": 9.070262597586941, + "grad_norm": 2.2376949787139893, + "learning_rate": 9.093371185237757e-05, + "loss": 0.01928650438785553, + "step": 63900 + }, + { + "epoch": 9.07168204400284, + "grad_norm": 1.1722604036331177, + "learning_rate": 9.093229240596168e-05, + "loss": 0.06219164133071899, + "step": 63910 + }, + { + "epoch": 9.073101490418736, + "grad_norm": 0.09166330099105835, + "learning_rate": 9.093087295954578e-05, + "loss": 0.024753783643245698, + "step": 63920 + }, + { + "epoch": 9.074520936834634, + 
"grad_norm": 0.16517707705497742, + "learning_rate": 9.092945351312988e-05, + "loss": 0.014034570753574371, + "step": 63930 + }, + { + "epoch": 9.075940383250533, + "grad_norm": 7.7434773445129395, + "learning_rate": 9.092803406671399e-05, + "loss": 0.06656568646430969, + "step": 63940 + }, + { + "epoch": 9.07735982966643, + "grad_norm": 0.7195178270339966, + "learning_rate": 9.092661462029809e-05, + "loss": 0.023263543844223022, + "step": 63950 + }, + { + "epoch": 9.078779276082328, + "grad_norm": 0.576549768447876, + "learning_rate": 9.09251951738822e-05, + "loss": 0.03049999475479126, + "step": 63960 + }, + { + "epoch": 9.080198722498226, + "grad_norm": 2.331205368041992, + "learning_rate": 9.092377572746628e-05, + "loss": 0.03987345695495605, + "step": 63970 + }, + { + "epoch": 9.081618168914124, + "grad_norm": 2.247481346130371, + "learning_rate": 9.092235628105039e-05, + "loss": 0.034203958511352536, + "step": 63980 + }, + { + "epoch": 9.08303761533002, + "grad_norm": 3.3796920776367188, + "learning_rate": 9.092093683463449e-05, + "loss": 0.028829208016395567, + "step": 63990 + }, + { + "epoch": 9.084457061745919, + "grad_norm": 0.5988370776176453, + "learning_rate": 9.09195173882186e-05, + "loss": 0.005476556345820427, + "step": 64000 + }, + { + "epoch": 9.084457061745919, + "eval_accuracy": 0.9644560310294398, + "eval_loss": 0.1253519356250763, + "eval_runtime": 33.0057, + "eval_samples_per_second": 476.494, + "eval_steps_per_second": 14.907, + "step": 64000 + }, + { + "epoch": 9.085876508161817, + "grad_norm": 7.576418399810791, + "learning_rate": 9.09180979418027e-05, + "loss": 0.06737409830093384, + "step": 64010 + }, + { + "epoch": 9.087295954577714, + "grad_norm": 1.5139234066009521, + "learning_rate": 9.091667849538681e-05, + "loss": 0.06679987907409668, + "step": 64020 + }, + { + "epoch": 9.088715400993612, + "grad_norm": 0.34518659114837646, + "learning_rate": 9.09152590489709e-05, + "loss": 0.04123524129390717, + "step": 64030 + }, + { + "epoch": 
9.09013484740951, + "grad_norm": 5.3888068199157715, + "learning_rate": 9.0913839602555e-05, + "loss": 0.05024193525314331, + "step": 64040 + }, + { + "epoch": 9.091554293825409, + "grad_norm": 1.4487290382385254, + "learning_rate": 9.091242015613911e-05, + "loss": 0.06248751878738403, + "step": 64050 + }, + { + "epoch": 9.092973740241305, + "grad_norm": 1.193361520767212, + "learning_rate": 9.091100070972321e-05, + "loss": 0.0333157479763031, + "step": 64060 + }, + { + "epoch": 9.094393186657204, + "grad_norm": 0.790325939655304, + "learning_rate": 9.090958126330732e-05, + "loss": 0.06225918531417847, + "step": 64070 + }, + { + "epoch": 9.095812633073102, + "grad_norm": 5.633969306945801, + "learning_rate": 9.09081618168914e-05, + "loss": 0.05814381837844849, + "step": 64080 + }, + { + "epoch": 9.097232079488998, + "grad_norm": 0.02727256342768669, + "learning_rate": 9.090674237047552e-05, + "loss": 0.033204466104507446, + "step": 64090 + }, + { + "epoch": 9.098651525904897, + "grad_norm": 4.997963905334473, + "learning_rate": 9.090532292405961e-05, + "loss": 0.08285855650901794, + "step": 64100 + }, + { + "epoch": 9.100070972320795, + "grad_norm": 2.4729809761047363, + "learning_rate": 9.090390347764373e-05, + "loss": 0.072034353017807, + "step": 64110 + }, + { + "epoch": 9.101490418736693, + "grad_norm": 0.2927655279636383, + "learning_rate": 9.090248403122784e-05, + "loss": 0.015156900882720948, + "step": 64120 + }, + { + "epoch": 9.10290986515259, + "grad_norm": 3.0484793186187744, + "learning_rate": 9.090106458481192e-05, + "loss": 0.032074537873268125, + "step": 64130 + }, + { + "epoch": 9.104329311568488, + "grad_norm": 0.18304117023944855, + "learning_rate": 9.089964513839603e-05, + "loss": 0.05773396492004394, + "step": 64140 + }, + { + "epoch": 9.105748757984387, + "grad_norm": 3.660144090652466, + "learning_rate": 9.089822569198013e-05, + "loss": 0.03408423960208893, + "step": 64150 + }, + { + "epoch": 9.107168204400283, + "grad_norm": 
0.703398585319519, + "learning_rate": 9.089680624556424e-05, + "loss": 0.01160280853509903, + "step": 64160 + }, + { + "epoch": 9.108587650816181, + "grad_norm": 0.30307838320732117, + "learning_rate": 9.089538679914834e-05, + "loss": 0.03360556662082672, + "step": 64170 + }, + { + "epoch": 9.11000709723208, + "grad_norm": 0.2776535451412201, + "learning_rate": 9.089396735273243e-05, + "loss": 0.09121599197387695, + "step": 64180 + }, + { + "epoch": 9.111426543647978, + "grad_norm": 0.8593045473098755, + "learning_rate": 9.089254790631653e-05, + "loss": 0.016895319521427154, + "step": 64190 + }, + { + "epoch": 9.112845990063875, + "grad_norm": 2.2342422008514404, + "learning_rate": 9.089112845990064e-05, + "loss": 0.015390795469284058, + "step": 64200 + }, + { + "epoch": 9.114265436479773, + "grad_norm": 7.935168743133545, + "learning_rate": 9.088970901348475e-05, + "loss": 0.06856619715690612, + "step": 64210 + }, + { + "epoch": 9.115684882895671, + "grad_norm": 0.09339141100645065, + "learning_rate": 9.088828956706885e-05, + "loss": 0.027166441082954407, + "step": 64220 + }, + { + "epoch": 9.117104329311568, + "grad_norm": 0.7055644989013672, + "learning_rate": 9.088687012065295e-05, + "loss": 0.02743214964866638, + "step": 64230 + }, + { + "epoch": 9.118523775727466, + "grad_norm": 8.74792766571045, + "learning_rate": 9.088545067423705e-05, + "loss": 0.058396434783935545, + "step": 64240 + }, + { + "epoch": 9.119943222143364, + "grad_norm": 0.2736349105834961, + "learning_rate": 9.088403122782116e-05, + "loss": 0.020447200536727904, + "step": 64250 + }, + { + "epoch": 9.121362668559263, + "grad_norm": 0.13781176507472992, + "learning_rate": 9.088261178140525e-05, + "loss": 0.016127771139144896, + "step": 64260 + }, + { + "epoch": 9.12278211497516, + "grad_norm": 0.014397944323718548, + "learning_rate": 9.088119233498937e-05, + "loss": 0.07276721000671386, + "step": 64270 + }, + { + "epoch": 9.124201561391057, + "grad_norm": 0.040416114032268524, + 
"learning_rate": 9.087977288857345e-05, + "loss": 0.02326855659484863, + "step": 64280 + }, + { + "epoch": 9.125621007806956, + "grad_norm": 9.205130577087402, + "learning_rate": 9.087835344215756e-05, + "loss": 0.05591330528259277, + "step": 64290 + }, + { + "epoch": 9.127040454222852, + "grad_norm": 0.20341289043426514, + "learning_rate": 9.087693399574167e-05, + "loss": 0.025824469327926636, + "step": 64300 + }, + { + "epoch": 9.12845990063875, + "grad_norm": 0.5872713923454285, + "learning_rate": 9.087551454932577e-05, + "loss": 0.04505482614040375, + "step": 64310 + }, + { + "epoch": 9.129879347054649, + "grad_norm": 4.44802188873291, + "learning_rate": 9.087409510290988e-05, + "loss": 0.013928559422492982, + "step": 64320 + }, + { + "epoch": 9.131298793470547, + "grad_norm": 4.500983238220215, + "learning_rate": 9.087267565649396e-05, + "loss": 0.040919405221939084, + "step": 64330 + }, + { + "epoch": 9.132718239886444, + "grad_norm": 1.2529829740524292, + "learning_rate": 9.087125621007807e-05, + "loss": 0.042445436120033264, + "step": 64340 + }, + { + "epoch": 9.134137686302342, + "grad_norm": 3.7027170658111572, + "learning_rate": 9.086983676366217e-05, + "loss": 0.04291227459907532, + "step": 64350 + }, + { + "epoch": 9.13555713271824, + "grad_norm": 14.58912181854248, + "learning_rate": 9.086841731724628e-05, + "loss": 0.11172311305999756, + "step": 64360 + }, + { + "epoch": 9.136976579134137, + "grad_norm": 0.06370960175991058, + "learning_rate": 9.086699787083038e-05, + "loss": 0.005412508919835091, + "step": 64370 + }, + { + "epoch": 9.138396025550035, + "grad_norm": 0.059994593262672424, + "learning_rate": 9.086557842441449e-05, + "loss": 0.050759947299957274, + "step": 64380 + }, + { + "epoch": 9.139815471965933, + "grad_norm": 0.875908374786377, + "learning_rate": 9.086415897799859e-05, + "loss": 0.018525166809558867, + "step": 64390 + }, + { + "epoch": 9.141234918381832, + "grad_norm": 4.418863296508789, + "learning_rate": 9.086273953158269e-05, + 
"loss": 0.05396139621734619, + "step": 64400 + }, + { + "epoch": 9.142654364797728, + "grad_norm": 10.711448669433594, + "learning_rate": 9.08613200851668e-05, + "loss": 0.02829013466835022, + "step": 64410 + }, + { + "epoch": 9.144073811213627, + "grad_norm": 0.09981971979141235, + "learning_rate": 9.08599006387509e-05, + "loss": 0.027979806065559387, + "step": 64420 + }, + { + "epoch": 9.145493257629525, + "grad_norm": 9.851990699768066, + "learning_rate": 9.0858481192335e-05, + "loss": 0.031139957904815673, + "step": 64430 + }, + { + "epoch": 9.146912704045421, + "grad_norm": 0.539822518825531, + "learning_rate": 9.085706174591909e-05, + "loss": 0.02237287014722824, + "step": 64440 + }, + { + "epoch": 9.14833215046132, + "grad_norm": 4.454430103302002, + "learning_rate": 9.08556422995032e-05, + "loss": 0.043205234408378604, + "step": 64450 + }, + { + "epoch": 9.149751596877218, + "grad_norm": 0.11565633863210678, + "learning_rate": 9.08542228530873e-05, + "loss": 0.02587103843688965, + "step": 64460 + }, + { + "epoch": 9.151171043293116, + "grad_norm": 0.7783809900283813, + "learning_rate": 9.085280340667141e-05, + "loss": 0.039271104335784915, + "step": 64470 + }, + { + "epoch": 9.152590489709013, + "grad_norm": 0.5557367205619812, + "learning_rate": 9.08513839602555e-05, + "loss": 0.04649159014225006, + "step": 64480 + }, + { + "epoch": 9.154009936124911, + "grad_norm": 2.1434662342071533, + "learning_rate": 9.08499645138396e-05, + "loss": 0.049646627902984616, + "step": 64490 + }, + { + "epoch": 9.15542938254081, + "grad_norm": 7.786257743835449, + "learning_rate": 9.084854506742371e-05, + "loss": 0.037093961238861085, + "step": 64500 + }, + { + "epoch": 9.15542938254081, + "eval_accuracy": 0.9621033890761111, + "eval_loss": 0.14138783514499664, + "eval_runtime": 32.1947, + "eval_samples_per_second": 488.497, + "eval_steps_per_second": 15.282, + "step": 64500 + }, + { + "epoch": 9.156848828956706, + "grad_norm": 8.986673355102539, + "learning_rate": 
9.084712562100781e-05, + "loss": 0.07170224785804749, + "step": 64510 + }, + { + "epoch": 9.158268275372604, + "grad_norm": 5.408350467681885, + "learning_rate": 9.084570617459192e-05, + "loss": 0.030388069152832032, + "step": 64520 + }, + { + "epoch": 9.159687721788503, + "grad_norm": 2.8842499256134033, + "learning_rate": 9.084428672817602e-05, + "loss": 0.015032586455345155, + "step": 64530 + }, + { + "epoch": 9.161107168204401, + "grad_norm": 2.7716171741485596, + "learning_rate": 9.084286728176012e-05, + "loss": 0.017233891785144805, + "step": 64540 + }, + { + "epoch": 9.162526614620297, + "grad_norm": 14.251026153564453, + "learning_rate": 9.084144783534421e-05, + "loss": 0.0279281884431839, + "step": 64550 + }, + { + "epoch": 9.163946061036196, + "grad_norm": 0.09909752011299133, + "learning_rate": 9.084002838892832e-05, + "loss": 0.01321396678686142, + "step": 64560 + }, + { + "epoch": 9.165365507452094, + "grad_norm": 0.048804301768541336, + "learning_rate": 9.083860894251242e-05, + "loss": 0.030893230438232423, + "step": 64570 + }, + { + "epoch": 9.16678495386799, + "grad_norm": 7.728274345397949, + "learning_rate": 9.083718949609653e-05, + "loss": 0.0831569790840149, + "step": 64580 + }, + { + "epoch": 9.168204400283889, + "grad_norm": 4.418582439422607, + "learning_rate": 9.083577004968063e-05, + "loss": 0.04628153443336487, + "step": 64590 + }, + { + "epoch": 9.169623846699787, + "grad_norm": 0.2589815855026245, + "learning_rate": 9.083435060326473e-05, + "loss": 0.014576169848442077, + "step": 64600 + }, + { + "epoch": 9.171043293115686, + "grad_norm": 0.2530888020992279, + "learning_rate": 9.083293115684884e-05, + "loss": 0.03431870639324188, + "step": 64610 + }, + { + "epoch": 9.172462739531582, + "grad_norm": 4.140875816345215, + "learning_rate": 9.083151171043294e-05, + "loss": 0.01830962300300598, + "step": 64620 + }, + { + "epoch": 9.17388218594748, + "grad_norm": 0.6446725130081177, + "learning_rate": 9.083009226401705e-05, + "loss": 
0.024257193505764007, + "step": 64630 + }, + { + "epoch": 9.175301632363379, + "grad_norm": 0.051858942955732346, + "learning_rate": 9.082867281760113e-05, + "loss": 0.021634511649608612, + "step": 64640 + }, + { + "epoch": 9.176721078779275, + "grad_norm": 0.036192528903484344, + "learning_rate": 9.082725337118524e-05, + "loss": 0.010384272038936614, + "step": 64650 + }, + { + "epoch": 9.178140525195174, + "grad_norm": 2.0581307411193848, + "learning_rate": 9.082583392476934e-05, + "loss": 0.03748018741607666, + "step": 64660 + }, + { + "epoch": 9.179559971611072, + "grad_norm": 0.1270061433315277, + "learning_rate": 9.082441447835345e-05, + "loss": 0.05351813435554505, + "step": 64670 + }, + { + "epoch": 9.18097941802697, + "grad_norm": 8.708552360534668, + "learning_rate": 9.082299503193755e-05, + "loss": 0.036863112449646, + "step": 64680 + }, + { + "epoch": 9.182398864442867, + "grad_norm": 2.0032787322998047, + "learning_rate": 9.082157558552164e-05, + "loss": 0.008637142181396485, + "step": 64690 + }, + { + "epoch": 9.183818310858765, + "grad_norm": 1.0035320520401, + "learning_rate": 9.082015613910576e-05, + "loss": 0.04162544012069702, + "step": 64700 + }, + { + "epoch": 9.185237757274663, + "grad_norm": 6.927931308746338, + "learning_rate": 9.081873669268985e-05, + "loss": 0.05242146253585815, + "step": 64710 + }, + { + "epoch": 9.18665720369056, + "grad_norm": 0.6717086434364319, + "learning_rate": 9.081731724627396e-05, + "loss": 0.023917996883392335, + "step": 64720 + }, + { + "epoch": 9.188076650106458, + "grad_norm": 0.19253717362880707, + "learning_rate": 9.081589779985806e-05, + "loss": 0.020571285486221315, + "step": 64730 + }, + { + "epoch": 9.189496096522356, + "grad_norm": 0.1123395785689354, + "learning_rate": 9.081447835344217e-05, + "loss": 0.044478365778923036, + "step": 64740 + }, + { + "epoch": 9.190915542938255, + "grad_norm": 2.836575984954834, + "learning_rate": 9.081305890702626e-05, + "loss": 0.027955496311187746, + "step": 64750 + 
}, + { + "epoch": 9.192334989354151, + "grad_norm": 2.197659492492676, + "learning_rate": 9.081163946061037e-05, + "loss": 0.04038041830062866, + "step": 64760 + }, + { + "epoch": 9.19375443577005, + "grad_norm": 13.875947952270508, + "learning_rate": 9.081022001419446e-05, + "loss": 0.03770278990268707, + "step": 64770 + }, + { + "epoch": 9.195173882185948, + "grad_norm": 4.144615173339844, + "learning_rate": 9.080880056777858e-05, + "loss": 0.04378984868526459, + "step": 64780 + }, + { + "epoch": 9.196593328601844, + "grad_norm": 8.495119094848633, + "learning_rate": 9.080738112136267e-05, + "loss": 0.042510056495666505, + "step": 64790 + }, + { + "epoch": 9.198012775017743, + "grad_norm": 3.9217870235443115, + "learning_rate": 9.080596167494677e-05, + "loss": 0.05529659986495972, + "step": 64800 + }, + { + "epoch": 9.199432221433641, + "grad_norm": 7.344886302947998, + "learning_rate": 9.080454222853088e-05, + "loss": 0.05696294903755188, + "step": 64810 + }, + { + "epoch": 9.20085166784954, + "grad_norm": 4.72236442565918, + "learning_rate": 9.080312278211498e-05, + "loss": 0.04589243531227112, + "step": 64820 + }, + { + "epoch": 9.202271114265436, + "grad_norm": 12.580382347106934, + "learning_rate": 9.080170333569909e-05, + "loss": 0.06638288497924805, + "step": 64830 + }, + { + "epoch": 9.203690560681334, + "grad_norm": 1.4525302648544312, + "learning_rate": 9.080028388928319e-05, + "loss": 0.06872016787528992, + "step": 64840 + }, + { + "epoch": 9.205110007097232, + "grad_norm": 0.4770313501358032, + "learning_rate": 9.079886444286728e-05, + "loss": 0.03467918932437897, + "step": 64850 + }, + { + "epoch": 9.206529453513129, + "grad_norm": 3.841900110244751, + "learning_rate": 9.079744499645138e-05, + "loss": 0.02755351662635803, + "step": 64860 + }, + { + "epoch": 9.207948899929027, + "grad_norm": 5.2558112144470215, + "learning_rate": 9.079602555003549e-05, + "loss": 0.018217019736766815, + "step": 64870 + }, + { + "epoch": 9.209368346344926, + 
"grad_norm": 13.552385330200195, + "learning_rate": 9.079460610361959e-05, + "loss": 0.04716223478317261, + "step": 64880 + }, + { + "epoch": 9.210787792760824, + "grad_norm": 11.82503890991211, + "learning_rate": 9.07931866572037e-05, + "loss": 0.03754045367240906, + "step": 64890 + }, + { + "epoch": 9.21220723917672, + "grad_norm": 0.2998197078704834, + "learning_rate": 9.07917672107878e-05, + "loss": 0.04432125985622406, + "step": 64900 + }, + { + "epoch": 9.213626685592619, + "grad_norm": 0.033697254955768585, + "learning_rate": 9.07903477643719e-05, + "loss": 0.017217373847961424, + "step": 64910 + }, + { + "epoch": 9.215046132008517, + "grad_norm": 0.2636561989784241, + "learning_rate": 9.0788928317956e-05, + "loss": 0.030328923463821413, + "step": 64920 + }, + { + "epoch": 9.216465578424414, + "grad_norm": 2.3679654598236084, + "learning_rate": 9.07875088715401e-05, + "loss": 0.04711674749851227, + "step": 64930 + }, + { + "epoch": 9.217885024840312, + "grad_norm": 5.013309478759766, + "learning_rate": 9.078608942512421e-05, + "loss": 0.07335137724876403, + "step": 64940 + }, + { + "epoch": 9.21930447125621, + "grad_norm": 2.416539430618286, + "learning_rate": 9.07846699787083e-05, + "loss": 0.051906025409698485, + "step": 64950 + }, + { + "epoch": 9.220723917672109, + "grad_norm": 0.818121075630188, + "learning_rate": 9.078325053229241e-05, + "loss": 0.04151787757873535, + "step": 64960 + }, + { + "epoch": 9.222143364088005, + "grad_norm": 0.08963953703641891, + "learning_rate": 9.07818310858765e-05, + "loss": 0.03505766987800598, + "step": 64970 + }, + { + "epoch": 9.223562810503903, + "grad_norm": 0.3595544695854187, + "learning_rate": 9.078041163946062e-05, + "loss": 0.041363495588302615, + "step": 64980 + }, + { + "epoch": 9.224982256919802, + "grad_norm": 0.8693178296089172, + "learning_rate": 9.077899219304472e-05, + "loss": 0.018786983191967012, + "step": 64990 + }, + { + "epoch": 9.2264017033357, + "grad_norm": 0.8593055009841919, + "learning_rate": 
9.077757274662881e-05, + "loss": 0.042645350098609924, + "step": 65000 + }, + { + "epoch": 9.2264017033357, + "eval_accuracy": 0.981814713549946, + "eval_loss": 0.056301869451999664, + "eval_runtime": 32.8156, + "eval_samples_per_second": 479.253, + "eval_steps_per_second": 14.993, + "step": 65000 + }, + { + "epoch": 9.227821149751597, + "grad_norm": 0.27893632650375366, + "learning_rate": 9.077615330021292e-05, + "loss": 0.034988516569137575, + "step": 65010 + }, + { + "epoch": 9.229240596167495, + "grad_norm": 5.20842981338501, + "learning_rate": 9.077473385379702e-05, + "loss": 0.030495092272758484, + "step": 65020 + }, + { + "epoch": 9.230660042583393, + "grad_norm": 4.230602264404297, + "learning_rate": 9.077331440738113e-05, + "loss": 0.025418007373809816, + "step": 65030 + }, + { + "epoch": 9.23207948899929, + "grad_norm": 0.7757551074028015, + "learning_rate": 9.077189496096523e-05, + "loss": 0.09136197566986085, + "step": 65040 + }, + { + "epoch": 9.233498935415188, + "grad_norm": 0.16610698401927948, + "learning_rate": 9.077047551454933e-05, + "loss": 0.04587730467319488, + "step": 65050 + }, + { + "epoch": 9.234918381831086, + "grad_norm": 0.07457069307565689, + "learning_rate": 9.076905606813342e-05, + "loss": 0.01524857133626938, + "step": 65060 + }, + { + "epoch": 9.236337828246985, + "grad_norm": 2.2226922512054443, + "learning_rate": 9.076763662171753e-05, + "loss": 0.01971132159233093, + "step": 65070 + }, + { + "epoch": 9.237757274662881, + "grad_norm": 6.34228515625, + "learning_rate": 9.076621717530163e-05, + "loss": 0.03835614919662476, + "step": 65080 + }, + { + "epoch": 9.23917672107878, + "grad_norm": 9.732237815856934, + "learning_rate": 9.076479772888574e-05, + "loss": 0.06072259545326233, + "step": 65090 + }, + { + "epoch": 9.240596167494678, + "grad_norm": 0.5269761681556702, + "learning_rate": 9.076337828246984e-05, + "loss": 0.023173244297504426, + "step": 65100 + }, + { + "epoch": 9.242015613910574, + "grad_norm": 3.5085811614990234, 
+ "learning_rate": 9.076195883605394e-05, + "loss": 0.023165860772132875, + "step": 65110 + }, + { + "epoch": 9.243435060326473, + "grad_norm": 9.514446258544922, + "learning_rate": 9.076053938963805e-05, + "loss": 0.02069098949432373, + "step": 65120 + }, + { + "epoch": 9.24485450674237, + "grad_norm": 0.27682217955589294, + "learning_rate": 9.075911994322215e-05, + "loss": 0.01605180650949478, + "step": 65130 + }, + { + "epoch": 9.24627395315827, + "grad_norm": 0.49888181686401367, + "learning_rate": 9.075770049680626e-05, + "loss": 0.05236906409263611, + "step": 65140 + }, + { + "epoch": 9.247693399574166, + "grad_norm": 6.699531555175781, + "learning_rate": 9.075628105039035e-05, + "loss": 0.03403179347515106, + "step": 65150 + }, + { + "epoch": 9.249112845990064, + "grad_norm": 4.958370208740234, + "learning_rate": 9.075486160397445e-05, + "loss": 0.0508464515209198, + "step": 65160 + }, + { + "epoch": 9.250532292405962, + "grad_norm": 3.9333159923553467, + "learning_rate": 9.075358410220015e-05, + "loss": 0.12287168502807617, + "step": 65170 + }, + { + "epoch": 9.251951738821859, + "grad_norm": 4.830854892730713, + "learning_rate": 9.075216465578425e-05, + "loss": 0.034806248545646665, + "step": 65180 + }, + { + "epoch": 9.253371185237757, + "grad_norm": 6.123739719390869, + "learning_rate": 9.075074520936834e-05, + "loss": 0.03453691303730011, + "step": 65190 + }, + { + "epoch": 9.254790631653655, + "grad_norm": 0.9723725318908691, + "learning_rate": 9.074932576295246e-05, + "loss": 0.036412373185157776, + "step": 65200 + }, + { + "epoch": 9.256210078069554, + "grad_norm": 0.0696854367852211, + "learning_rate": 9.074790631653655e-05, + "loss": 0.03546989262104035, + "step": 65210 + }, + { + "epoch": 9.25762952448545, + "grad_norm": 4.158839702606201, + "learning_rate": 9.074648687012066e-05, + "loss": 0.05229751467704773, + "step": 65220 + }, + { + "epoch": 9.259048970901349, + "grad_norm": 5.570679664611816, + "learning_rate": 9.074506742370476e-05, + 
"loss": 0.05541685223579407, + "step": 65230 + }, + { + "epoch": 9.260468417317247, + "grad_norm": 0.8031320571899414, + "learning_rate": 9.074364797728886e-05, + "loss": 0.024064990878105163, + "step": 65240 + }, + { + "epoch": 9.261887863733143, + "grad_norm": 1.7579777240753174, + "learning_rate": 9.074222853087297e-05, + "loss": 0.08455089926719665, + "step": 65250 + }, + { + "epoch": 9.263307310149042, + "grad_norm": 0.7289173007011414, + "learning_rate": 9.074080908445707e-05, + "loss": 0.022726500034332277, + "step": 65260 + }, + { + "epoch": 9.26472675656494, + "grad_norm": 5.2052459716796875, + "learning_rate": 9.073938963804118e-05, + "loss": 0.017752929031848906, + "step": 65270 + }, + { + "epoch": 9.266146202980838, + "grad_norm": 2.6849477291107178, + "learning_rate": 9.073797019162526e-05, + "loss": 0.0206844300031662, + "step": 65280 + }, + { + "epoch": 9.267565649396735, + "grad_norm": 0.08539886772632599, + "learning_rate": 9.073655074520937e-05, + "loss": 0.03450865149497986, + "step": 65290 + }, + { + "epoch": 9.268985095812633, + "grad_norm": 5.187475681304932, + "learning_rate": 9.073513129879347e-05, + "loss": 0.054791712760925294, + "step": 65300 + }, + { + "epoch": 9.270404542228531, + "grad_norm": 1.5241824388504028, + "learning_rate": 9.073371185237758e-05, + "loss": 0.06886662840843201, + "step": 65310 + }, + { + "epoch": 9.271823988644428, + "grad_norm": 9.872211456298828, + "learning_rate": 9.073229240596168e-05, + "loss": 0.03144813776016235, + "step": 65320 + }, + { + "epoch": 9.273243435060326, + "grad_norm": 3.142368793487549, + "learning_rate": 9.073087295954578e-05, + "loss": 0.010226437449455261, + "step": 65330 + }, + { + "epoch": 9.274662881476225, + "grad_norm": 1.4327269792556763, + "learning_rate": 9.072945351312989e-05, + "loss": 0.032859033346176146, + "step": 65340 + }, + { + "epoch": 9.276082327892123, + "grad_norm": 2.3611676692962646, + "learning_rate": 9.072803406671398e-05, + "loss": 0.013543438911437989, + "step": 
65350 + }, + { + "epoch": 9.27750177430802, + "grad_norm": 0.41527459025382996, + "learning_rate": 9.07266146202981e-05, + "loss": 0.02646762728691101, + "step": 65360 + }, + { + "epoch": 9.278921220723918, + "grad_norm": 0.8477177619934082, + "learning_rate": 9.072519517388219e-05, + "loss": 0.012412407249212266, + "step": 65370 + }, + { + "epoch": 9.280340667139816, + "grad_norm": 0.10813146084547043, + "learning_rate": 9.072377572746629e-05, + "loss": 0.04115345776081085, + "step": 65380 + }, + { + "epoch": 9.281760113555713, + "grad_norm": 0.7527191638946533, + "learning_rate": 9.072235628105039e-05, + "loss": 0.02868193984031677, + "step": 65390 + }, + { + "epoch": 9.283179559971611, + "grad_norm": 0.4436936378479004, + "learning_rate": 9.07209368346345e-05, + "loss": 0.009298932552337647, + "step": 65400 + }, + { + "epoch": 9.28459900638751, + "grad_norm": 1.910849690437317, + "learning_rate": 9.07195173882186e-05, + "loss": 0.024878501892089844, + "step": 65410 + }, + { + "epoch": 9.286018452803408, + "grad_norm": 0.1165904849767685, + "learning_rate": 9.07180979418027e-05, + "loss": 0.011494255065917969, + "step": 65420 + }, + { + "epoch": 9.287437899219304, + "grad_norm": 9.035189628601074, + "learning_rate": 9.07166784953868e-05, + "loss": 0.054299116134643555, + "step": 65430 + }, + { + "epoch": 9.288857345635202, + "grad_norm": 0.0229057427495718, + "learning_rate": 9.07152590489709e-05, + "loss": 0.038271555304527284, + "step": 65440 + }, + { + "epoch": 9.2902767920511, + "grad_norm": 3.428968667984009, + "learning_rate": 9.071383960255501e-05, + "loss": 0.022773563861846924, + "step": 65450 + }, + { + "epoch": 9.291696238466997, + "grad_norm": 1.1164143085479736, + "learning_rate": 9.071242015613911e-05, + "loss": 0.04025732278823853, + "step": 65460 + }, + { + "epoch": 9.293115684882896, + "grad_norm": 3.3081722259521484, + "learning_rate": 9.071100070972322e-05, + "loss": 0.05027411580085754, + "step": 65470 + }, + { + "epoch": 9.294535131298794, + 
"grad_norm": 0.058389388024806976, + "learning_rate": 9.070958126330732e-05, + "loss": 0.01988658607006073, + "step": 65480 + }, + { + "epoch": 9.295954577714692, + "grad_norm": 0.3912332355976105, + "learning_rate": 9.070816181689142e-05, + "loss": 0.03847215473651886, + "step": 65490 + }, + { + "epoch": 9.297374024130589, + "grad_norm": 3.076023578643799, + "learning_rate": 9.070674237047551e-05, + "loss": 0.036669176816940305, + "step": 65500 + }, + { + "epoch": 9.297374024130589, + "eval_accuracy": 0.9775545240668914, + "eval_loss": 0.07455466687679291, + "eval_runtime": 33.3145, + "eval_samples_per_second": 472.077, + "eval_steps_per_second": 14.768, + "step": 65500 + }, + { + "epoch": 9.298793470546487, + "grad_norm": 6.249359607696533, + "learning_rate": 9.070532292405962e-05, + "loss": 0.05166963934898376, + "step": 65510 + }, + { + "epoch": 9.300212916962385, + "grad_norm": 3.9852261543273926, + "learning_rate": 9.070390347764372e-05, + "loss": 0.016864025592803956, + "step": 65520 + }, + { + "epoch": 9.301632363378282, + "grad_norm": 8.56318473815918, + "learning_rate": 9.070248403122783e-05, + "loss": 0.060342812538146974, + "step": 65530 + }, + { + "epoch": 9.30305180979418, + "grad_norm": 0.9962877035140991, + "learning_rate": 9.070106458481193e-05, + "loss": 0.030284777283668518, + "step": 65540 + }, + { + "epoch": 9.304471256210078, + "grad_norm": 0.08095641434192657, + "learning_rate": 9.069964513839603e-05, + "loss": 0.011772031337022782, + "step": 65550 + }, + { + "epoch": 9.305890702625977, + "grad_norm": 3.3373711109161377, + "learning_rate": 9.069822569198014e-05, + "loss": 0.026495721936225892, + "step": 65560 + }, + { + "epoch": 9.307310149041873, + "grad_norm": 8.643132209777832, + "learning_rate": 9.069680624556424e-05, + "loss": 0.060178011655807495, + "step": 65570 + }, + { + "epoch": 9.308729595457772, + "grad_norm": 1.415281891822815, + "learning_rate": 9.069538679914835e-05, + "loss": 0.013044501841068267, + "step": 65580 + }, + { + 
"epoch": 9.31014904187367, + "grad_norm": 5.128058433532715, + "learning_rate": 9.069396735273243e-05, + "loss": 0.033860421180725096, + "step": 65590 + }, + { + "epoch": 9.311568488289566, + "grad_norm": 0.04642212390899658, + "learning_rate": 9.069254790631654e-05, + "loss": 0.041500359773635864, + "step": 65600 + }, + { + "epoch": 9.312987934705465, + "grad_norm": 0.04170290380716324, + "learning_rate": 9.069112845990064e-05, + "loss": 0.0672379732131958, + "step": 65610 + }, + { + "epoch": 9.314407381121363, + "grad_norm": 0.4078945517539978, + "learning_rate": 9.068970901348475e-05, + "loss": 0.018945935368537902, + "step": 65620 + }, + { + "epoch": 9.315826827537261, + "grad_norm": 6.94992733001709, + "learning_rate": 9.068828956706885e-05, + "loss": 0.04940144419670105, + "step": 65630 + }, + { + "epoch": 9.317246273953158, + "grad_norm": 0.44048407673835754, + "learning_rate": 9.068687012065294e-05, + "loss": 0.03995745182037354, + "step": 65640 + }, + { + "epoch": 9.318665720369056, + "grad_norm": 4.295497894287109, + "learning_rate": 9.068545067423705e-05, + "loss": 0.02688758969306946, + "step": 65650 + }, + { + "epoch": 9.320085166784954, + "grad_norm": 5.7648396492004395, + "learning_rate": 9.068403122782115e-05, + "loss": 0.013780666887760163, + "step": 65660 + }, + { + "epoch": 9.321504613200851, + "grad_norm": 0.3595585227012634, + "learning_rate": 9.068261178140526e-05, + "loss": 0.0018398284912109375, + "step": 65670 + }, + { + "epoch": 9.32292405961675, + "grad_norm": 0.18886813521385193, + "learning_rate": 9.068119233498936e-05, + "loss": 0.014816860854625701, + "step": 65680 + }, + { + "epoch": 9.324343506032648, + "grad_norm": 1.3162140846252441, + "learning_rate": 9.067977288857346e-05, + "loss": 0.019825367629528044, + "step": 65690 + }, + { + "epoch": 9.325762952448546, + "grad_norm": 1.3076903820037842, + "learning_rate": 9.067835344215756e-05, + "loss": 0.008933990448713302, + "step": 65700 + }, + { + "epoch": 9.327182398864442, + 
"grad_norm": 7.359697341918945, + "learning_rate": 9.067693399574167e-05, + "loss": 0.04452669322490692, + "step": 65710 + }, + { + "epoch": 9.32860184528034, + "grad_norm": 0.31702089309692383, + "learning_rate": 9.067551454932576e-05, + "loss": 0.05820360779762268, + "step": 65720 + }, + { + "epoch": 9.330021291696239, + "grad_norm": 0.04736144468188286, + "learning_rate": 9.067409510290987e-05, + "loss": 0.055652981996536253, + "step": 65730 + }, + { + "epoch": 9.331440738112136, + "grad_norm": 0.2538986802101135, + "learning_rate": 9.067267565649397e-05, + "loss": 0.020092563331127168, + "step": 65740 + }, + { + "epoch": 9.332860184528034, + "grad_norm": 10.348010063171387, + "learning_rate": 9.067125621007807e-05, + "loss": 0.05498163104057312, + "step": 65750 + }, + { + "epoch": 9.334279630943932, + "grad_norm": 10.26076889038086, + "learning_rate": 9.066983676366218e-05, + "loss": 0.06374727487564087, + "step": 65760 + }, + { + "epoch": 9.33569907735983, + "grad_norm": 0.08166330307722092, + "learning_rate": 9.066841731724628e-05, + "loss": 0.03318239748477936, + "step": 65770 + }, + { + "epoch": 9.337118523775727, + "grad_norm": 2.647860288619995, + "learning_rate": 9.066699787083039e-05, + "loss": 0.03421743810176849, + "step": 65780 + }, + { + "epoch": 9.338537970191625, + "grad_norm": 0.08706337213516235, + "learning_rate": 9.066557842441449e-05, + "loss": 0.02070632576942444, + "step": 65790 + }, + { + "epoch": 9.339957416607524, + "grad_norm": 1.6088865995407104, + "learning_rate": 9.066415897799858e-05, + "loss": 0.0166518896818161, + "step": 65800 + }, + { + "epoch": 9.34137686302342, + "grad_norm": 0.1422835737466812, + "learning_rate": 9.066273953158268e-05, + "loss": 0.05076624751091004, + "step": 65810 + }, + { + "epoch": 9.342796309439318, + "grad_norm": 0.6648264527320862, + "learning_rate": 9.066132008516679e-05, + "loss": 0.023625385761260987, + "step": 65820 + }, + { + "epoch": 9.344215755855217, + "grad_norm": 3.993276357650757, + 
"learning_rate": 9.065990063875089e-05, + "loss": 0.01901327967643738, + "step": 65830 + }, + { + "epoch": 9.345635202271115, + "grad_norm": 5.106206893920898, + "learning_rate": 9.0658481192335e-05, + "loss": 0.03103363811969757, + "step": 65840 + }, + { + "epoch": 9.347054648687012, + "grad_norm": 0.7105145454406738, + "learning_rate": 9.06570617459191e-05, + "loss": 0.04525960385799408, + "step": 65850 + }, + { + "epoch": 9.34847409510291, + "grad_norm": 2.2576792240142822, + "learning_rate": 9.06556422995032e-05, + "loss": 0.06363716125488281, + "step": 65860 + }, + { + "epoch": 9.349893541518808, + "grad_norm": 0.34290191531181335, + "learning_rate": 9.06542228530873e-05, + "loss": 0.0342064768075943, + "step": 65870 + }, + { + "epoch": 9.351312987934705, + "grad_norm": 6.060485363006592, + "learning_rate": 9.06528034066714e-05, + "loss": 0.020967322587966918, + "step": 65880 + }, + { + "epoch": 9.352732434350603, + "grad_norm": 2.8930869102478027, + "learning_rate": 9.065138396025551e-05, + "loss": 0.053651803731918336, + "step": 65890 + }, + { + "epoch": 9.354151880766501, + "grad_norm": 5.113282680511475, + "learning_rate": 9.06499645138396e-05, + "loss": 0.04868954718112946, + "step": 65900 + }, + { + "epoch": 9.3555713271824, + "grad_norm": 9.573057174682617, + "learning_rate": 9.064854506742371e-05, + "loss": 0.054132658243179324, + "step": 65910 + }, + { + "epoch": 9.356990773598296, + "grad_norm": 2.009981870651245, + "learning_rate": 9.06471256210078e-05, + "loss": 0.036932623386383055, + "step": 65920 + }, + { + "epoch": 9.358410220014195, + "grad_norm": 0.15597857534885406, + "learning_rate": 9.064570617459192e-05, + "loss": 0.014807553589344024, + "step": 65930 + }, + { + "epoch": 9.359829666430093, + "grad_norm": 0.6595566272735596, + "learning_rate": 9.064428672817601e-05, + "loss": 0.02623018026351929, + "step": 65940 + }, + { + "epoch": 9.36124911284599, + "grad_norm": 1.553436517715454, + "learning_rate": 9.064286728176011e-05, + "loss": 
0.02999439835548401, + "step": 65950 + }, + { + "epoch": 9.362668559261888, + "grad_norm": 7.259316444396973, + "learning_rate": 9.064144783534422e-05, + "loss": 0.031955486536026, + "step": 65960 + }, + { + "epoch": 9.364088005677786, + "grad_norm": 0.7037152647972107, + "learning_rate": 9.064002838892832e-05, + "loss": 0.0743894875049591, + "step": 65970 + }, + { + "epoch": 9.365507452093684, + "grad_norm": 2.922961473464966, + "learning_rate": 9.063860894251243e-05, + "loss": 0.05946826934814453, + "step": 65980 + }, + { + "epoch": 9.36692689850958, + "grad_norm": 6.301733016967773, + "learning_rate": 9.063718949609653e-05, + "loss": 0.026286065578460693, + "step": 65990 + }, + { + "epoch": 9.36834634492548, + "grad_norm": 12.488518714904785, + "learning_rate": 9.063577004968063e-05, + "loss": 0.05177741050720215, + "step": 66000 + }, + { + "epoch": 9.36834634492548, + "eval_accuracy": 0.9830864119030965, + "eval_loss": 0.05414308235049248, + "eval_runtime": 32.7903, + "eval_samples_per_second": 479.623, + "eval_steps_per_second": 15.004, + "step": 66000 + }, + { + "epoch": 9.369765791341377, + "grad_norm": 5.741583824157715, + "learning_rate": 9.063435060326472e-05, + "loss": 0.022419145703315733, + "step": 66010 + }, + { + "epoch": 9.371185237757274, + "grad_norm": 1.4365832805633545, + "learning_rate": 9.063293115684883e-05, + "loss": 0.03936209380626678, + "step": 66020 + }, + { + "epoch": 9.372604684173172, + "grad_norm": 3.806220531463623, + "learning_rate": 9.063151171043293e-05, + "loss": 0.040252089500427246, + "step": 66030 + }, + { + "epoch": 9.37402413058907, + "grad_norm": 0.21159783005714417, + "learning_rate": 9.063009226401704e-05, + "loss": 0.043931758403778075, + "step": 66040 + }, + { + "epoch": 9.375443577004969, + "grad_norm": 0.23096901178359985, + "learning_rate": 9.062867281760114e-05, + "loss": 0.04873138964176178, + "step": 66050 + }, + { + "epoch": 9.376863023420865, + "grad_norm": 0.8391821384429932, + "learning_rate": 
9.062725337118524e-05, + "loss": 0.026393789052963256, + "step": 66060 + }, + { + "epoch": 9.378282469836764, + "grad_norm": 0.03785236179828644, + "learning_rate": 9.062583392476935e-05, + "loss": 0.036301881074905396, + "step": 66070 + }, + { + "epoch": 9.379701916252662, + "grad_norm": 0.046141933649778366, + "learning_rate": 9.062441447835345e-05, + "loss": 0.06375334262847901, + "step": 66080 + }, + { + "epoch": 9.381121362668559, + "grad_norm": 6.387070178985596, + "learning_rate": 9.062299503193756e-05, + "loss": 0.03702278733253479, + "step": 66090 + }, + { + "epoch": 9.382540809084457, + "grad_norm": 0.19831642508506775, + "learning_rate": 9.062157558552165e-05, + "loss": 0.017668356001377106, + "step": 66100 + }, + { + "epoch": 9.383960255500355, + "grad_norm": 4.372278213500977, + "learning_rate": 9.062015613910575e-05, + "loss": 0.025728854537010192, + "step": 66110 + }, + { + "epoch": 9.385379701916253, + "grad_norm": 9.222113609313965, + "learning_rate": 9.061873669268985e-05, + "loss": 0.04258951544761658, + "step": 66120 + }, + { + "epoch": 9.38679914833215, + "grad_norm": 8.19491958618164, + "learning_rate": 9.061731724627396e-05, + "loss": 0.02233922928571701, + "step": 66130 + }, + { + "epoch": 9.388218594748048, + "grad_norm": 4.794766902923584, + "learning_rate": 9.061589779985806e-05, + "loss": 0.06196138858795166, + "step": 66140 + }, + { + "epoch": 9.389638041163947, + "grad_norm": 3.5216760635375977, + "learning_rate": 9.061447835344217e-05, + "loss": 0.010813835263252258, + "step": 66150 + }, + { + "epoch": 9.391057487579843, + "grad_norm": 0.24806861579418182, + "learning_rate": 9.061305890702626e-05, + "loss": 0.020065225660800934, + "step": 66160 + }, + { + "epoch": 9.392476933995741, + "grad_norm": 0.3911595344543457, + "learning_rate": 9.061163946061036e-05, + "loss": 0.0663827896118164, + "step": 66170 + }, + { + "epoch": 9.39389638041164, + "grad_norm": 1.4187536239624023, + "learning_rate": 9.061022001419447e-05, + "loss": 
0.010493065416812896, + "step": 66180 + }, + { + "epoch": 9.395315826827538, + "grad_norm": 0.6327175498008728, + "learning_rate": 9.060880056777857e-05, + "loss": 0.026095515489578246, + "step": 66190 + }, + { + "epoch": 9.396735273243435, + "grad_norm": 4.311141490936279, + "learning_rate": 9.060738112136268e-05, + "loss": 0.022125279903411864, + "step": 66200 + }, + { + "epoch": 9.398154719659333, + "grad_norm": 0.7823535203933716, + "learning_rate": 9.060596167494677e-05, + "loss": 0.08423017859458923, + "step": 66210 + }, + { + "epoch": 9.399574166075231, + "grad_norm": 2.879866361618042, + "learning_rate": 9.060454222853088e-05, + "loss": 0.055038821697235105, + "step": 66220 + }, + { + "epoch": 9.400993612491128, + "grad_norm": 0.12221132963895798, + "learning_rate": 9.060312278211497e-05, + "loss": 0.05022150278091431, + "step": 66230 + }, + { + "epoch": 9.402413058907026, + "grad_norm": 5.080928325653076, + "learning_rate": 9.060170333569908e-05, + "loss": 0.05644909143447876, + "step": 66240 + }, + { + "epoch": 9.403832505322924, + "grad_norm": 1.559996485710144, + "learning_rate": 9.060028388928318e-05, + "loss": 0.052948832511901855, + "step": 66250 + }, + { + "epoch": 9.405251951738823, + "grad_norm": 0.12736868858337402, + "learning_rate": 9.059886444286728e-05, + "loss": 0.009814755618572235, + "step": 66260 + }, + { + "epoch": 9.40667139815472, + "grad_norm": 0.41833576560020447, + "learning_rate": 9.059744499645139e-05, + "loss": 0.06674405336380004, + "step": 66270 + }, + { + "epoch": 9.408090844570618, + "grad_norm": 0.0420779325067997, + "learning_rate": 9.059602555003549e-05, + "loss": 0.010850544273853301, + "step": 66280 + }, + { + "epoch": 9.409510290986516, + "grad_norm": 7.900463104248047, + "learning_rate": 9.05946061036196e-05, + "loss": 0.026490825414657592, + "step": 66290 + }, + { + "epoch": 9.410929737402412, + "grad_norm": 2.7261571884155273, + "learning_rate": 9.05931866572037e-05, + "loss": 0.039139583706855774, + "step": 66300 + 
}, + { + "epoch": 9.41234918381831, + "grad_norm": 1.2342069149017334, + "learning_rate": 9.059176721078779e-05, + "loss": 0.06747770309448242, + "step": 66310 + }, + { + "epoch": 9.413768630234209, + "grad_norm": 2.4731056690216064, + "learning_rate": 9.059034776437189e-05, + "loss": 0.10458157062530518, + "step": 66320 + }, + { + "epoch": 9.415188076650107, + "grad_norm": 8.4277982711792, + "learning_rate": 9.0588928317956e-05, + "loss": 0.09690378308296203, + "step": 66330 + }, + { + "epoch": 9.416607523066004, + "grad_norm": 5.209705829620361, + "learning_rate": 9.05875088715401e-05, + "loss": 0.03505201935768128, + "step": 66340 + }, + { + "epoch": 9.418026969481902, + "grad_norm": 7.857044219970703, + "learning_rate": 9.058608942512421e-05, + "loss": 0.060685038566589355, + "step": 66350 + }, + { + "epoch": 9.4194464158978, + "grad_norm": 0.43560779094696045, + "learning_rate": 9.058466997870831e-05, + "loss": 0.058388322591781616, + "step": 66360 + }, + { + "epoch": 9.420865862313697, + "grad_norm": 0.05114549398422241, + "learning_rate": 9.05832505322924e-05, + "loss": 0.019231802225112914, + "step": 66370 + }, + { + "epoch": 9.422285308729595, + "grad_norm": 9.367727279663086, + "learning_rate": 9.058183108587652e-05, + "loss": 0.038724538683891294, + "step": 66380 + }, + { + "epoch": 9.423704755145494, + "grad_norm": 9.952641487121582, + "learning_rate": 9.058041163946061e-05, + "loss": 0.03757392466068268, + "step": 66390 + }, + { + "epoch": 9.425124201561392, + "grad_norm": 0.6738945245742798, + "learning_rate": 9.057899219304472e-05, + "loss": 0.04065401554107666, + "step": 66400 + }, + { + "epoch": 9.426543647977288, + "grad_norm": 7.547946453094482, + "learning_rate": 9.057757274662881e-05, + "loss": 0.052340525388717654, + "step": 66410 + }, + { + "epoch": 9.427963094393187, + "grad_norm": 3.0092594623565674, + "learning_rate": 9.057615330021292e-05, + "loss": 0.012151481211185455, + "step": 66420 + }, + { + "epoch": 9.429382540809085, + 
"grad_norm": 1.3092892169952393, + "learning_rate": 9.057473385379702e-05, + "loss": 0.011581452190876007, + "step": 66430 + }, + { + "epoch": 9.430801987224982, + "grad_norm": 3.9269165992736816, + "learning_rate": 9.057331440738113e-05, + "loss": 0.03567465841770172, + "step": 66440 + }, + { + "epoch": 9.43222143364088, + "grad_norm": 6.909012794494629, + "learning_rate": 9.057189496096524e-05, + "loss": 0.008584094047546387, + "step": 66450 + }, + { + "epoch": 9.433640880056778, + "grad_norm": 8.529704093933105, + "learning_rate": 9.057047551454934e-05, + "loss": 0.08972499370574952, + "step": 66460 + }, + { + "epoch": 9.435060326472676, + "grad_norm": 0.0556497648358345, + "learning_rate": 9.056905606813343e-05, + "loss": 0.04341354668140411, + "step": 66470 + }, + { + "epoch": 9.436479772888573, + "grad_norm": 0.9770888686180115, + "learning_rate": 9.056763662171753e-05, + "loss": 0.010035674273967742, + "step": 66480 + }, + { + "epoch": 9.437899219304471, + "grad_norm": 1.1458122730255127, + "learning_rate": 9.056621717530164e-05, + "loss": 0.014818742871284485, + "step": 66490 + }, + { + "epoch": 9.43931866572037, + "grad_norm": 0.08403871953487396, + "learning_rate": 9.056479772888574e-05, + "loss": 0.023149700462818147, + "step": 66500 + }, + { + "epoch": 9.43931866572037, + "eval_accuracy": 0.9823233928912062, + "eval_loss": 0.06090559810400009, + "eval_runtime": 32.9512, + "eval_samples_per_second": 477.282, + "eval_steps_per_second": 14.931, + "step": 66500 + }, + { + "epoch": 9.440738112136266, + "grad_norm": 0.5672199726104736, + "learning_rate": 9.056337828246985e-05, + "loss": 0.005511279031634331, + "step": 66510 + }, + { + "epoch": 9.442157558552164, + "grad_norm": 0.029063401743769646, + "learning_rate": 9.056195883605393e-05, + "loss": 0.018873117864131927, + "step": 66520 + }, + { + "epoch": 9.443577004968063, + "grad_norm": 16.47539520263672, + "learning_rate": 9.056053938963804e-05, + "loss": 0.032866987586021426, + "step": 66530 + }, + { + 
"epoch": 9.444996451383961, + "grad_norm": 5.749122142791748, + "learning_rate": 9.055911994322215e-05, + "loss": 0.0254135400056839, + "step": 66540 + }, + { + "epoch": 9.446415897799858, + "grad_norm": 11.743706703186035, + "learning_rate": 9.055770049680625e-05, + "loss": 0.06181545257568359, + "step": 66550 + }, + { + "epoch": 9.447835344215756, + "grad_norm": 4.427557945251465, + "learning_rate": 9.055628105039036e-05, + "loss": 0.04300175309181213, + "step": 66560 + }, + { + "epoch": 9.449254790631654, + "grad_norm": 0.4895526170730591, + "learning_rate": 9.055486160397445e-05, + "loss": 0.006782171130180359, + "step": 66570 + }, + { + "epoch": 9.45067423704755, + "grad_norm": 6.674429893493652, + "learning_rate": 9.055344215755856e-05, + "loss": 0.028919672966003417, + "step": 66580 + }, + { + "epoch": 9.452093683463449, + "grad_norm": 2.419039487838745, + "learning_rate": 9.055202271114266e-05, + "loss": 0.07479876279830933, + "step": 66590 + }, + { + "epoch": 9.453513129879347, + "grad_norm": 0.10058147460222244, + "learning_rate": 9.055060326472677e-05, + "loss": 0.020939281582832335, + "step": 66600 + }, + { + "epoch": 9.454932576295246, + "grad_norm": 0.43330129981040955, + "learning_rate": 9.054918381831086e-05, + "loss": 0.027914851903915405, + "step": 66610 + }, + { + "epoch": 9.456352022711142, + "grad_norm": 3.174302339553833, + "learning_rate": 9.054776437189496e-05, + "loss": 0.018397895991802214, + "step": 66620 + }, + { + "epoch": 9.45777146912704, + "grad_norm": 0.3704909682273865, + "learning_rate": 9.054634492547907e-05, + "loss": 0.012019617855548859, + "step": 66630 + }, + { + "epoch": 9.459190915542939, + "grad_norm": 3.7187345027923584, + "learning_rate": 9.054492547906317e-05, + "loss": 0.016837552189826965, + "step": 66640 + }, + { + "epoch": 9.460610361958835, + "grad_norm": 2.88364315032959, + "learning_rate": 9.054350603264728e-05, + "loss": 0.0451316237449646, + "step": 66650 + }, + { + "epoch": 9.462029808374734, + "grad_norm": 
0.3306542634963989, + "learning_rate": 9.054208658623138e-05, + "loss": 0.09204464554786682, + "step": 66660 + }, + { + "epoch": 9.463449254790632, + "grad_norm": 3.2169761657714844, + "learning_rate": 9.054066713981547e-05, + "loss": 0.018502812087535857, + "step": 66670 + }, + { + "epoch": 9.46486870120653, + "grad_norm": 0.5467488765716553, + "learning_rate": 9.053924769339957e-05, + "loss": 0.01434284746646881, + "step": 66680 + }, + { + "epoch": 9.466288147622427, + "grad_norm": 4.416463851928711, + "learning_rate": 9.053782824698368e-05, + "loss": 0.03661760985851288, + "step": 66690 + }, + { + "epoch": 9.467707594038325, + "grad_norm": 0.4002857506275177, + "learning_rate": 9.053640880056778e-05, + "loss": 0.10666844844818116, + "step": 66700 + }, + { + "epoch": 9.469127040454223, + "grad_norm": 1.848854422569275, + "learning_rate": 9.053498935415189e-05, + "loss": 0.036550584435462954, + "step": 66710 + }, + { + "epoch": 9.47054648687012, + "grad_norm": 8.114850997924805, + "learning_rate": 9.053356990773599e-05, + "loss": 0.0459254264831543, + "step": 66720 + }, + { + "epoch": 9.471965933286018, + "grad_norm": 2.8109030723571777, + "learning_rate": 9.053215046132009e-05, + "loss": 0.013741156458854676, + "step": 66730 + }, + { + "epoch": 9.473385379701917, + "grad_norm": 0.42820295691490173, + "learning_rate": 9.05307310149042e-05, + "loss": 0.035875517129898074, + "step": 66740 + }, + { + "epoch": 9.474804826117815, + "grad_norm": 1.107942819595337, + "learning_rate": 9.05293115684883e-05, + "loss": 0.018605363368988038, + "step": 66750 + }, + { + "epoch": 9.476224272533711, + "grad_norm": 0.3636723458766937, + "learning_rate": 9.05278921220724e-05, + "loss": 0.09011185765266419, + "step": 66760 + }, + { + "epoch": 9.47764371894961, + "grad_norm": 4.078773021697998, + "learning_rate": 9.052647267565649e-05, + "loss": 0.06534717082977295, + "step": 66770 + }, + { + "epoch": 9.479063165365508, + "grad_norm": 6.753770351409912, + "learning_rate": 
9.05250532292406e-05, + "loss": 0.05468626618385315, + "step": 66780 + }, + { + "epoch": 9.480482611781405, + "grad_norm": 0.03908902779221535, + "learning_rate": 9.05236337828247e-05, + "loss": 0.030472838878631593, + "step": 66790 + }, + { + "epoch": 9.481902058197303, + "grad_norm": 0.09945162385702133, + "learning_rate": 9.052221433640881e-05, + "loss": 0.019605173170566557, + "step": 66800 + }, + { + "epoch": 9.483321504613201, + "grad_norm": 2.562227964401245, + "learning_rate": 9.05207948899929e-05, + "loss": 0.04079928696155548, + "step": 66810 + }, + { + "epoch": 9.4847409510291, + "grad_norm": 6.359819412231445, + "learning_rate": 9.051937544357702e-05, + "loss": 0.03491916060447693, + "step": 66820 + }, + { + "epoch": 9.486160397444996, + "grad_norm": 1.103685975074768, + "learning_rate": 9.051795599716111e-05, + "loss": 0.045868688821792604, + "step": 66830 + }, + { + "epoch": 9.487579843860894, + "grad_norm": 3.343043088912964, + "learning_rate": 9.051653655074521e-05, + "loss": 0.060406225919723514, + "step": 66840 + }, + { + "epoch": 9.488999290276793, + "grad_norm": 7.082602500915527, + "learning_rate": 9.051511710432932e-05, + "loss": 0.05447771549224854, + "step": 66850 + }, + { + "epoch": 9.490418736692689, + "grad_norm": 0.6787410378456116, + "learning_rate": 9.051369765791342e-05, + "loss": 0.012434682250022889, + "step": 66860 + }, + { + "epoch": 9.491838183108587, + "grad_norm": 6.561841011047363, + "learning_rate": 9.051227821149753e-05, + "loss": 0.016962145268917084, + "step": 66870 + }, + { + "epoch": 9.493257629524486, + "grad_norm": 3.4125189781188965, + "learning_rate": 9.051085876508161e-05, + "loss": 0.035667648911476134, + "step": 66880 + }, + { + "epoch": 9.494677075940384, + "grad_norm": 1.578565001487732, + "learning_rate": 9.050943931866573e-05, + "loss": 0.023421129584312438, + "step": 66890 + }, + { + "epoch": 9.49609652235628, + "grad_norm": 0.06974627077579498, + "learning_rate": 9.050801987224982e-05, + "loss": 
0.0638196587562561, + "step": 66900 + }, + { + "epoch": 9.497515968772179, + "grad_norm": 0.9934460520744324, + "learning_rate": 9.050660042583393e-05, + "loss": 0.03134672641754151, + "step": 66910 + }, + { + "epoch": 9.498935415188077, + "grad_norm": 6.786892890930176, + "learning_rate": 9.050518097941803e-05, + "loss": 0.04574669897556305, + "step": 66920 + }, + { + "epoch": 9.500354861603974, + "grad_norm": 0.9338919520378113, + "learning_rate": 9.050376153300213e-05, + "loss": 0.035261183977127075, + "step": 66930 + }, + { + "epoch": 9.501774308019872, + "grad_norm": 3.783557653427124, + "learning_rate": 9.050234208658624e-05, + "loss": 0.03577567338943481, + "step": 66940 + }, + { + "epoch": 9.50319375443577, + "grad_norm": 0.2062395215034485, + "learning_rate": 9.050092264017034e-05, + "loss": 0.05642724633216858, + "step": 66950 + }, + { + "epoch": 9.504613200851669, + "grad_norm": 0.6609073281288147, + "learning_rate": 9.049950319375445e-05, + "loss": 0.035305237770080565, + "step": 66960 + }, + { + "epoch": 9.506032647267565, + "grad_norm": 0.12313273549079895, + "learning_rate": 9.049808374733855e-05, + "loss": 0.020653310418128967, + "step": 66970 + }, + { + "epoch": 9.507452093683463, + "grad_norm": 5.6952996253967285, + "learning_rate": 9.049666430092264e-05, + "loss": 0.027990663051605226, + "step": 66980 + }, + { + "epoch": 9.508871540099362, + "grad_norm": 5.661862850189209, + "learning_rate": 9.049524485450674e-05, + "loss": 0.014997878670692444, + "step": 66990 + }, + { + "epoch": 9.510290986515258, + "grad_norm": 2.4455645084381104, + "learning_rate": 9.049382540809085e-05, + "loss": 0.005252880230545998, + "step": 67000 + }, + { + "epoch": 9.510290986515258, + "eval_accuracy": 0.9795892414319324, + "eval_loss": 0.07509937882423401, + "eval_runtime": 32.595, + "eval_samples_per_second": 482.498, + "eval_steps_per_second": 15.094, + "step": 67000 + }, + { + "epoch": 9.511710432931157, + "grad_norm": 0.44350701570510864, + "learning_rate": 
9.049240596167495e-05, + "loss": 0.07043783068656921, + "step": 67010 + }, + { + "epoch": 9.513129879347055, + "grad_norm": 0.09790097177028656, + "learning_rate": 9.049098651525906e-05, + "loss": 0.03614462614059448, + "step": 67020 + }, + { + "epoch": 9.514549325762953, + "grad_norm": 4.326782703399658, + "learning_rate": 9.048956706884316e-05, + "loss": 0.055505746603012086, + "step": 67030 + }, + { + "epoch": 9.51596877217885, + "grad_norm": 1.6764984130859375, + "learning_rate": 9.048814762242725e-05, + "loss": 0.04317740499973297, + "step": 67040 + }, + { + "epoch": 9.517388218594748, + "grad_norm": 11.991434097290039, + "learning_rate": 9.048672817601136e-05, + "loss": 0.033396673202514646, + "step": 67050 + }, + { + "epoch": 9.518807665010646, + "grad_norm": 0.31179875135421753, + "learning_rate": 9.048530872959546e-05, + "loss": 0.007232289761304855, + "step": 67060 + }, + { + "epoch": 9.520227111426543, + "grad_norm": 2.5120632648468018, + "learning_rate": 9.048388928317957e-05, + "loss": 0.07684400081634521, + "step": 67070 + }, + { + "epoch": 9.521646557842441, + "grad_norm": 1.4562314748764038, + "learning_rate": 9.048246983676366e-05, + "loss": 0.026847514510154723, + "step": 67080 + }, + { + "epoch": 9.52306600425834, + "grad_norm": 1.2154396772384644, + "learning_rate": 9.048105039034777e-05, + "loss": 0.015740375220775604, + "step": 67090 + }, + { + "epoch": 9.524485450674238, + "grad_norm": 1.455774188041687, + "learning_rate": 9.047963094393187e-05, + "loss": 0.01216294914484024, + "step": 67100 + }, + { + "epoch": 9.525904897090134, + "grad_norm": 4.984959125518799, + "learning_rate": 9.047821149751598e-05, + "loss": 0.06393689513206482, + "step": 67110 + }, + { + "epoch": 9.527324343506033, + "grad_norm": 1.871148705482483, + "learning_rate": 9.047679205110007e-05, + "loss": 0.02973054051399231, + "step": 67120 + }, + { + "epoch": 9.528743789921931, + "grad_norm": 6.94720983505249, + "learning_rate": 9.047537260468417e-05, + "loss": 
0.03340931236743927, + "step": 67130 + }, + { + "epoch": 9.530163236337827, + "grad_norm": 8.067058563232422, + "learning_rate": 9.047395315826828e-05, + "loss": 0.027840656042099, + "step": 67140 + }, + { + "epoch": 9.531582682753726, + "grad_norm": 0.03339483216404915, + "learning_rate": 9.047253371185238e-05, + "loss": 0.03909151256084442, + "step": 67150 + }, + { + "epoch": 9.533002129169624, + "grad_norm": 0.03475292772054672, + "learning_rate": 9.047111426543649e-05, + "loss": 0.017502933740615845, + "step": 67160 + }, + { + "epoch": 9.534421575585522, + "grad_norm": 8.537821769714355, + "learning_rate": 9.046983676366218e-05, + "loss": 0.0501953661441803, + "step": 67170 + }, + { + "epoch": 9.535841022001419, + "grad_norm": 8.778392791748047, + "learning_rate": 9.046841731724627e-05, + "loss": 0.0493028998374939, + "step": 67180 + }, + { + "epoch": 9.537260468417317, + "grad_norm": 0.6387729644775391, + "learning_rate": 9.046699787083038e-05, + "loss": 0.021017967164516448, + "step": 67190 + }, + { + "epoch": 9.538679914833216, + "grad_norm": 0.1580241322517395, + "learning_rate": 9.04655784244145e-05, + "loss": 0.0033407047390937804, + "step": 67200 + }, + { + "epoch": 9.540099361249112, + "grad_norm": 10.30275821685791, + "learning_rate": 9.046415897799858e-05, + "loss": 0.030325162410736083, + "step": 67210 + }, + { + "epoch": 9.54151880766501, + "grad_norm": 8.054594039916992, + "learning_rate": 9.046273953158269e-05, + "loss": 0.026592501997947694, + "step": 67220 + }, + { + "epoch": 9.542938254080909, + "grad_norm": 5.245704174041748, + "learning_rate": 9.046132008516679e-05, + "loss": 0.041508796811103824, + "step": 67230 + }, + { + "epoch": 9.544357700496807, + "grad_norm": 8.013243675231934, + "learning_rate": 9.04599006387509e-05, + "loss": 0.0365505576133728, + "step": 67240 + }, + { + "epoch": 9.545777146912704, + "grad_norm": 7.659332752227783, + "learning_rate": 9.0458481192335e-05, + "loss": 0.07635741233825684, + "step": 67250 + }, + { + 
"epoch": 9.547196593328602, + "grad_norm": 8.488255500793457, + "learning_rate": 9.045706174591909e-05, + "loss": 0.017268522083759306, + "step": 67260 + }, + { + "epoch": 9.5486160397445, + "grad_norm": 0.7536658048629761, + "learning_rate": 9.045564229950319e-05, + "loss": 0.049896126985549925, + "step": 67270 + }, + { + "epoch": 9.550035486160397, + "grad_norm": 0.050682421773672104, + "learning_rate": 9.04542228530873e-05, + "loss": 0.09405158758163452, + "step": 67280 + }, + { + "epoch": 9.551454932576295, + "grad_norm": 2.3281517028808594, + "learning_rate": 9.045280340667141e-05, + "loss": 0.030408984422683714, + "step": 67290 + }, + { + "epoch": 9.552874378992193, + "grad_norm": 4.442478656768799, + "learning_rate": 9.045138396025551e-05, + "loss": 0.04440068006515503, + "step": 67300 + }, + { + "epoch": 9.554293825408092, + "grad_norm": 1.6404725313186646, + "learning_rate": 9.04499645138396e-05, + "loss": 0.057572323083877566, + "step": 67310 + }, + { + "epoch": 9.555713271823988, + "grad_norm": 0.7896831035614014, + "learning_rate": 9.04485450674237e-05, + "loss": 0.0089239239692688, + "step": 67320 + }, + { + "epoch": 9.557132718239886, + "grad_norm": 1.7920589447021484, + "learning_rate": 9.044712562100781e-05, + "loss": 0.01239323690533638, + "step": 67330 + }, + { + "epoch": 9.558552164655785, + "grad_norm": 0.457255095243454, + "learning_rate": 9.044570617459191e-05, + "loss": 0.02275240421295166, + "step": 67340 + }, + { + "epoch": 9.559971611071681, + "grad_norm": 1.4812309741973877, + "learning_rate": 9.044428672817602e-05, + "loss": 0.022527748346328737, + "step": 67350 + }, + { + "epoch": 9.56139105748758, + "grad_norm": 0.7697468400001526, + "learning_rate": 9.04428672817601e-05, + "loss": 0.022563908994197846, + "step": 67360 + }, + { + "epoch": 9.562810503903478, + "grad_norm": 0.5426087975502014, + "learning_rate": 9.044144783534422e-05, + "loss": 0.032289788126945496, + "step": 67370 + }, + { + "epoch": 9.564229950319376, + "grad_norm": 
0.11037856340408325, + "learning_rate": 9.044002838892833e-05, + "loss": 0.029405874013900758, + "step": 67380 + }, + { + "epoch": 9.565649396735273, + "grad_norm": 6.742136001586914, + "learning_rate": 9.043860894251243e-05, + "loss": 0.03677998483181, + "step": 67390 + }, + { + "epoch": 9.567068843151171, + "grad_norm": 0.12115723639726639, + "learning_rate": 9.043718949609654e-05, + "loss": 0.03549255430698395, + "step": 67400 + }, + { + "epoch": 9.56848828956707, + "grad_norm": 2.4344263076782227, + "learning_rate": 9.043577004968062e-05, + "loss": 0.025998961925506592, + "step": 67410 + }, + { + "epoch": 9.569907735982966, + "grad_norm": 9.523209571838379, + "learning_rate": 9.043435060326473e-05, + "loss": 0.05110321044921875, + "step": 67420 + }, + { + "epoch": 9.571327182398864, + "grad_norm": 8.926665306091309, + "learning_rate": 9.043293115684883e-05, + "loss": 0.038002151250839236, + "step": 67430 + }, + { + "epoch": 9.572746628814762, + "grad_norm": 7.105165958404541, + "learning_rate": 9.043151171043294e-05, + "loss": 0.022980785369873045, + "step": 67440 + }, + { + "epoch": 9.57416607523066, + "grad_norm": 4.546730041503906, + "learning_rate": 9.043009226401704e-05, + "loss": 0.04310915470123291, + "step": 67450 + }, + { + "epoch": 9.575585521646557, + "grad_norm": 0.07590577006340027, + "learning_rate": 9.042867281760113e-05, + "loss": 0.024601469933986663, + "step": 67460 + }, + { + "epoch": 9.577004968062456, + "grad_norm": 5.404637336730957, + "learning_rate": 9.042725337118525e-05, + "loss": 0.03879334032535553, + "step": 67470 + }, + { + "epoch": 9.578424414478354, + "grad_norm": 7.526176452636719, + "learning_rate": 9.042583392476934e-05, + "loss": 0.05651354193687439, + "step": 67480 + }, + { + "epoch": 9.57984386089425, + "grad_norm": 2.2813119888305664, + "learning_rate": 9.042441447835345e-05, + "loss": 0.06973368525505066, + "step": 67490 + }, + { + "epoch": 9.581263307310149, + "grad_norm": 0.2586059272289276, + "learning_rate": 
9.042299503193755e-05, + "loss": 0.01707226037979126, + "step": 67500 + }, + { + "epoch": 9.581263307310149, + "eval_accuracy": 0.9779360335728365, + "eval_loss": 0.07528796792030334, + "eval_runtime": 34.2604, + "eval_samples_per_second": 459.043, + "eval_steps_per_second": 14.361, + "step": 67500 + }, + { + "epoch": 9.582682753726047, + "grad_norm": 0.9092231392860413, + "learning_rate": 9.042157558552166e-05, + "loss": 0.057588744163513186, + "step": 67510 + }, + { + "epoch": 9.584102200141945, + "grad_norm": 0.9042927622795105, + "learning_rate": 9.042015613910575e-05, + "loss": 0.048612546920776364, + "step": 67520 + }, + { + "epoch": 9.585521646557842, + "grad_norm": 9.093894004821777, + "learning_rate": 9.041873669268986e-05, + "loss": 0.049756836891174314, + "step": 67530 + }, + { + "epoch": 9.58694109297374, + "grad_norm": 0.08922228217124939, + "learning_rate": 9.041731724627395e-05, + "loss": 0.02695835828781128, + "step": 67540 + }, + { + "epoch": 9.588360539389639, + "grad_norm": 0.0805022269487381, + "learning_rate": 9.041589779985807e-05, + "loss": 0.03669121265411377, + "step": 67550 + }, + { + "epoch": 9.589779985805535, + "grad_norm": 7.025901794433594, + "learning_rate": 9.041447835344216e-05, + "loss": 0.04222923517227173, + "step": 67560 + }, + { + "epoch": 9.591199432221433, + "grad_norm": 0.37235766649246216, + "learning_rate": 9.041305890702626e-05, + "loss": 0.05251051783561707, + "step": 67570 + }, + { + "epoch": 9.592618878637332, + "grad_norm": 0.1628137230873108, + "learning_rate": 9.041163946061037e-05, + "loss": 0.003060714900493622, + "step": 67580 + }, + { + "epoch": 9.59403832505323, + "grad_norm": 0.05945875868201256, + "learning_rate": 9.041022001419447e-05, + "loss": 0.012935033440589905, + "step": 67590 + }, + { + "epoch": 9.595457771469126, + "grad_norm": 4.107996940612793, + "learning_rate": 9.040880056777858e-05, + "loss": 0.04241987466812134, + "step": 67600 + }, + { + "epoch": 9.596877217885025, + "grad_norm": 
0.3481174111366272, + "learning_rate": 9.040738112136268e-05, + "loss": 0.04319833219051361, + "step": 67610 + }, + { + "epoch": 9.598296664300923, + "grad_norm": 0.14052341878414154, + "learning_rate": 9.040596167494677e-05, + "loss": 0.030803701281547545, + "step": 67620 + }, + { + "epoch": 9.59971611071682, + "grad_norm": 0.5670154094696045, + "learning_rate": 9.040454222853087e-05, + "loss": 0.024802103638648987, + "step": 67630 + }, + { + "epoch": 9.601135557132718, + "grad_norm": 1.3572872877120972, + "learning_rate": 9.040312278211498e-05, + "loss": 0.005655725300312042, + "step": 67640 + }, + { + "epoch": 9.602555003548616, + "grad_norm": 0.18713583052158356, + "learning_rate": 9.040170333569908e-05, + "loss": 0.02235008031129837, + "step": 67650 + }, + { + "epoch": 9.603974449964515, + "grad_norm": 0.08048121631145477, + "learning_rate": 9.040028388928319e-05, + "loss": 0.03131297528743744, + "step": 67660 + }, + { + "epoch": 9.605393896380411, + "grad_norm": 0.01172435563057661, + "learning_rate": 9.039886444286729e-05, + "loss": 0.015043826401233673, + "step": 67670 + }, + { + "epoch": 9.60681334279631, + "grad_norm": 0.03790862113237381, + "learning_rate": 9.039744499645139e-05, + "loss": 0.036029329895973204, + "step": 67680 + }, + { + "epoch": 9.608232789212208, + "grad_norm": 1.4519929885864258, + "learning_rate": 9.03960255500355e-05, + "loss": 0.02968733012676239, + "step": 67690 + }, + { + "epoch": 9.609652235628104, + "grad_norm": 6.6910552978515625, + "learning_rate": 9.03946061036196e-05, + "loss": 0.029774963855743408, + "step": 67700 + }, + { + "epoch": 9.611071682044003, + "grad_norm": 1.6743693351745605, + "learning_rate": 9.03931866572037e-05, + "loss": 0.033174237608909606, + "step": 67710 + }, + { + "epoch": 9.6124911284599, + "grad_norm": 0.9147611856460571, + "learning_rate": 9.039176721078779e-05, + "loss": 0.034508511424064636, + "step": 67720 + }, + { + "epoch": 9.6139105748758, + "grad_norm": 0.6280707716941833, + "learning_rate": 
9.03903477643719e-05, + "loss": 0.016519129276275635, + "step": 67730 + }, + { + "epoch": 9.615330021291696, + "grad_norm": 0.500525712966919, + "learning_rate": 9.0388928317956e-05, + "loss": 0.02264741063117981, + "step": 67740 + }, + { + "epoch": 9.616749467707594, + "grad_norm": 5.839907646179199, + "learning_rate": 9.038750887154011e-05, + "loss": 0.015506619215011596, + "step": 67750 + }, + { + "epoch": 9.618168914123492, + "grad_norm": 7.888493061065674, + "learning_rate": 9.03860894251242e-05, + "loss": 0.0280093789100647, + "step": 67760 + }, + { + "epoch": 9.619588360539389, + "grad_norm": 0.25900712609291077, + "learning_rate": 9.03846699787083e-05, + "loss": 0.03482568860054016, + "step": 67770 + }, + { + "epoch": 9.621007806955287, + "grad_norm": 1.0274555683135986, + "learning_rate": 9.038325053229241e-05, + "loss": 0.0375358372926712, + "step": 67780 + }, + { + "epoch": 9.622427253371185, + "grad_norm": 2.778224468231201, + "learning_rate": 9.038183108587651e-05, + "loss": 0.023729130625724792, + "step": 67790 + }, + { + "epoch": 9.623846699787084, + "grad_norm": 7.2992939949035645, + "learning_rate": 9.038041163946062e-05, + "loss": 0.028829315304756166, + "step": 67800 + }, + { + "epoch": 9.62526614620298, + "grad_norm": 0.026992222294211388, + "learning_rate": 9.037899219304472e-05, + "loss": 0.04649405777454376, + "step": 67810 + }, + { + "epoch": 9.626685592618879, + "grad_norm": 3.8848114013671875, + "learning_rate": 9.037757274662882e-05, + "loss": 0.04889317154884339, + "step": 67820 + }, + { + "epoch": 9.628105039034777, + "grad_norm": 1.3498327732086182, + "learning_rate": 9.037615330021291e-05, + "loss": 0.027367666363716125, + "step": 67830 + }, + { + "epoch": 9.629524485450673, + "grad_norm": 0.817976713180542, + "learning_rate": 9.037473385379702e-05, + "loss": 0.04165278971195221, + "step": 67840 + }, + { + "epoch": 9.630943931866572, + "grad_norm": 1.5236300230026245, + "learning_rate": 9.037331440738112e-05, + "loss": 
0.019522148370742797, + "step": 67850 + }, + { + "epoch": 9.63236337828247, + "grad_norm": 2.0810041427612305, + "learning_rate": 9.037189496096523e-05, + "loss": 0.03534324169158935, + "step": 67860 + }, + { + "epoch": 9.633782824698368, + "grad_norm": 2.70131778717041, + "learning_rate": 9.037047551454933e-05, + "loss": 0.01612260490655899, + "step": 67870 + }, + { + "epoch": 9.635202271114265, + "grad_norm": 0.48344776034355164, + "learning_rate": 9.036905606813343e-05, + "loss": 0.04688323438167572, + "step": 67880 + }, + { + "epoch": 9.636621717530163, + "grad_norm": 5.539488792419434, + "learning_rate": 9.036763662171754e-05, + "loss": 0.02970455288887024, + "step": 67890 + }, + { + "epoch": 9.638041163946061, + "grad_norm": 0.35806792974472046, + "learning_rate": 9.036621717530164e-05, + "loss": 0.0037241220474243165, + "step": 67900 + }, + { + "epoch": 9.639460610361958, + "grad_norm": 7.574960708618164, + "learning_rate": 9.036479772888575e-05, + "loss": 0.030697906017303468, + "step": 67910 + }, + { + "epoch": 9.640880056777856, + "grad_norm": 3.901747941970825, + "learning_rate": 9.036337828246984e-05, + "loss": 0.030141755938529968, + "step": 67920 + }, + { + "epoch": 9.642299503193755, + "grad_norm": 1.6139155626296997, + "learning_rate": 9.036195883605394e-05, + "loss": 0.04500369131565094, + "step": 67930 + }, + { + "epoch": 9.643718949609653, + "grad_norm": 11.200180053710938, + "learning_rate": 9.036053938963804e-05, + "loss": 0.024948553740978242, + "step": 67940 + }, + { + "epoch": 9.64513839602555, + "grad_norm": 5.555013656616211, + "learning_rate": 9.035911994322215e-05, + "loss": 0.0150326669216156, + "step": 67950 + }, + { + "epoch": 9.646557842441448, + "grad_norm": 0.21982567012310028, + "learning_rate": 9.035770049680625e-05, + "loss": 0.041651162505149844, + "step": 67960 + }, + { + "epoch": 9.647977288857346, + "grad_norm": 4.405213356018066, + "learning_rate": 9.035628105039036e-05, + "loss": 0.022198933362960815, + "step": 67970 + }, 
+ { + "epoch": 9.649396735273243, + "grad_norm": 0.15988144278526306, + "learning_rate": 9.035486160397446e-05, + "loss": 0.025725823640823365, + "step": 67980 + }, + { + "epoch": 9.650816181689141, + "grad_norm": 0.0842059925198555, + "learning_rate": 9.035344215755855e-05, + "loss": 0.025674444437026978, + "step": 67990 + }, + { + "epoch": 9.65223562810504, + "grad_norm": 0.30113139748573303, + "learning_rate": 9.035202271114266e-05, + "loss": 0.05994483232498169, + "step": 68000 + }, + { + "epoch": 9.65223562810504, + "eval_accuracy": 0.9802250906085077, + "eval_loss": 0.06837733089923859, + "eval_runtime": 32.9171, + "eval_samples_per_second": 477.777, + "eval_steps_per_second": 14.947, + "step": 68000 + }, + { + "epoch": 9.653655074520938, + "grad_norm": 0.037046462297439575, + "learning_rate": 9.035060326472676e-05, + "loss": 0.015805913507938384, + "step": 68010 + }, + { + "epoch": 9.655074520936834, + "grad_norm": 0.9824765920639038, + "learning_rate": 9.034918381831087e-05, + "loss": 0.021517379581928252, + "step": 68020 + }, + { + "epoch": 9.656493967352732, + "grad_norm": 0.04214629903435707, + "learning_rate": 9.034776437189496e-05, + "loss": 0.04769000113010406, + "step": 68030 + }, + { + "epoch": 9.65791341376863, + "grad_norm": 0.6568806171417236, + "learning_rate": 9.034634492547907e-05, + "loss": 0.01876375675201416, + "step": 68040 + }, + { + "epoch": 9.659332860184527, + "grad_norm": 2.5729522705078125, + "learning_rate": 9.034492547906316e-05, + "loss": 0.014666444063186646, + "step": 68050 + }, + { + "epoch": 9.660752306600425, + "grad_norm": 0.447099506855011, + "learning_rate": 9.034350603264728e-05, + "loss": 0.07560728788375855, + "step": 68060 + }, + { + "epoch": 9.662171753016324, + "grad_norm": 12.226578712463379, + "learning_rate": 9.034208658623137e-05, + "loss": 0.04010969698429108, + "step": 68070 + }, + { + "epoch": 9.663591199432222, + "grad_norm": 6.650430202484131, + "learning_rate": 9.034066713981547e-05, + "loss": 
0.046042519807815555, + "step": 68080 + }, + { + "epoch": 9.665010645848119, + "grad_norm": 5.328957557678223, + "learning_rate": 9.033924769339958e-05, + "loss": 0.02840046286582947, + "step": 68090 + }, + { + "epoch": 9.666430092264017, + "grad_norm": 3.466201066970825, + "learning_rate": 9.033782824698368e-05, + "loss": 0.028611963987350462, + "step": 68100 + }, + { + "epoch": 9.667849538679915, + "grad_norm": 4.610507965087891, + "learning_rate": 9.033640880056779e-05, + "loss": 0.061683553457260135, + "step": 68110 + }, + { + "epoch": 9.669268985095812, + "grad_norm": 4.94869327545166, + "learning_rate": 9.033498935415189e-05, + "loss": 0.03219112753868103, + "step": 68120 + }, + { + "epoch": 9.67068843151171, + "grad_norm": 0.040585510432720184, + "learning_rate": 9.033356990773598e-05, + "loss": 0.08483388423919677, + "step": 68130 + }, + { + "epoch": 9.672107877927608, + "grad_norm": 0.7158527970314026, + "learning_rate": 9.033215046132008e-05, + "loss": 0.012271341681480408, + "step": 68140 + }, + { + "epoch": 9.673527324343507, + "grad_norm": 1.417113184928894, + "learning_rate": 9.033073101490419e-05, + "loss": 0.027994108200073243, + "step": 68150 + }, + { + "epoch": 9.674946770759403, + "grad_norm": 0.13532933592796326, + "learning_rate": 9.032931156848829e-05, + "loss": 0.0153349369764328, + "step": 68160 + }, + { + "epoch": 9.676366217175302, + "grad_norm": 6.49558687210083, + "learning_rate": 9.03278921220724e-05, + "loss": 0.05084382295608521, + "step": 68170 + }, + { + "epoch": 9.6777856635912, + "grad_norm": 3.603513717651367, + "learning_rate": 9.03264726756565e-05, + "loss": 0.02989896833896637, + "step": 68180 + }, + { + "epoch": 9.679205110007096, + "grad_norm": 6.750855922698975, + "learning_rate": 9.03250532292406e-05, + "loss": 0.04880207479000091, + "step": 68190 + }, + { + "epoch": 9.680624556422995, + "grad_norm": 0.35573068261146545, + "learning_rate": 9.03236337828247e-05, + "loss": 0.023963749408721924, + "step": 68200 + }, + { + 
"epoch": 9.682044002838893, + "grad_norm": 3.9779114723205566, + "learning_rate": 9.03222143364088e-05, + "loss": 0.012136822938919068, + "step": 68210 + }, + { + "epoch": 9.683463449254791, + "grad_norm": 3.566655397415161, + "learning_rate": 9.032079488999291e-05, + "loss": 0.030810701847076415, + "step": 68220 + }, + { + "epoch": 9.684882895670688, + "grad_norm": 0.5124838948249817, + "learning_rate": 9.031937544357701e-05, + "loss": 0.032623404264450075, + "step": 68230 + }, + { + "epoch": 9.686302342086586, + "grad_norm": 0.8495200872421265, + "learning_rate": 9.031795599716111e-05, + "loss": 0.04098401665687561, + "step": 68240 + }, + { + "epoch": 9.687721788502484, + "grad_norm": 2.748220205307007, + "learning_rate": 9.03165365507452e-05, + "loss": 0.038351207971572876, + "step": 68250 + }, + { + "epoch": 9.689141234918381, + "grad_norm": 2.5813050270080566, + "learning_rate": 9.031511710432932e-05, + "loss": 0.027606451511383058, + "step": 68260 + }, + { + "epoch": 9.69056068133428, + "grad_norm": 0.36798134446144104, + "learning_rate": 9.031369765791342e-05, + "loss": 0.041548147797584534, + "step": 68270 + }, + { + "epoch": 9.691980127750178, + "grad_norm": 0.32126396894454956, + "learning_rate": 9.031227821149753e-05, + "loss": 0.013662393391132354, + "step": 68280 + }, + { + "epoch": 9.693399574166076, + "grad_norm": 0.9041917324066162, + "learning_rate": 9.031085876508162e-05, + "loss": 0.030037564039230347, + "step": 68290 + }, + { + "epoch": 9.694819020581972, + "grad_norm": 15.671060562133789, + "learning_rate": 9.030943931866572e-05, + "loss": 0.04284512996673584, + "step": 68300 + }, + { + "epoch": 9.69623846699787, + "grad_norm": 2.6367321014404297, + "learning_rate": 9.030801987224983e-05, + "loss": 0.024413591623306273, + "step": 68310 + }, + { + "epoch": 9.697657913413769, + "grad_norm": 0.5867029428482056, + "learning_rate": 9.030660042583393e-05, + "loss": 0.09276617169380189, + "step": 68320 + }, + { + "epoch": 9.699077359829666, + 
"grad_norm": 0.19113053381443024, + "learning_rate": 9.030518097941804e-05, + "loss": 0.016789191961288454, + "step": 68330 + }, + { + "epoch": 9.700496806245564, + "grad_norm": 5.9239654541015625, + "learning_rate": 9.030376153300212e-05, + "loss": 0.015049314498901368, + "step": 68340 + }, + { + "epoch": 9.701916252661462, + "grad_norm": 1.9666008949279785, + "learning_rate": 9.030234208658623e-05, + "loss": 0.034362810850143435, + "step": 68350 + }, + { + "epoch": 9.70333569907736, + "grad_norm": 0.094641774892807, + "learning_rate": 9.030092264017033e-05, + "loss": 0.024285507202148438, + "step": 68360 + }, + { + "epoch": 9.704755145493257, + "grad_norm": 0.024246973916888237, + "learning_rate": 9.029950319375444e-05, + "loss": 0.04042229950428009, + "step": 68370 + }, + { + "epoch": 9.706174591909155, + "grad_norm": 11.903240203857422, + "learning_rate": 9.029808374733854e-05, + "loss": 0.05841625332832336, + "step": 68380 + }, + { + "epoch": 9.707594038325054, + "grad_norm": 8.026371955871582, + "learning_rate": 9.029666430092264e-05, + "loss": 0.0390622079372406, + "step": 68390 + }, + { + "epoch": 9.70901348474095, + "grad_norm": 0.35515421628952026, + "learning_rate": 9.029524485450675e-05, + "loss": 0.04484111368656159, + "step": 68400 + }, + { + "epoch": 9.710432931156848, + "grad_norm": 0.31352072954177856, + "learning_rate": 9.029382540809085e-05, + "loss": 0.024947115778923036, + "step": 68410 + }, + { + "epoch": 9.711852377572747, + "grad_norm": 0.3414263427257538, + "learning_rate": 9.029240596167496e-05, + "loss": 0.007836203277111053, + "step": 68420 + }, + { + "epoch": 9.713271823988645, + "grad_norm": 1.5273081064224243, + "learning_rate": 9.029098651525905e-05, + "loss": 0.018830813467502594, + "step": 68430 + }, + { + "epoch": 9.714691270404542, + "grad_norm": 0.6126468181610107, + "learning_rate": 9.028956706884315e-05, + "loss": 0.01927516460418701, + "step": 68440 + }, + { + "epoch": 9.71611071682044, + "grad_norm": 0.3930172026157379, + 
"learning_rate": 9.028814762242725e-05, + "loss": 0.01516784429550171, + "step": 68450 + }, + { + "epoch": 9.717530163236338, + "grad_norm": 0.051625289022922516, + "learning_rate": 9.028672817601136e-05, + "loss": 0.015130971372127534, + "step": 68460 + }, + { + "epoch": 9.718949609652235, + "grad_norm": 14.71915340423584, + "learning_rate": 9.028530872959546e-05, + "loss": 0.05186105966567993, + "step": 68470 + }, + { + "epoch": 9.720369056068133, + "grad_norm": 1.3532153367996216, + "learning_rate": 9.028388928317957e-05, + "loss": 0.04229007363319397, + "step": 68480 + }, + { + "epoch": 9.721788502484031, + "grad_norm": 11.942264556884766, + "learning_rate": 9.028246983676367e-05, + "loss": 0.04268681704998016, + "step": 68490 + }, + { + "epoch": 9.72320794889993, + "grad_norm": 9.604484558105469, + "learning_rate": 9.028105039034776e-05, + "loss": 0.0274705708026886, + "step": 68500 + }, + { + "epoch": 9.72320794889993, + "eval_accuracy": 0.9760284860431105, + "eval_loss": 0.07992067188024521, + "eval_runtime": 32.7675, + "eval_samples_per_second": 479.957, + "eval_steps_per_second": 15.015, + "step": 68500 + }, + { + "epoch": 9.724627395315826, + "grad_norm": 0.07708755880594254, + "learning_rate": 9.027963094393187e-05, + "loss": 0.054282760620117186, + "step": 68510 + }, + { + "epoch": 9.726046841731725, + "grad_norm": 0.1621921807527542, + "learning_rate": 9.027821149751597e-05, + "loss": 0.05011662244796753, + "step": 68520 + }, + { + "epoch": 9.727466288147623, + "grad_norm": 3.5815699100494385, + "learning_rate": 9.027679205110008e-05, + "loss": 0.022197265923023225, + "step": 68530 + }, + { + "epoch": 9.72888573456352, + "grad_norm": 3.2508788108825684, + "learning_rate": 9.027537260468418e-05, + "loss": 0.06533399224281311, + "step": 68540 + }, + { + "epoch": 9.730305180979418, + "grad_norm": 1.647470474243164, + "learning_rate": 9.027395315826828e-05, + "loss": 0.01592020094394684, + "step": 68550 + }, + { + "epoch": 9.731724627395316, + "grad_norm": 
8.036884307861328, + "learning_rate": 9.027253371185237e-05, + "loss": 0.029971572756767272, + "step": 68560 + }, + { + "epoch": 9.733144073811214, + "grad_norm": 1.3174266815185547, + "learning_rate": 9.027111426543649e-05, + "loss": 0.03688263297080994, + "step": 68570 + }, + { + "epoch": 9.73456352022711, + "grad_norm": 0.30061954259872437, + "learning_rate": 9.026969481902058e-05, + "loss": 0.01946229040622711, + "step": 68580 + }, + { + "epoch": 9.735982966643009, + "grad_norm": 0.08620461076498032, + "learning_rate": 9.02682753726047e-05, + "loss": 0.027223414182662962, + "step": 68590 + }, + { + "epoch": 9.737402413058907, + "grad_norm": 9.125005722045898, + "learning_rate": 9.026685592618879e-05, + "loss": 0.03019559383392334, + "step": 68600 + }, + { + "epoch": 9.738821859474804, + "grad_norm": 0.14172478020191193, + "learning_rate": 9.026543647977289e-05, + "loss": 0.04983446002006531, + "step": 68610 + }, + { + "epoch": 9.740241305890702, + "grad_norm": 0.6675568222999573, + "learning_rate": 9.0264017033357e-05, + "loss": 0.026645490527153017, + "step": 68620 + }, + { + "epoch": 9.7416607523066, + "grad_norm": 6.146869659423828, + "learning_rate": 9.02625975869411e-05, + "loss": 0.02890416979789734, + "step": 68630 + }, + { + "epoch": 9.743080198722499, + "grad_norm": 6.365429878234863, + "learning_rate": 9.026117814052521e-05, + "loss": 0.05891613364219665, + "step": 68640 + }, + { + "epoch": 9.744499645138395, + "grad_norm": 0.1611870676279068, + "learning_rate": 9.025975869410929e-05, + "loss": 0.01408046782016754, + "step": 68650 + }, + { + "epoch": 9.745919091554294, + "grad_norm": 0.32437679171562195, + "learning_rate": 9.02583392476934e-05, + "loss": 0.056261765956878665, + "step": 68660 + }, + { + "epoch": 9.747338537970192, + "grad_norm": 8.895389556884766, + "learning_rate": 9.02569198012775e-05, + "loss": 0.06496468782424927, + "step": 68670 + }, + { + "epoch": 9.748757984386089, + "grad_norm": 0.5776633024215698, + "learning_rate": 
9.025550035486161e-05, + "loss": 0.03514551818370819, + "step": 68680 + }, + { + "epoch": 9.750177430801987, + "grad_norm": 7.958167552947998, + "learning_rate": 9.025408090844572e-05, + "loss": 0.10595873594284058, + "step": 68690 + }, + { + "epoch": 9.751596877217885, + "grad_norm": 2.3124489784240723, + "learning_rate": 9.02526614620298e-05, + "loss": 0.0070076905190944675, + "step": 68700 + }, + { + "epoch": 9.753016323633783, + "grad_norm": 8.000714302062988, + "learning_rate": 9.025124201561392e-05, + "loss": 0.061643147468566896, + "step": 68710 + }, + { + "epoch": 9.75443577004968, + "grad_norm": 6.112409591674805, + "learning_rate": 9.024982256919801e-05, + "loss": 0.02671927809715271, + "step": 68720 + }, + { + "epoch": 9.755855216465578, + "grad_norm": 0.09724577516317368, + "learning_rate": 9.024840312278212e-05, + "loss": 0.011995351314544678, + "step": 68730 + }, + { + "epoch": 9.757274662881477, + "grad_norm": 0.11631211638450623, + "learning_rate": 9.024698367636622e-05, + "loss": 0.026068294048309328, + "step": 68740 + }, + { + "epoch": 9.758694109297373, + "grad_norm": 5.72356653213501, + "learning_rate": 9.024556422995032e-05, + "loss": 0.04342763423919678, + "step": 68750 + }, + { + "epoch": 9.760113555713271, + "grad_norm": 0.0995788425207138, + "learning_rate": 9.024414478353442e-05, + "loss": 0.04173007309436798, + "step": 68760 + }, + { + "epoch": 9.76153300212917, + "grad_norm": 0.45903313159942627, + "learning_rate": 9.024272533711853e-05, + "loss": 0.046264901757240295, + "step": 68770 + }, + { + "epoch": 9.762952448545068, + "grad_norm": 0.18278242647647858, + "learning_rate": 9.024130589070264e-05, + "loss": 0.015135698020458221, + "step": 68780 + }, + { + "epoch": 9.764371894960965, + "grad_norm": 11.565159797668457, + "learning_rate": 9.023988644428674e-05, + "loss": 0.025888818502426147, + "step": 68790 + }, + { + "epoch": 9.765791341376863, + "grad_norm": 10.3805570602417, + "learning_rate": 9.023846699787083e-05, + "loss": 
0.0638617753982544, + "step": 68800 + }, + { + "epoch": 9.767210787792761, + "grad_norm": 4.832674980163574, + "learning_rate": 9.023704755145493e-05, + "loss": 0.027055513858795167, + "step": 68810 + }, + { + "epoch": 9.768630234208658, + "grad_norm": 6.372629165649414, + "learning_rate": 9.023562810503904e-05, + "loss": 0.04173833131790161, + "step": 68820 + }, + { + "epoch": 9.770049680624556, + "grad_norm": 3.9561760425567627, + "learning_rate": 9.023420865862314e-05, + "loss": 0.028818246722221375, + "step": 68830 + }, + { + "epoch": 9.771469127040454, + "grad_norm": 0.03159647807478905, + "learning_rate": 9.023278921220725e-05, + "loss": 0.05534324049949646, + "step": 68840 + }, + { + "epoch": 9.772888573456353, + "grad_norm": 0.6982274055480957, + "learning_rate": 9.023136976579133e-05, + "loss": 0.0524729311466217, + "step": 68850 + }, + { + "epoch": 9.77430801987225, + "grad_norm": 0.20039992034435272, + "learning_rate": 9.022995031937544e-05, + "loss": 0.11307457685470582, + "step": 68860 + }, + { + "epoch": 9.775727466288147, + "grad_norm": 0.5845163464546204, + "learning_rate": 9.022853087295956e-05, + "loss": 0.03681440949440003, + "step": 68870 + }, + { + "epoch": 9.777146912704046, + "grad_norm": 0.35227110981941223, + "learning_rate": 9.022711142654365e-05, + "loss": 0.05711947083473205, + "step": 68880 + }, + { + "epoch": 9.778566359119942, + "grad_norm": 0.7870414853096008, + "learning_rate": 9.022569198012776e-05, + "loss": 0.08433527350425721, + "step": 68890 + }, + { + "epoch": 9.77998580553584, + "grad_norm": 4.1868085861206055, + "learning_rate": 9.022427253371186e-05, + "loss": 0.03198896646499634, + "step": 68900 + }, + { + "epoch": 9.781405251951739, + "grad_norm": 3.233597755432129, + "learning_rate": 9.022285308729596e-05, + "loss": 0.019787636399269105, + "step": 68910 + }, + { + "epoch": 9.782824698367637, + "grad_norm": 0.9820303916931152, + "learning_rate": 9.022143364088006e-05, + "loss": 0.05114408135414124, + "step": 68920 + }, + 
{ + "epoch": 9.784244144783534, + "grad_norm": 2.048151731491089, + "learning_rate": 9.022001419446417e-05, + "loss": 0.028573969006538393, + "step": 68930 + }, + { + "epoch": 9.785663591199432, + "grad_norm": 6.333803176879883, + "learning_rate": 9.021859474804826e-05, + "loss": 0.04720070958137512, + "step": 68940 + }, + { + "epoch": 9.78708303761533, + "grad_norm": 4.3006768226623535, + "learning_rate": 9.021717530163238e-05, + "loss": 0.028902134299278258, + "step": 68950 + }, + { + "epoch": 9.788502484031227, + "grad_norm": 0.016134673729538918, + "learning_rate": 9.021575585521647e-05, + "loss": 0.06601372957229615, + "step": 68960 + }, + { + "epoch": 9.789921930447125, + "grad_norm": 0.956045925617218, + "learning_rate": 9.021433640880057e-05, + "loss": 0.007774467766284943, + "step": 68970 + }, + { + "epoch": 9.791341376863024, + "grad_norm": 0.9125019311904907, + "learning_rate": 9.021291696238468e-05, + "loss": 0.017521186172962187, + "step": 68980 + }, + { + "epoch": 9.792760823278922, + "grad_norm": 0.5352014899253845, + "learning_rate": 9.021149751596878e-05, + "loss": 0.04417323470115662, + "step": 68990 + }, + { + "epoch": 9.794180269694818, + "grad_norm": 2.805211305618286, + "learning_rate": 9.021007806955289e-05, + "loss": 0.013200858235359192, + "step": 69000 + }, + { + "epoch": 9.794180269694818, + "eval_accuracy": 0.9781903732434667, + "eval_loss": 0.07315292954444885, + "eval_runtime": 33.3338, + "eval_samples_per_second": 471.804, + "eval_steps_per_second": 14.76, + "step": 69000 + }, + { + "epoch": 9.795599716110717, + "grad_norm": 5.417062282562256, + "learning_rate": 9.020865862313697e-05, + "loss": 0.03514570593833923, + "step": 69010 + }, + { + "epoch": 9.797019162526615, + "grad_norm": 16.58610725402832, + "learning_rate": 9.020723917672108e-05, + "loss": 0.048204198479652405, + "step": 69020 + }, + { + "epoch": 9.798438608942512, + "grad_norm": 1.3934992551803589, + "learning_rate": 9.020581973030518e-05, + "loss": 0.03688859939575195, 
+ "step": 69030 + }, + { + "epoch": 9.79985805535841, + "grad_norm": 2.5325000286102295, + "learning_rate": 9.020440028388929e-05, + "loss": 0.060921496152877806, + "step": 69040 + }, + { + "epoch": 9.801277501774308, + "grad_norm": 2.1352107524871826, + "learning_rate": 9.020298083747339e-05, + "loss": 0.05504473447799683, + "step": 69050 + }, + { + "epoch": 9.802696948190206, + "grad_norm": 0.06280253082513809, + "learning_rate": 9.020156139105749e-05, + "loss": 0.05045873522758484, + "step": 69060 + }, + { + "epoch": 9.804116394606103, + "grad_norm": 2.113204002380371, + "learning_rate": 9.02001419446416e-05, + "loss": 0.027818793058395387, + "step": 69070 + }, + { + "epoch": 9.805535841022001, + "grad_norm": 0.8788726329803467, + "learning_rate": 9.01987224982257e-05, + "loss": 0.0661763608455658, + "step": 69080 + }, + { + "epoch": 9.8069552874379, + "grad_norm": 1.3066922426223755, + "learning_rate": 9.01973030518098e-05, + "loss": 0.013252317905426025, + "step": 69090 + }, + { + "epoch": 9.808374733853796, + "grad_norm": 5.125593662261963, + "learning_rate": 9.01958836053939e-05, + "loss": 0.016374337673187255, + "step": 69100 + }, + { + "epoch": 9.809794180269694, + "grad_norm": 1.2147674560546875, + "learning_rate": 9.0194464158978e-05, + "loss": 0.043211477994918826, + "step": 69110 + }, + { + "epoch": 9.811213626685593, + "grad_norm": 1.1660009622573853, + "learning_rate": 9.01930447125621e-05, + "loss": 0.03999923467636109, + "step": 69120 + }, + { + "epoch": 9.812633073101491, + "grad_norm": 0.08482635766267776, + "learning_rate": 9.019162526614621e-05, + "loss": 0.005695971474051475, + "step": 69130 + }, + { + "epoch": 9.814052519517388, + "grad_norm": 6.350915431976318, + "learning_rate": 9.019020581973031e-05, + "loss": 0.03713645339012146, + "step": 69140 + }, + { + "epoch": 9.815471965933286, + "grad_norm": 0.1697952151298523, + "learning_rate": 9.018878637331442e-05, + "loss": 0.05905236601829529, + "step": 69150 + }, + { + "epoch": 
9.816891412349184, + "grad_norm": 6.711185455322266, + "learning_rate": 9.018736692689852e-05, + "loss": 0.04255087375640869, + "step": 69160 + }, + { + "epoch": 9.81831085876508, + "grad_norm": 0.012068729847669601, + "learning_rate": 9.018594748048261e-05, + "loss": 0.03160058557987213, + "step": 69170 + }, + { + "epoch": 9.819730305180979, + "grad_norm": 0.011125321500003338, + "learning_rate": 9.018452803406672e-05, + "loss": 0.040947556495666504, + "step": 69180 + }, + { + "epoch": 9.821149751596877, + "grad_norm": 2.1126627922058105, + "learning_rate": 9.018310858765082e-05, + "loss": 0.09066906571388245, + "step": 69190 + }, + { + "epoch": 9.822569198012776, + "grad_norm": 0.3607144057750702, + "learning_rate": 9.018168914123493e-05, + "loss": 0.03679504990577698, + "step": 69200 + }, + { + "epoch": 9.823988644428672, + "grad_norm": 8.781025886535645, + "learning_rate": 9.018026969481902e-05, + "loss": 0.05192814469337463, + "step": 69210 + }, + { + "epoch": 9.82540809084457, + "grad_norm": 0.5350939035415649, + "learning_rate": 9.017885024840313e-05, + "loss": 0.04593566954135895, + "step": 69220 + }, + { + "epoch": 9.826827537260469, + "grad_norm": 11.871026039123535, + "learning_rate": 9.017743080198722e-05, + "loss": 0.044462653994560244, + "step": 69230 + }, + { + "epoch": 9.828246983676365, + "grad_norm": 0.27279049158096313, + "learning_rate": 9.017601135557133e-05, + "loss": 0.019877782464027403, + "step": 69240 + }, + { + "epoch": 9.829666430092264, + "grad_norm": 1.0789798498153687, + "learning_rate": 9.017459190915543e-05, + "loss": 0.0799859881401062, + "step": 69250 + }, + { + "epoch": 9.831085876508162, + "grad_norm": 1.5498286485671997, + "learning_rate": 9.017317246273954e-05, + "loss": 0.01284824013710022, + "step": 69260 + }, + { + "epoch": 9.83250532292406, + "grad_norm": 0.40647804737091064, + "learning_rate": 9.017175301632364e-05, + "loss": 0.07362874746322631, + "step": 69270 + }, + { + "epoch": 9.833924769339957, + "grad_norm": 
5.722687721252441, + "learning_rate": 9.017033356990774e-05, + "loss": 0.012973059713840485, + "step": 69280 + }, + { + "epoch": 9.835344215755855, + "grad_norm": 4.900279998779297, + "learning_rate": 9.016891412349185e-05, + "loss": 0.03889691233634949, + "step": 69290 + }, + { + "epoch": 9.836763662171753, + "grad_norm": 2.8748764991760254, + "learning_rate": 9.016749467707595e-05, + "loss": 0.02030760645866394, + "step": 69300 + }, + { + "epoch": 9.83818310858765, + "grad_norm": 5.670338153839111, + "learning_rate": 9.016607523066006e-05, + "loss": 0.05160186290740967, + "step": 69310 + }, + { + "epoch": 9.839602555003548, + "grad_norm": 0.7216220498085022, + "learning_rate": 9.016465578424414e-05, + "loss": 0.02686397135257721, + "step": 69320 + }, + { + "epoch": 9.841022001419446, + "grad_norm": 0.06797802448272705, + "learning_rate": 9.016323633782825e-05, + "loss": 0.019428203999996185, + "step": 69330 + }, + { + "epoch": 9.842441447835345, + "grad_norm": 0.248221755027771, + "learning_rate": 9.016181689141235e-05, + "loss": 0.015675346553325652, + "step": 69340 + }, + { + "epoch": 9.843860894251241, + "grad_norm": 9.568998336791992, + "learning_rate": 9.016039744499646e-05, + "loss": 0.06899868249893189, + "step": 69350 + }, + { + "epoch": 9.84528034066714, + "grad_norm": 0.03419743478298187, + "learning_rate": 9.015897799858056e-05, + "loss": 0.03135853111743927, + "step": 69360 + }, + { + "epoch": 9.846699787083038, + "grad_norm": 0.8306211829185486, + "learning_rate": 9.015755855216465e-05, + "loss": 0.021877869963645935, + "step": 69370 + }, + { + "epoch": 9.848119233498934, + "grad_norm": 7.840019226074219, + "learning_rate": 9.015613910574877e-05, + "loss": 0.06136330366134644, + "step": 69380 + }, + { + "epoch": 9.849538679914833, + "grad_norm": 6.342380523681641, + "learning_rate": 9.015471965933286e-05, + "loss": 0.012593349814414978, + "step": 69390 + }, + { + "epoch": 9.850958126330731, + "grad_norm": 11.976713180541992, + "learning_rate": 
9.015330021291697e-05, + "loss": 0.05724484324455261, + "step": 69400 + }, + { + "epoch": 9.85237757274663, + "grad_norm": 1.395046353340149, + "learning_rate": 9.015188076650107e-05, + "loss": 0.019671787321567536, + "step": 69410 + }, + { + "epoch": 9.853797019162526, + "grad_norm": 2.9691336154937744, + "learning_rate": 9.015046132008517e-05, + "loss": 0.022539976239204406, + "step": 69420 + }, + { + "epoch": 9.855216465578424, + "grad_norm": 0.34182435274124146, + "learning_rate": 9.014904187366927e-05, + "loss": 0.02716274857521057, + "step": 69430 + }, + { + "epoch": 9.856635911994323, + "grad_norm": 0.361360102891922, + "learning_rate": 9.014762242725338e-05, + "loss": 0.021220579743385315, + "step": 69440 + }, + { + "epoch": 9.858055358410219, + "grad_norm": 0.6431921720504761, + "learning_rate": 9.014620298083747e-05, + "loss": 0.024566707015037537, + "step": 69450 + }, + { + "epoch": 9.859474804826117, + "grad_norm": 0.2722751200199127, + "learning_rate": 9.014478353442159e-05, + "loss": 0.03161434531211853, + "step": 69460 + }, + { + "epoch": 9.860894251242016, + "grad_norm": 0.27604079246520996, + "learning_rate": 9.014336408800568e-05, + "loss": 0.04619441032409668, + "step": 69470 + }, + { + "epoch": 9.862313697657914, + "grad_norm": 8.648920059204102, + "learning_rate": 9.014194464158978e-05, + "loss": 0.09676159620285034, + "step": 69480 + }, + { + "epoch": 9.86373314407381, + "grad_norm": 0.4807116687297821, + "learning_rate": 9.014052519517389e-05, + "loss": 0.04626038074493408, + "step": 69490 + }, + { + "epoch": 9.865152590489709, + "grad_norm": 0.0991576537489891, + "learning_rate": 9.013910574875799e-05, + "loss": 0.044435915350914, + "step": 69500 + }, + { + "epoch": 9.865152590489709, + "eval_accuracy": 0.9850575443504801, + "eval_loss": 0.04669315740466118, + "eval_runtime": 33.0656, + "eval_samples_per_second": 475.63, + "eval_steps_per_second": 14.88, + "step": 69500 + }, + { + "epoch": 9.866572036905607, + "grad_norm": 
0.023003293201327324, + "learning_rate": 9.01376863023421e-05, + "loss": 0.023037827014923094, + "step": 69510 + }, + { + "epoch": 9.867991483321505, + "grad_norm": 0.8228864669799805, + "learning_rate": 9.013640880056778e-05, + "loss": 0.08946434259414673, + "step": 69520 + }, + { + "epoch": 9.869410929737402, + "grad_norm": 2.343458414077759, + "learning_rate": 9.01349893541519e-05, + "loss": 0.022932544350624084, + "step": 69530 + }, + { + "epoch": 9.8708303761533, + "grad_norm": 1.5184394121170044, + "learning_rate": 9.013356990773598e-05, + "loss": 0.010104528069496155, + "step": 69540 + }, + { + "epoch": 9.872249822569199, + "grad_norm": 0.2630665600299835, + "learning_rate": 9.013215046132009e-05, + "loss": 0.024105256795883177, + "step": 69550 + }, + { + "epoch": 9.873669268985095, + "grad_norm": 0.013559470884501934, + "learning_rate": 9.013073101490419e-05, + "loss": 0.02312069237232208, + "step": 69560 + }, + { + "epoch": 9.875088715400993, + "grad_norm": 0.2757904827594757, + "learning_rate": 9.01293115684883e-05, + "loss": 0.03828359246253967, + "step": 69570 + }, + { + "epoch": 9.876508161816892, + "grad_norm": 1.3558392524719238, + "learning_rate": 9.01278921220724e-05, + "loss": 0.0364987313747406, + "step": 69580 + }, + { + "epoch": 9.87792760823279, + "grad_norm": 0.9141183495521545, + "learning_rate": 9.01264726756565e-05, + "loss": 0.05945742130279541, + "step": 69590 + }, + { + "epoch": 9.879347054648687, + "grad_norm": 10.217071533203125, + "learning_rate": 9.012505322924059e-05, + "loss": 0.0646911084651947, + "step": 69600 + }, + { + "epoch": 9.880766501064585, + "grad_norm": 1.882594108581543, + "learning_rate": 9.01236337828247e-05, + "loss": 0.013017700612545013, + "step": 69610 + }, + { + "epoch": 9.882185947480483, + "grad_norm": 1.8000305891036987, + "learning_rate": 9.012221433640881e-05, + "loss": 0.02973770201206207, + "step": 69620 + }, + { + "epoch": 9.88360539389638, + "grad_norm": 2.1482441425323486, + "learning_rate": 
9.012079488999291e-05, + "loss": 0.043613281846046445, + "step": 69630 + }, + { + "epoch": 9.885024840312278, + "grad_norm": 1.8047417402267456, + "learning_rate": 9.011937544357702e-05, + "loss": 0.010743890702724457, + "step": 69640 + }, + { + "epoch": 9.886444286728176, + "grad_norm": 0.356742799282074, + "learning_rate": 9.01179559971611e-05, + "loss": 0.04975705146789551, + "step": 69650 + }, + { + "epoch": 9.887863733144075, + "grad_norm": 0.8651915788650513, + "learning_rate": 9.011653655074522e-05, + "loss": 0.03396418988704682, + "step": 69660 + }, + { + "epoch": 9.889283179559971, + "grad_norm": 5.231709957122803, + "learning_rate": 9.011511710432931e-05, + "loss": 0.035730010271072386, + "step": 69670 + }, + { + "epoch": 9.89070262597587, + "grad_norm": 19.371747970581055, + "learning_rate": 9.011369765791342e-05, + "loss": 0.035460355877876285, + "step": 69680 + }, + { + "epoch": 9.892122072391768, + "grad_norm": 5.615591526031494, + "learning_rate": 9.011227821149752e-05, + "loss": 0.06311107277870179, + "step": 69690 + }, + { + "epoch": 9.893541518807664, + "grad_norm": 0.6916624307632446, + "learning_rate": 9.011085876508162e-05, + "loss": 0.01275596022605896, + "step": 69700 + }, + { + "epoch": 9.894960965223563, + "grad_norm": 0.03846505656838417, + "learning_rate": 9.010943931866573e-05, + "loss": 0.019737032055854798, + "step": 69710 + }, + { + "epoch": 9.896380411639461, + "grad_norm": 12.611459732055664, + "learning_rate": 9.010801987224983e-05, + "loss": 0.025944510102272035, + "step": 69720 + }, + { + "epoch": 9.89779985805536, + "grad_norm": 0.03990490362048149, + "learning_rate": 9.010660042583394e-05, + "loss": 0.05266411900520325, + "step": 69730 + }, + { + "epoch": 9.899219304471256, + "grad_norm": 8.41258716583252, + "learning_rate": 9.010518097941804e-05, + "loss": 0.03398913741111755, + "step": 69740 + }, + { + "epoch": 9.900638750887154, + "grad_norm": 0.4625275731086731, + "learning_rate": 9.010376153300213e-05, + "loss": 
0.028921571373939515, + "step": 69750 + }, + { + "epoch": 9.902058197303052, + "grad_norm": 2.8440163135528564, + "learning_rate": 9.010234208658623e-05, + "loss": 0.026828548312187193, + "step": 69760 + }, + { + "epoch": 9.903477643718949, + "grad_norm": 0.30509254336357117, + "learning_rate": 9.010092264017034e-05, + "loss": 0.02407469302415848, + "step": 69770 + }, + { + "epoch": 9.904897090134847, + "grad_norm": 0.16248999536037445, + "learning_rate": 9.009950319375444e-05, + "loss": 0.032114657759666446, + "step": 69780 + }, + { + "epoch": 9.906316536550746, + "grad_norm": 9.057454109191895, + "learning_rate": 9.009808374733855e-05, + "loss": 0.05952262282371521, + "step": 69790 + }, + { + "epoch": 9.907735982966644, + "grad_norm": 0.25342628359794617, + "learning_rate": 9.009666430092265e-05, + "loss": 0.012016575783491135, + "step": 69800 + }, + { + "epoch": 9.90915542938254, + "grad_norm": 5.12114143371582, + "learning_rate": 9.009524485450674e-05, + "loss": 0.018399667739868165, + "step": 69810 + }, + { + "epoch": 9.910574875798439, + "grad_norm": 0.6642940044403076, + "learning_rate": 9.009382540809085e-05, + "loss": 0.023203255236148836, + "step": 69820 + }, + { + "epoch": 9.911994322214337, + "grad_norm": 0.4026901423931122, + "learning_rate": 9.009240596167495e-05, + "loss": 0.03522194027900696, + "step": 69830 + }, + { + "epoch": 9.913413768630233, + "grad_norm": 0.011791981756687164, + "learning_rate": 9.009098651525906e-05, + "loss": 0.02517768442630768, + "step": 69840 + }, + { + "epoch": 9.914833215046132, + "grad_norm": 6.152438640594482, + "learning_rate": 9.008956706884315e-05, + "loss": 0.06228979229927063, + "step": 69850 + }, + { + "epoch": 9.91625266146203, + "grad_norm": 0.020824043080210686, + "learning_rate": 9.008814762242726e-05, + "loss": 0.0055544193834066394, + "step": 69860 + }, + { + "epoch": 9.917672107877928, + "grad_norm": 10.714980125427246, + "learning_rate": 9.008672817601136e-05, + "loss": 0.0562599778175354, + "step": 
69870 + }, + { + "epoch": 9.919091554293825, + "grad_norm": 0.71759033203125, + "learning_rate": 9.008530872959547e-05, + "loss": 0.015090197324752808, + "step": 69880 + }, + { + "epoch": 9.920511000709723, + "grad_norm": 7.554685592651367, + "learning_rate": 9.008388928317956e-05, + "loss": 0.04340165555477142, + "step": 69890 + }, + { + "epoch": 9.921930447125622, + "grad_norm": 16.142648696899414, + "learning_rate": 9.008246983676366e-05, + "loss": 0.06557026505470276, + "step": 69900 + }, + { + "epoch": 9.923349893541518, + "grad_norm": 2.8899734020233154, + "learning_rate": 9.008105039034777e-05, + "loss": 0.05298327207565308, + "step": 69910 + }, + { + "epoch": 9.924769339957416, + "grad_norm": 0.2371891736984253, + "learning_rate": 9.007963094393187e-05, + "loss": 0.020034009218215944, + "step": 69920 + }, + { + "epoch": 9.926188786373315, + "grad_norm": 0.7864499688148499, + "learning_rate": 9.007821149751598e-05, + "loss": 0.005942384526133537, + "step": 69930 + }, + { + "epoch": 9.927608232789213, + "grad_norm": 1.570546269416809, + "learning_rate": 9.007679205110008e-05, + "loss": 0.02644781768321991, + "step": 69940 + }, + { + "epoch": 9.92902767920511, + "grad_norm": 0.30083009600639343, + "learning_rate": 9.007537260468419e-05, + "loss": 0.012635472416877746, + "step": 69950 + }, + { + "epoch": 9.930447125621008, + "grad_norm": 0.0356457456946373, + "learning_rate": 9.007395315826827e-05, + "loss": 0.03070034384727478, + "step": 69960 + }, + { + "epoch": 9.931866572036906, + "grad_norm": 0.558660089969635, + "learning_rate": 9.007253371185238e-05, + "loss": 0.020227883756160737, + "step": 69970 + }, + { + "epoch": 9.933286018452803, + "grad_norm": 7.517212390899658, + "learning_rate": 9.007111426543648e-05, + "loss": 0.13844293355941772, + "step": 69980 + }, + { + "epoch": 9.934705464868701, + "grad_norm": 0.020189205184578896, + "learning_rate": 9.006969481902059e-05, + "loss": 0.018152689933776854, + "step": 69990 + }, + { + "epoch": 
9.9361249112846, + "grad_norm": 11.410741806030273, + "learning_rate": 9.006827537260469e-05, + "loss": 0.029291608929634096, + "step": 70000 + }, + { + "epoch": 9.9361249112846, + "eval_accuracy": 0.9816239587969734, + "eval_loss": 0.06005491688847542, + "eval_runtime": 33.3087, + "eval_samples_per_second": 472.159, + "eval_steps_per_second": 14.771, + "step": 70000 + }, + { + "epoch": 9.937544357700498, + "grad_norm": 0.02899947017431259, + "learning_rate": 9.006685592618879e-05, + "loss": 0.05207945704460144, + "step": 70010 + }, + { + "epoch": 9.938963804116394, + "grad_norm": 2.290057420730591, + "learning_rate": 9.00654364797729e-05, + "loss": 0.03104778826236725, + "step": 70020 + }, + { + "epoch": 9.940383250532292, + "grad_norm": 4.646543502807617, + "learning_rate": 9.0064017033357e-05, + "loss": 0.026638334989547728, + "step": 70030 + }, + { + "epoch": 9.94180269694819, + "grad_norm": 0.7747595310211182, + "learning_rate": 9.00625975869411e-05, + "loss": 0.057200342416763306, + "step": 70040 + }, + { + "epoch": 9.943222143364087, + "grad_norm": 13.665855407714844, + "learning_rate": 9.00611781405252e-05, + "loss": 0.041640186309814455, + "step": 70050 + }, + { + "epoch": 9.944641589779986, + "grad_norm": 0.8320010900497437, + "learning_rate": 9.00597586941093e-05, + "loss": 0.03797149658203125, + "step": 70060 + }, + { + "epoch": 9.946061036195884, + "grad_norm": 0.46569153666496277, + "learning_rate": 9.00583392476934e-05, + "loss": 0.018002772331237794, + "step": 70070 + }, + { + "epoch": 9.947480482611782, + "grad_norm": 0.32417356967926025, + "learning_rate": 9.005691980127751e-05, + "loss": 0.07594256997108459, + "step": 70080 + }, + { + "epoch": 9.948899929027679, + "grad_norm": 2.290781259536743, + "learning_rate": 9.00555003548616e-05, + "loss": 0.02159411907196045, + "step": 70090 + }, + { + "epoch": 9.950319375443577, + "grad_norm": 1.5285451412200928, + "learning_rate": 9.005408090844572e-05, + "loss": 0.05258944034576416, + "step": 70100 + }, 
+ { + "epoch": 9.951738821859475, + "grad_norm": 6.222751617431641, + "learning_rate": 9.005266146202981e-05, + "loss": 0.021117933094501495, + "step": 70110 + }, + { + "epoch": 9.953158268275372, + "grad_norm": 6.542359828948975, + "learning_rate": 9.005124201561391e-05, + "loss": 0.03911808133125305, + "step": 70120 + }, + { + "epoch": 9.95457771469127, + "grad_norm": 0.1534150093793869, + "learning_rate": 9.004982256919802e-05, + "loss": 0.10638805627822875, + "step": 70130 + }, + { + "epoch": 9.955997161107168, + "grad_norm": 0.11701802909374237, + "learning_rate": 9.004840312278212e-05, + "loss": 0.01432393044233322, + "step": 70140 + }, + { + "epoch": 9.957416607523067, + "grad_norm": 0.2519495189189911, + "learning_rate": 9.004698367636623e-05, + "loss": 0.03227570950984955, + "step": 70150 + }, + { + "epoch": 9.958836053938963, + "grad_norm": 5.316908359527588, + "learning_rate": 9.004556422995031e-05, + "loss": 0.055128943920135495, + "step": 70160 + }, + { + "epoch": 9.960255500354862, + "grad_norm": 0.15168063342571259, + "learning_rate": 9.004414478353443e-05, + "loss": 0.01621713936328888, + "step": 70170 + }, + { + "epoch": 9.96167494677076, + "grad_norm": 1.0486849546432495, + "learning_rate": 9.004272533711852e-05, + "loss": 0.02327948361635208, + "step": 70180 + }, + { + "epoch": 9.963094393186656, + "grad_norm": 0.1740347445011139, + "learning_rate": 9.004130589070263e-05, + "loss": 0.02720318138599396, + "step": 70190 + }, + { + "epoch": 9.964513839602555, + "grad_norm": 0.25774815678596497, + "learning_rate": 9.003988644428673e-05, + "loss": 0.04783263504505157, + "step": 70200 + }, + { + "epoch": 9.965933286018453, + "grad_norm": 6.590445041656494, + "learning_rate": 9.003846699787083e-05, + "loss": 0.03629983365535736, + "step": 70210 + }, + { + "epoch": 9.967352732434351, + "grad_norm": 1.5156173706054688, + "learning_rate": 9.003704755145494e-05, + "loss": 0.07823289632797241, + "step": 70220 + }, + { + "epoch": 9.968772178850248, + 
"grad_norm": 0.08737379312515259, + "learning_rate": 9.003562810503904e-05, + "loss": 0.01150958240032196, + "step": 70230 + }, + { + "epoch": 9.970191625266146, + "grad_norm": 6.838068962097168, + "learning_rate": 9.003420865862315e-05, + "loss": 0.05327457785606384, + "step": 70240 + }, + { + "epoch": 9.971611071682045, + "grad_norm": 7.426832675933838, + "learning_rate": 9.003278921220725e-05, + "loss": 0.030006197094917298, + "step": 70250 + }, + { + "epoch": 9.973030518097941, + "grad_norm": 5.167942523956299, + "learning_rate": 9.003136976579134e-05, + "loss": 0.08623776435852051, + "step": 70260 + }, + { + "epoch": 9.97444996451384, + "grad_norm": 4.396407604217529, + "learning_rate": 9.002995031937544e-05, + "loss": 0.03299508690834045, + "step": 70270 + }, + { + "epoch": 9.975869410929738, + "grad_norm": 8.55590534210205, + "learning_rate": 9.002853087295955e-05, + "loss": 0.04857953190803528, + "step": 70280 + }, + { + "epoch": 9.977288857345636, + "grad_norm": 0.14077264070510864, + "learning_rate": 9.002711142654365e-05, + "loss": 0.04843473136425018, + "step": 70290 + }, + { + "epoch": 9.978708303761533, + "grad_norm": 11.530470848083496, + "learning_rate": 9.002569198012776e-05, + "loss": 0.042692869901657104, + "step": 70300 + }, + { + "epoch": 9.98012775017743, + "grad_norm": 1.876141905784607, + "learning_rate": 9.002427253371186e-05, + "loss": 0.06830405592918395, + "step": 70310 + }, + { + "epoch": 9.98154719659333, + "grad_norm": 2.4454851150512695, + "learning_rate": 9.002285308729595e-05, + "loss": 0.035274538397789004, + "step": 70320 + }, + { + "epoch": 9.982966643009226, + "grad_norm": 3.194749593734741, + "learning_rate": 9.002143364088006e-05, + "loss": 0.01803991198539734, + "step": 70330 + }, + { + "epoch": 9.984386089425124, + "grad_norm": 0.22273701429367065, + "learning_rate": 9.002001419446416e-05, + "loss": 0.04039207696914673, + "step": 70340 + }, + { + "epoch": 9.985805535841022, + "grad_norm": 6.443853855133057, + 
"learning_rate": 9.001859474804827e-05, + "loss": 0.031891047954559326, + "step": 70350 + }, + { + "epoch": 9.98722498225692, + "grad_norm": 11.825727462768555, + "learning_rate": 9.001717530163237e-05, + "loss": 0.03135543167591095, + "step": 70360 + }, + { + "epoch": 9.988644428672817, + "grad_norm": 1.0881600379943848, + "learning_rate": 9.001575585521647e-05, + "loss": 0.021712112426757812, + "step": 70370 + }, + { + "epoch": 9.990063875088715, + "grad_norm": 0.020460011437535286, + "learning_rate": 9.001433640880057e-05, + "loss": 0.007388191670179367, + "step": 70380 + }, + { + "epoch": 9.991483321504614, + "grad_norm": 0.5041184425354004, + "learning_rate": 9.001291696238468e-05, + "loss": 0.016400963068008423, + "step": 70390 + }, + { + "epoch": 9.99290276792051, + "grad_norm": 11.30899429321289, + "learning_rate": 9.001149751596877e-05, + "loss": 0.04193665981292725, + "step": 70400 + }, + { + "epoch": 9.994322214336409, + "grad_norm": 3.974252700805664, + "learning_rate": 9.001007806955288e-05, + "loss": 0.034135401248931885, + "step": 70410 + }, + { + "epoch": 9.995741660752307, + "grad_norm": 0.414074569940567, + "learning_rate": 9.000865862313698e-05, + "loss": 0.022105370461940766, + "step": 70420 + }, + { + "epoch": 9.997161107168205, + "grad_norm": 4.395871639251709, + "learning_rate": 9.000723917672108e-05, + "loss": 0.04791556596755982, + "step": 70430 + }, + { + "epoch": 9.998580553584102, + "grad_norm": 5.924459934234619, + "learning_rate": 9.000581973030519e-05, + "loss": 0.03285888731479645, + "step": 70440 + }, + { + "epoch": 10.0, + "grad_norm": 12.073928833007812, + "learning_rate": 9.000440028388929e-05, + "loss": 0.023960676789283753, + "step": 70450 + }, + { + "epoch": 10.001419446415898, + "grad_norm": 0.24992632865905762, + "learning_rate": 9.00029808374734e-05, + "loss": 0.028036636114120484, + "step": 70460 + }, + { + "epoch": 10.002838892831795, + "grad_norm": 0.2592790424823761, + "learning_rate": 9.000156139105748e-05, + "loss": 
0.09887195825576782, + "step": 70470 + }, + { + "epoch": 10.004258339247693, + "grad_norm": 0.7219700813293457, + "learning_rate": 9.000014194464159e-05, + "loss": 0.02016732543706894, + "step": 70480 + }, + { + "epoch": 10.005677785663591, + "grad_norm": 0.675056517124176, + "learning_rate": 8.999872249822569e-05, + "loss": 0.04064895510673523, + "step": 70490 + }, + { + "epoch": 10.00709723207949, + "grad_norm": 2.294806718826294, + "learning_rate": 8.99973030518098e-05, + "loss": 0.06091110110282898, + "step": 70500 + }, + { + "epoch": 10.00709723207949, + "eval_accuracy": 0.9842309404209322, + "eval_loss": 0.05429995805025101, + "eval_runtime": 33.3699, + "eval_samples_per_second": 471.293, + "eval_steps_per_second": 14.744, + "step": 70500 + }, + { + "epoch": 10.008516678495386, + "grad_norm": 4.840185642242432, + "learning_rate": 8.99958836053939e-05, + "loss": 0.025154224038124083, + "step": 70510 + }, + { + "epoch": 10.009936124911285, + "grad_norm": 0.6732443571090698, + "learning_rate": 8.9994464158978e-05, + "loss": 0.017550435662269593, + "step": 70520 + }, + { + "epoch": 10.011355571327183, + "grad_norm": 3.3090858459472656, + "learning_rate": 8.999304471256211e-05, + "loss": 0.04830752909183502, + "step": 70530 + }, + { + "epoch": 10.01277501774308, + "grad_norm": 3.5346755981445312, + "learning_rate": 8.99916252661462e-05, + "loss": 0.041277503967285155, + "step": 70540 + }, + { + "epoch": 10.014194464158978, + "grad_norm": 0.1413526087999344, + "learning_rate": 8.999020581973032e-05, + "loss": 0.050315022468566895, + "step": 70550 + }, + { + "epoch": 10.015613910574876, + "grad_norm": 0.13002030551433563, + "learning_rate": 8.998878637331441e-05, + "loss": 0.011780694127082825, + "step": 70560 + }, + { + "epoch": 10.017033356990774, + "grad_norm": 3.0763373374938965, + "learning_rate": 8.998736692689851e-05, + "loss": 0.05392890572547913, + "step": 70570 + }, + { + "epoch": 10.01845280340667, + "grad_norm": 2.0271835327148438, + "learning_rate": 
8.998594748048261e-05, + "loss": 0.03189515769481659, + "step": 70580 + }, + { + "epoch": 10.01987224982257, + "grad_norm": 10.38420295715332, + "learning_rate": 8.998452803406672e-05, + "loss": 0.03412808775901795, + "step": 70590 + }, + { + "epoch": 10.021291696238467, + "grad_norm": 0.4704546630382538, + "learning_rate": 8.998310858765082e-05, + "loss": 0.015590818226337433, + "step": 70600 + }, + { + "epoch": 10.022711142654364, + "grad_norm": 0.04271915927529335, + "learning_rate": 8.998168914123493e-05, + "loss": 0.03360204696655274, + "step": 70610 + }, + { + "epoch": 10.024130589070262, + "grad_norm": 0.23193983733654022, + "learning_rate": 8.998026969481902e-05, + "loss": 0.019712990522384642, + "step": 70620 + }, + { + "epoch": 10.02555003548616, + "grad_norm": 3.272385358810425, + "learning_rate": 8.997885024840312e-05, + "loss": 0.019542354345321655, + "step": 70630 + }, + { + "epoch": 10.026969481902059, + "grad_norm": 0.11366137862205505, + "learning_rate": 8.997743080198723e-05, + "loss": 0.016868501901626587, + "step": 70640 + }, + { + "epoch": 10.028388928317955, + "grad_norm": 1.392044186592102, + "learning_rate": 8.997601135557133e-05, + "loss": 0.05533118844032288, + "step": 70650 + }, + { + "epoch": 10.029808374733854, + "grad_norm": 0.035225555300712585, + "learning_rate": 8.997459190915544e-05, + "loss": 0.014872361719608308, + "step": 70660 + }, + { + "epoch": 10.031227821149752, + "grad_norm": 0.07124058157205582, + "learning_rate": 8.997317246273954e-05, + "loss": 0.028601282835006715, + "step": 70670 + }, + { + "epoch": 10.032647267565649, + "grad_norm": 4.94709587097168, + "learning_rate": 8.997175301632364e-05, + "loss": 0.04412299692630768, + "step": 70680 + }, + { + "epoch": 10.034066713981547, + "grad_norm": 8.149444580078125, + "learning_rate": 8.997033356990773e-05, + "loss": 0.036119335889816286, + "step": 70690 + }, + { + "epoch": 10.035486160397445, + "grad_norm": 6.122970104217529, + "learning_rate": 8.996891412349184e-05, + 
"loss": 0.014646390080451965, + "step": 70700 + }, + { + "epoch": 10.036905606813344, + "grad_norm": 0.029342738911509514, + "learning_rate": 8.996749467707594e-05, + "loss": 0.03428757190704346, + "step": 70710 + }, + { + "epoch": 10.03832505322924, + "grad_norm": 0.2870967388153076, + "learning_rate": 8.996607523066005e-05, + "loss": 0.014554601907730103, + "step": 70720 + }, + { + "epoch": 10.039744499645138, + "grad_norm": 0.4357489049434662, + "learning_rate": 8.996465578424415e-05, + "loss": 0.02026190161705017, + "step": 70730 + }, + { + "epoch": 10.041163946061037, + "grad_norm": 1.4404624700546265, + "learning_rate": 8.996323633782825e-05, + "loss": 0.07623617053031921, + "step": 70740 + }, + { + "epoch": 10.042583392476933, + "grad_norm": 9.76617431640625, + "learning_rate": 8.996181689141236e-05, + "loss": 0.04643445312976837, + "step": 70750 + }, + { + "epoch": 10.044002838892832, + "grad_norm": 0.0906580463051796, + "learning_rate": 8.996039744499646e-05, + "loss": 0.045586833357810976, + "step": 70760 + }, + { + "epoch": 10.04542228530873, + "grad_norm": 0.23739679157733917, + "learning_rate": 8.995897799858057e-05, + "loss": 0.017312943935394287, + "step": 70770 + }, + { + "epoch": 10.046841731724628, + "grad_norm": 0.541393518447876, + "learning_rate": 8.995755855216465e-05, + "loss": 0.013014046847820282, + "step": 70780 + }, + { + "epoch": 10.048261178140525, + "grad_norm": 0.27587655186653137, + "learning_rate": 8.995613910574876e-05, + "loss": 0.0030714210122823717, + "step": 70790 + }, + { + "epoch": 10.049680624556423, + "grad_norm": 0.08097139745950699, + "learning_rate": 8.995471965933286e-05, + "loss": 0.01676403880119324, + "step": 70800 + }, + { + "epoch": 10.051100070972321, + "grad_norm": 10.044310569763184, + "learning_rate": 8.995330021291697e-05, + "loss": 0.011085320264101028, + "step": 70810 + }, + { + "epoch": 10.052519517388218, + "grad_norm": 1.3171991109848022, + "learning_rate": 8.995188076650107e-05, + "loss": 
0.007198108732700348, + "step": 70820 + }, + { + "epoch": 10.053938963804116, + "grad_norm": 0.6077939867973328, + "learning_rate": 8.995046132008516e-05, + "loss": 0.019241389632225037, + "step": 70830 + }, + { + "epoch": 10.055358410220014, + "grad_norm": 1.554423213005066, + "learning_rate": 8.994904187366927e-05, + "loss": 0.009505119919776917, + "step": 70840 + }, + { + "epoch": 10.056777856635913, + "grad_norm": 0.0959242656826973, + "learning_rate": 8.994762242725337e-05, + "loss": 0.01176130622625351, + "step": 70850 + }, + { + "epoch": 10.05819730305181, + "grad_norm": 19.056350708007812, + "learning_rate": 8.994620298083748e-05, + "loss": 0.039687898755073545, + "step": 70860 + }, + { + "epoch": 10.059616749467708, + "grad_norm": 0.17465567588806152, + "learning_rate": 8.994478353442158e-05, + "loss": 0.010702335834503173, + "step": 70870 + }, + { + "epoch": 10.061036195883606, + "grad_norm": 1.6593632698059082, + "learning_rate": 8.994336408800568e-05, + "loss": 0.016061322391033174, + "step": 70880 + }, + { + "epoch": 10.062455642299502, + "grad_norm": 14.231645584106445, + "learning_rate": 8.994194464158978e-05, + "loss": 0.04368197321891785, + "step": 70890 + }, + { + "epoch": 10.0638750887154, + "grad_norm": 1.333464503288269, + "learning_rate": 8.994052519517389e-05, + "loss": 0.012594687938690185, + "step": 70900 + }, + { + "epoch": 10.065294535131299, + "grad_norm": 11.049664497375488, + "learning_rate": 8.993910574875798e-05, + "loss": 0.051619219779968264, + "step": 70910 + }, + { + "epoch": 10.066713981547197, + "grad_norm": 0.08700914680957794, + "learning_rate": 8.99376863023421e-05, + "loss": 0.049428775906562805, + "step": 70920 + }, + { + "epoch": 10.068133427963094, + "grad_norm": 0.23135952651500702, + "learning_rate": 8.993626685592619e-05, + "loss": 0.023401886224746704, + "step": 70930 + }, + { + "epoch": 10.069552874378992, + "grad_norm": 0.07406525313854218, + "learning_rate": 8.993484740951029e-05, + "loss": 0.060725486278533934, + 
"step": 70940 + }, + { + "epoch": 10.07097232079489, + "grad_norm": 2.472419261932373, + "learning_rate": 8.99334279630944e-05, + "loss": 0.01986803412437439, + "step": 70950 + }, + { + "epoch": 10.072391767210787, + "grad_norm": 0.574719250202179, + "learning_rate": 8.99320085166785e-05, + "loss": 0.043755120038986205, + "step": 70960 + }, + { + "epoch": 10.073811213626685, + "grad_norm": 0.35062700510025024, + "learning_rate": 8.993058907026261e-05, + "loss": 0.016467268764972686, + "step": 70970 + }, + { + "epoch": 10.075230660042584, + "grad_norm": 3.5769553184509277, + "learning_rate": 8.99291696238467e-05, + "loss": 0.02966910004615784, + "step": 70980 + }, + { + "epoch": 10.076650106458482, + "grad_norm": 0.09693639725446701, + "learning_rate": 8.99277501774308e-05, + "loss": 0.021013087034225462, + "step": 70990 + }, + { + "epoch": 10.078069552874378, + "grad_norm": 0.028018489480018616, + "learning_rate": 8.99263307310149e-05, + "loss": 0.04916905760765076, + "step": 71000 + }, + { + "epoch": 10.078069552874378, + "eval_accuracy": 0.9842309404209322, + "eval_loss": 0.05193416774272919, + "eval_runtime": 32.7864, + "eval_samples_per_second": 479.681, + "eval_steps_per_second": 15.006, + "step": 71000 + }, + { + "epoch": 10.079488999290277, + "grad_norm": 11.105819702148438, + "learning_rate": 8.992491128459901e-05, + "loss": 0.0558125376701355, + "step": 71010 + }, + { + "epoch": 10.080908445706175, + "grad_norm": 5.16521692276001, + "learning_rate": 8.992349183818312e-05, + "loss": 0.027687478065490722, + "step": 71020 + }, + { + "epoch": 10.082327892122072, + "grad_norm": 1.2156544923782349, + "learning_rate": 8.992207239176722e-05, + "loss": 0.05835610628128052, + "step": 71030 + }, + { + "epoch": 10.08374733853797, + "grad_norm": 11.860342979431152, + "learning_rate": 8.992065294535132e-05, + "loss": 0.06005517244338989, + "step": 71040 + }, + { + "epoch": 10.085166784953868, + "grad_norm": 1.9857722520828247, + "learning_rate": 8.991923349893541e-05, + 
"loss": 0.03460575938224793, + "step": 71050 + }, + { + "epoch": 10.086586231369767, + "grad_norm": 12.068072319030762, + "learning_rate": 8.991781405251953e-05, + "loss": 0.0850683569908142, + "step": 71060 + }, + { + "epoch": 10.088005677785663, + "grad_norm": 0.45789551734924316, + "learning_rate": 8.991639460610362e-05, + "loss": 0.07147586941719056, + "step": 71070 + }, + { + "epoch": 10.089425124201561, + "grad_norm": 0.7118040323257446, + "learning_rate": 8.991497515968773e-05, + "loss": 0.07116876244544983, + "step": 71080 + }, + { + "epoch": 10.09084457061746, + "grad_norm": 0.2730395495891571, + "learning_rate": 8.991355571327182e-05, + "loss": 0.011208267509937286, + "step": 71090 + }, + { + "epoch": 10.092264017033356, + "grad_norm": 0.616715669631958, + "learning_rate": 8.991213626685593e-05, + "loss": 0.01879979819059372, + "step": 71100 + }, + { + "epoch": 10.093683463449254, + "grad_norm": 0.4260369539260864, + "learning_rate": 8.991071682044004e-05, + "loss": 0.02226797640323639, + "step": 71110 + }, + { + "epoch": 10.095102909865153, + "grad_norm": 1.1055042743682861, + "learning_rate": 8.990929737402414e-05, + "loss": 0.04241999387741089, + "step": 71120 + }, + { + "epoch": 10.096522356281051, + "grad_norm": 0.2439938485622406, + "learning_rate": 8.990787792760825e-05, + "loss": 0.014829432964324952, + "step": 71130 + }, + { + "epoch": 10.097941802696948, + "grad_norm": 2.2205474376678467, + "learning_rate": 8.990645848119233e-05, + "loss": 0.043992698192596436, + "step": 71140 + }, + { + "epoch": 10.099361249112846, + "grad_norm": 3.324392318725586, + "learning_rate": 8.990503903477644e-05, + "loss": 0.007843706011772155, + "step": 71150 + }, + { + "epoch": 10.100780695528744, + "grad_norm": 5.776381015777588, + "learning_rate": 8.990361958836054e-05, + "loss": 0.06609423756599427, + "step": 71160 + }, + { + "epoch": 10.10220014194464, + "grad_norm": 3.8756496906280518, + "learning_rate": 8.990220014194465e-05, + "loss": 0.025007185339927674, + 
"step": 71170 + }, + { + "epoch": 10.103619588360539, + "grad_norm": 9.75920295715332, + "learning_rate": 8.990078069552875e-05, + "loss": 0.04297915697097778, + "step": 71180 + }, + { + "epoch": 10.105039034776437, + "grad_norm": 6.752058029174805, + "learning_rate": 8.989936124911285e-05, + "loss": 0.07576794624328613, + "step": 71190 + }, + { + "epoch": 10.106458481192336, + "grad_norm": 0.22006545960903168, + "learning_rate": 8.989794180269696e-05, + "loss": 0.08119171261787414, + "step": 71200 + }, + { + "epoch": 10.107877927608232, + "grad_norm": 12.223731994628906, + "learning_rate": 8.989652235628105e-05, + "loss": 0.025416919589042665, + "step": 71210 + }, + { + "epoch": 10.10929737402413, + "grad_norm": 1.204849362373352, + "learning_rate": 8.989510290986516e-05, + "loss": 0.0057471167296171185, + "step": 71220 + }, + { + "epoch": 10.110716820440029, + "grad_norm": 5.5199408531188965, + "learning_rate": 8.989368346344926e-05, + "loss": 0.02445787936449051, + "step": 71230 + }, + { + "epoch": 10.112136266855925, + "grad_norm": 0.5732960104942322, + "learning_rate": 8.989226401703336e-05, + "loss": 0.06102269887924194, + "step": 71240 + }, + { + "epoch": 10.113555713271824, + "grad_norm": 0.24469658732414246, + "learning_rate": 8.989084457061746e-05, + "loss": 0.07163866758346557, + "step": 71250 + }, + { + "epoch": 10.114975159687722, + "grad_norm": 10.344715118408203, + "learning_rate": 8.988942512420157e-05, + "loss": 0.020954841375350954, + "step": 71260 + }, + { + "epoch": 10.11639460610362, + "grad_norm": 0.6934405565261841, + "learning_rate": 8.988800567778567e-05, + "loss": 0.052423101663589475, + "step": 71270 + }, + { + "epoch": 10.117814052519517, + "grad_norm": 3.2811639308929443, + "learning_rate": 8.988658623136978e-05, + "loss": 0.05588293671607971, + "step": 71280 + }, + { + "epoch": 10.119233498935415, + "grad_norm": 0.6066336631774902, + "learning_rate": 8.988516678495387e-05, + "loss": 0.03658683001995087, + "step": 71290 + }, + { + 
"epoch": 10.120652945351313, + "grad_norm": 13.91519546508789, + "learning_rate": 8.988374733853797e-05, + "loss": 0.09927948117256165, + "step": 71300 + }, + { + "epoch": 10.12207239176721, + "grad_norm": 5.7359514236450195, + "learning_rate": 8.988232789212208e-05, + "loss": 0.05882952809333801, + "step": 71310 + }, + { + "epoch": 10.123491838183108, + "grad_norm": 9.058098793029785, + "learning_rate": 8.988090844570618e-05, + "loss": 0.0466866672039032, + "step": 71320 + }, + { + "epoch": 10.124911284599007, + "grad_norm": 0.7956753969192505, + "learning_rate": 8.987948899929029e-05, + "loss": 0.04202900826931, + "step": 71330 + }, + { + "epoch": 10.126330731014905, + "grad_norm": 1.190938949584961, + "learning_rate": 8.987806955287439e-05, + "loss": 0.05764939785003662, + "step": 71340 + }, + { + "epoch": 10.127750177430801, + "grad_norm": 0.05666103586554527, + "learning_rate": 8.987665010645849e-05, + "loss": 0.024899232387542724, + "step": 71350 + }, + { + "epoch": 10.1291696238467, + "grad_norm": 4.87168025970459, + "learning_rate": 8.987523066004258e-05, + "loss": 0.06777219772338867, + "step": 71360 + }, + { + "epoch": 10.130589070262598, + "grad_norm": 1.0274242162704468, + "learning_rate": 8.98738112136267e-05, + "loss": 0.01196812316775322, + "step": 71370 + }, + { + "epoch": 10.132008516678495, + "grad_norm": 0.10885701328516006, + "learning_rate": 8.987239176721079e-05, + "loss": 0.0149383544921875, + "step": 71380 + }, + { + "epoch": 10.133427963094393, + "grad_norm": 4.033825397491455, + "learning_rate": 8.98709723207949e-05, + "loss": 0.013677287101745605, + "step": 71390 + }, + { + "epoch": 10.134847409510291, + "grad_norm": 0.4323374032974243, + "learning_rate": 8.9869552874379e-05, + "loss": 0.03762426972389221, + "step": 71400 + }, + { + "epoch": 10.13626685592619, + "grad_norm": 0.06930834800004959, + "learning_rate": 8.98681334279631e-05, + "loss": 0.009782253205776215, + "step": 71410 + }, + { + "epoch": 10.137686302342086, + "grad_norm": 
0.7295514345169067, + "learning_rate": 8.986671398154721e-05, + "loss": 0.016273342072963715, + "step": 71420 + }, + { + "epoch": 10.139105748757984, + "grad_norm": 7.975281238555908, + "learning_rate": 8.98652945351313e-05, + "loss": 0.05889239311218262, + "step": 71430 + }, + { + "epoch": 10.140525195173883, + "grad_norm": 0.8045636415481567, + "learning_rate": 8.986387508871542e-05, + "loss": 0.04868484139442444, + "step": 71440 + }, + { + "epoch": 10.14194464158978, + "grad_norm": 8.876813888549805, + "learning_rate": 8.98624556422995e-05, + "loss": 0.03207354545593262, + "step": 71450 + }, + { + "epoch": 10.143364088005677, + "grad_norm": 8.953425407409668, + "learning_rate": 8.986103619588361e-05, + "loss": 0.05620843172073364, + "step": 71460 + }, + { + "epoch": 10.144783534421576, + "grad_norm": 8.15450382232666, + "learning_rate": 8.985961674946771e-05, + "loss": 0.0524405837059021, + "step": 71470 + }, + { + "epoch": 10.146202980837474, + "grad_norm": 8.996185302734375, + "learning_rate": 8.985819730305182e-05, + "loss": 0.011744706332683564, + "step": 71480 + }, + { + "epoch": 10.14762242725337, + "grad_norm": 0.6976650357246399, + "learning_rate": 8.985677785663592e-05, + "loss": 0.006831346452236176, + "step": 71490 + }, + { + "epoch": 10.149041873669269, + "grad_norm": 1.2598222494125366, + "learning_rate": 8.985535841022001e-05, + "loss": 0.018995559215545653, + "step": 71500 + }, + { + "epoch": 10.149041873669269, + "eval_accuracy": 0.9872194315508361, + "eval_loss": 0.042244430631399155, + "eval_runtime": 32.7379, + "eval_samples_per_second": 480.392, + "eval_steps_per_second": 15.028, + "step": 71500 + }, + { + "epoch": 10.150461320085167, + "grad_norm": 0.495564728975296, + "learning_rate": 8.985393896380412e-05, + "loss": 0.0047418631613254545, + "step": 71510 + }, + { + "epoch": 10.151880766501064, + "grad_norm": 3.843820095062256, + "learning_rate": 8.985251951738822e-05, + "loss": 0.032132938504219055, + "step": 71520 + }, + { + "epoch": 
10.153300212916962, + "grad_norm": 1.3583863973617554, + "learning_rate": 8.985110007097233e-05, + "loss": 0.013863271474838257, + "step": 71530 + }, + { + "epoch": 10.15471965933286, + "grad_norm": 9.500256538391113, + "learning_rate": 8.984968062455643e-05, + "loss": 0.024224182963371275, + "step": 71540 + }, + { + "epoch": 10.156139105748759, + "grad_norm": 5.837913990020752, + "learning_rate": 8.984826117814053e-05, + "loss": 0.015855035185813902, + "step": 71550 + }, + { + "epoch": 10.157558552164655, + "grad_norm": 2.1019768714904785, + "learning_rate": 8.984684173172462e-05, + "loss": 0.01872989535331726, + "step": 71560 + }, + { + "epoch": 10.158977998580554, + "grad_norm": 4.689863681793213, + "learning_rate": 8.984542228530874e-05, + "loss": 0.011310167610645294, + "step": 71570 + }, + { + "epoch": 10.160397444996452, + "grad_norm": 3.9470810890197754, + "learning_rate": 8.984400283889283e-05, + "loss": 0.019275748729705812, + "step": 71580 + }, + { + "epoch": 10.161816891412348, + "grad_norm": 6.351150989532471, + "learning_rate": 8.984258339247694e-05, + "loss": 0.016378867626190185, + "step": 71590 + }, + { + "epoch": 10.163236337828247, + "grad_norm": 6.803494930267334, + "learning_rate": 8.984116394606104e-05, + "loss": 0.03146334290504456, + "step": 71600 + }, + { + "epoch": 10.164655784244145, + "grad_norm": 10.133057594299316, + "learning_rate": 8.983974449964514e-05, + "loss": 0.06289324760437012, + "step": 71610 + }, + { + "epoch": 10.166075230660043, + "grad_norm": 0.8119639158248901, + "learning_rate": 8.983832505322925e-05, + "loss": 0.03100758194923401, + "step": 71620 + }, + { + "epoch": 10.16749467707594, + "grad_norm": 0.7174460291862488, + "learning_rate": 8.983690560681335e-05, + "loss": 0.013892364501953126, + "step": 71630 + }, + { + "epoch": 10.168914123491838, + "grad_norm": 4.801692962646484, + "learning_rate": 8.983548616039746e-05, + "loss": 0.012847928702831269, + "step": 71640 + }, + { + "epoch": 10.170333569907736, + 
"grad_norm": 0.08614791929721832, + "learning_rate": 8.983406671398154e-05, + "loss": 0.03195511400699615, + "step": 71650 + }, + { + "epoch": 10.171753016323633, + "grad_norm": 0.3636281490325928, + "learning_rate": 8.983264726756565e-05, + "loss": 0.014664243161678314, + "step": 71660 + }, + { + "epoch": 10.173172462739531, + "grad_norm": 0.8303320407867432, + "learning_rate": 8.983122782114975e-05, + "loss": 0.023724818229675294, + "step": 71670 + }, + { + "epoch": 10.17459190915543, + "grad_norm": 0.06572652608156204, + "learning_rate": 8.982980837473386e-05, + "loss": 0.027840864658355714, + "step": 71680 + }, + { + "epoch": 10.176011355571328, + "grad_norm": 0.05816236510872841, + "learning_rate": 8.982838892831796e-05, + "loss": 0.009276087582111358, + "step": 71690 + }, + { + "epoch": 10.177430801987224, + "grad_norm": 0.15733098983764648, + "learning_rate": 8.982711142654366e-05, + "loss": 0.04615362286567688, + "step": 71700 + }, + { + "epoch": 10.178850248403123, + "grad_norm": 0.4171859323978424, + "learning_rate": 8.982569198012775e-05, + "loss": 0.06214319467544556, + "step": 71710 + }, + { + "epoch": 10.180269694819021, + "grad_norm": 0.051216304302215576, + "learning_rate": 8.982427253371187e-05, + "loss": 0.04424688518047333, + "step": 71720 + }, + { + "epoch": 10.181689141234918, + "grad_norm": 1.8618922233581543, + "learning_rate": 8.982285308729595e-05, + "loss": 0.015009069442749023, + "step": 71730 + }, + { + "epoch": 10.183108587650816, + "grad_norm": 7.354833602905273, + "learning_rate": 8.982143364088006e-05, + "loss": 0.06012805700302124, + "step": 71740 + }, + { + "epoch": 10.184528034066714, + "grad_norm": 0.7485067248344421, + "learning_rate": 8.982001419446416e-05, + "loss": 0.04865095615386963, + "step": 71750 + }, + { + "epoch": 10.185947480482612, + "grad_norm": 0.6087238788604736, + "learning_rate": 8.981859474804827e-05, + "loss": 0.052143925428390504, + "step": 71760 + }, + { + "epoch": 10.187366926898509, + "grad_norm": 
2.8317716121673584, + "learning_rate": 8.981717530163238e-05, + "loss": 0.0342563271522522, + "step": 71770 + }, + { + "epoch": 10.188786373314407, + "grad_norm": 0.08310432732105255, + "learning_rate": 8.981575585521646e-05, + "loss": 0.04148037135601044, + "step": 71780 + }, + { + "epoch": 10.190205819730306, + "grad_norm": 4.928909778594971, + "learning_rate": 8.981433640880057e-05, + "loss": 0.04418375790119171, + "step": 71790 + }, + { + "epoch": 10.191625266146202, + "grad_norm": 0.2593403160572052, + "learning_rate": 8.981291696238467e-05, + "loss": 0.039079904556274414, + "step": 71800 + }, + { + "epoch": 10.1930447125621, + "grad_norm": 0.35453835129737854, + "learning_rate": 8.981149751596878e-05, + "loss": 0.02273627519607544, + "step": 71810 + }, + { + "epoch": 10.194464158977999, + "grad_norm": 14.605096817016602, + "learning_rate": 8.981007806955288e-05, + "loss": 0.07609878778457642, + "step": 71820 + }, + { + "epoch": 10.195883605393897, + "grad_norm": 0.33558687567710876, + "learning_rate": 8.980865862313698e-05, + "loss": 0.027788797020912172, + "step": 71830 + }, + { + "epoch": 10.197303051809794, + "grad_norm": 0.15005749464035034, + "learning_rate": 8.980723917672107e-05, + "loss": 0.017736873030662535, + "step": 71840 + }, + { + "epoch": 10.198722498225692, + "grad_norm": 11.601777076721191, + "learning_rate": 8.980581973030519e-05, + "loss": 0.051431238651275635, + "step": 71850 + }, + { + "epoch": 10.20014194464159, + "grad_norm": 0.3003895878791809, + "learning_rate": 8.98044002838893e-05, + "loss": 0.009282226115465165, + "step": 71860 + }, + { + "epoch": 10.201561391057487, + "grad_norm": 3.5037968158721924, + "learning_rate": 8.98029808374734e-05, + "loss": 0.029530704021453857, + "step": 71870 + }, + { + "epoch": 10.202980837473385, + "grad_norm": 11.290355682373047, + "learning_rate": 8.980156139105749e-05, + "loss": 0.02697465717792511, + "step": 71880 + }, + { + "epoch": 10.204400283889283, + "grad_norm": 6.993745803833008, + 
"learning_rate": 8.980014194464159e-05, + "loss": 0.021236954629421233, + "step": 71890 + }, + { + "epoch": 10.205819730305182, + "grad_norm": 2.8136534690856934, + "learning_rate": 8.97987224982257e-05, + "loss": 0.02050168514251709, + "step": 71900 + }, + { + "epoch": 10.207239176721078, + "grad_norm": 1.45379638671875, + "learning_rate": 8.97973030518098e-05, + "loss": 0.02213282585144043, + "step": 71910 + }, + { + "epoch": 10.208658623136976, + "grad_norm": 0.5076923370361328, + "learning_rate": 8.979588360539391e-05, + "loss": 0.030324968695640563, + "step": 71920 + }, + { + "epoch": 10.210078069552875, + "grad_norm": 7.311733245849609, + "learning_rate": 8.979446415897799e-05, + "loss": 0.03139882981777191, + "step": 71930 + }, + { + "epoch": 10.211497515968771, + "grad_norm": 0.35405999422073364, + "learning_rate": 8.97930447125621e-05, + "loss": 0.014943568408489228, + "step": 71940 + }, + { + "epoch": 10.21291696238467, + "grad_norm": 8.592523574829102, + "learning_rate": 8.979162526614621e-05, + "loss": 0.04479727745056152, + "step": 71950 + }, + { + "epoch": 10.214336408800568, + "grad_norm": 0.3105320334434509, + "learning_rate": 8.979020581973031e-05, + "loss": 0.041830265522003175, + "step": 71960 + }, + { + "epoch": 10.215755855216466, + "grad_norm": 0.29249778389930725, + "learning_rate": 8.978878637331442e-05, + "loss": 0.0695478618144989, + "step": 71970 + }, + { + "epoch": 10.217175301632363, + "grad_norm": 5.5391764640808105, + "learning_rate": 8.97873669268985e-05, + "loss": 0.06145778298377991, + "step": 71980 + }, + { + "epoch": 10.218594748048261, + "grad_norm": 1.5610414743423462, + "learning_rate": 8.978594748048262e-05, + "loss": 0.02754751741886139, + "step": 71990 + }, + { + "epoch": 10.22001419446416, + "grad_norm": 0.09087704867124557, + "learning_rate": 8.978452803406671e-05, + "loss": 0.034285178780555724, + "step": 72000 + }, + { + "epoch": 10.22001419446416, + "eval_accuracy": 0.9847396197621924, + "eval_loss": 
0.05170145630836487, + "eval_runtime": 32.8652, + "eval_samples_per_second": 478.531, + "eval_steps_per_second": 14.97, + "step": 72000 + }, + { + "epoch": 10.221433640880056, + "grad_norm": 0.021180758252739906, + "learning_rate": 8.978310858765082e-05, + "loss": 0.057250940799713136, + "step": 72010 + }, + { + "epoch": 10.222853087295954, + "grad_norm": 6.0435075759887695, + "learning_rate": 8.978168914123492e-05, + "loss": 0.04329615831375122, + "step": 72020 + }, + { + "epoch": 10.224272533711853, + "grad_norm": 0.007519877981394529, + "learning_rate": 8.978026969481903e-05, + "loss": 0.023879942297935487, + "step": 72030 + }, + { + "epoch": 10.22569198012775, + "grad_norm": 4.61924934387207, + "learning_rate": 8.977885024840312e-05, + "loss": 0.05088481307029724, + "step": 72040 + }, + { + "epoch": 10.227111426543647, + "grad_norm": 4.466940402984619, + "learning_rate": 8.977743080198723e-05, + "loss": 0.015369561314582825, + "step": 72050 + }, + { + "epoch": 10.228530872959546, + "grad_norm": 0.04262328892946243, + "learning_rate": 8.977601135557134e-05, + "loss": 0.031910020112991336, + "step": 72060 + }, + { + "epoch": 10.229950319375444, + "grad_norm": 1.2850419282913208, + "learning_rate": 8.977459190915544e-05, + "loss": 0.04279256463050842, + "step": 72070 + }, + { + "epoch": 10.231369765791342, + "grad_norm": 1.3325414657592773, + "learning_rate": 8.977317246273955e-05, + "loss": 0.020931917428970336, + "step": 72080 + }, + { + "epoch": 10.232789212207239, + "grad_norm": 1.0753977298736572, + "learning_rate": 8.977175301632363e-05, + "loss": 0.008612241595983505, + "step": 72090 + }, + { + "epoch": 10.234208658623137, + "grad_norm": 0.07489815354347229, + "learning_rate": 8.977033356990774e-05, + "loss": 0.034203407168388364, + "step": 72100 + }, + { + "epoch": 10.235628105039035, + "grad_norm": 3.5092709064483643, + "learning_rate": 8.976891412349184e-05, + "loss": 0.02857227623462677, + "step": 72110 + }, + { + "epoch": 10.237047551454932, + 
"grad_norm": 4.87595796585083, + "learning_rate": 8.976749467707595e-05, + "loss": 0.03527629375457764, + "step": 72120 + }, + { + "epoch": 10.23846699787083, + "grad_norm": 2.1418490409851074, + "learning_rate": 8.976607523066005e-05, + "loss": 0.05000826716423035, + "step": 72130 + }, + { + "epoch": 10.239886444286729, + "grad_norm": 0.8071398138999939, + "learning_rate": 8.976465578424414e-05, + "loss": 0.009712295234203338, + "step": 72140 + }, + { + "epoch": 10.241305890702627, + "grad_norm": 0.32705071568489075, + "learning_rate": 8.976323633782826e-05, + "loss": 0.05061535239219665, + "step": 72150 + }, + { + "epoch": 10.242725337118523, + "grad_norm": 0.4660518765449524, + "learning_rate": 8.976181689141235e-05, + "loss": 0.05338441133499146, + "step": 72160 + }, + { + "epoch": 10.244144783534422, + "grad_norm": 0.027236519381403923, + "learning_rate": 8.976039744499646e-05, + "loss": 0.022108878195285796, + "step": 72170 + }, + { + "epoch": 10.24556422995032, + "grad_norm": 0.14898504316806793, + "learning_rate": 8.975897799858056e-05, + "loss": 0.025085175037384035, + "step": 72180 + }, + { + "epoch": 10.246983676366217, + "grad_norm": 2.2338013648986816, + "learning_rate": 8.975755855216466e-05, + "loss": 0.01661747694015503, + "step": 72190 + }, + { + "epoch": 10.248403122782115, + "grad_norm": 9.279471397399902, + "learning_rate": 8.975613910574876e-05, + "loss": 0.06610844731330871, + "step": 72200 + }, + { + "epoch": 10.249822569198013, + "grad_norm": 0.07761373370885849, + "learning_rate": 8.975471965933287e-05, + "loss": 0.02264375686645508, + "step": 72210 + }, + { + "epoch": 10.251242015613911, + "grad_norm": 8.15989875793457, + "learning_rate": 8.975330021291696e-05, + "loss": 0.048688432574272154, + "step": 72220 + }, + { + "epoch": 10.252661462029808, + "grad_norm": 6.95366096496582, + "learning_rate": 8.975188076650108e-05, + "loss": 0.032186472415924074, + "step": 72230 + }, + { + "epoch": 10.254080908445706, + "grad_norm": 
14.51134204864502, + "learning_rate": 8.975046132008517e-05, + "loss": 0.022816789150238038, + "step": 72240 + }, + { + "epoch": 10.255500354861605, + "grad_norm": 0.10701218247413635, + "learning_rate": 8.974904187366927e-05, + "loss": 0.03494252264499664, + "step": 72250 + }, + { + "epoch": 10.256919801277501, + "grad_norm": 3.7930524349212646, + "learning_rate": 8.974762242725338e-05, + "loss": 0.024190935492515563, + "step": 72260 + }, + { + "epoch": 10.2583392476934, + "grad_norm": 1.5398367643356323, + "learning_rate": 8.974620298083748e-05, + "loss": 0.01877433657646179, + "step": 72270 + }, + { + "epoch": 10.259758694109298, + "grad_norm": 7.618338584899902, + "learning_rate": 8.974478353442159e-05, + "loss": 0.0600216805934906, + "step": 72280 + }, + { + "epoch": 10.261178140525196, + "grad_norm": 9.979314804077148, + "learning_rate": 8.974336408800567e-05, + "loss": 0.03386954665184021, + "step": 72290 + }, + { + "epoch": 10.262597586941093, + "grad_norm": 11.369985580444336, + "learning_rate": 8.974194464158978e-05, + "loss": 0.06406527757644653, + "step": 72300 + }, + { + "epoch": 10.264017033356991, + "grad_norm": 3.3756444454193115, + "learning_rate": 8.974052519517388e-05, + "loss": 0.03174246549606323, + "step": 72310 + }, + { + "epoch": 10.26543647977289, + "grad_norm": 0.29984939098358154, + "learning_rate": 8.973910574875799e-05, + "loss": 0.026856064796447754, + "step": 72320 + }, + { + "epoch": 10.266855926188786, + "grad_norm": 3.937253952026367, + "learning_rate": 8.973768630234209e-05, + "loss": 0.02418387681245804, + "step": 72330 + }, + { + "epoch": 10.268275372604684, + "grad_norm": 0.4848698675632477, + "learning_rate": 8.973626685592619e-05, + "loss": 0.04492365121841431, + "step": 72340 + }, + { + "epoch": 10.269694819020582, + "grad_norm": 0.23468013107776642, + "learning_rate": 8.97348474095103e-05, + "loss": 0.06487563252449036, + "step": 72350 + }, + { + "epoch": 10.27111426543648, + "grad_norm": 13.240750312805176, + 
"learning_rate": 8.97334279630944e-05, + "loss": 0.0500341534614563, + "step": 72360 + }, + { + "epoch": 10.272533711852377, + "grad_norm": 5.729434013366699, + "learning_rate": 8.97320085166785e-05, + "loss": 0.05774502754211426, + "step": 72370 + }, + { + "epoch": 10.273953158268275, + "grad_norm": 2.5826942920684814, + "learning_rate": 8.97305890702626e-05, + "loss": 0.027796417474746704, + "step": 72380 + }, + { + "epoch": 10.275372604684174, + "grad_norm": 0.07748646289110184, + "learning_rate": 8.972916962384671e-05, + "loss": 0.033330485224723816, + "step": 72390 + }, + { + "epoch": 10.27679205110007, + "grad_norm": 0.12528225779533386, + "learning_rate": 8.97277501774308e-05, + "loss": 0.020797872543334962, + "step": 72400 + }, + { + "epoch": 10.278211497515969, + "grad_norm": 0.2537434995174408, + "learning_rate": 8.972633073101491e-05, + "loss": 0.04087269008159637, + "step": 72410 + }, + { + "epoch": 10.279630943931867, + "grad_norm": 2.3561251163482666, + "learning_rate": 8.9724911284599e-05, + "loss": 0.04362359642982483, + "step": 72420 + }, + { + "epoch": 10.281050390347765, + "grad_norm": 0.16754914820194244, + "learning_rate": 8.972349183818312e-05, + "loss": 0.006162405386567116, + "step": 72430 + }, + { + "epoch": 10.282469836763662, + "grad_norm": 0.5426493883132935, + "learning_rate": 8.972207239176722e-05, + "loss": 0.04110488295555115, + "step": 72440 + }, + { + "epoch": 10.28388928317956, + "grad_norm": 3.838188409805298, + "learning_rate": 8.972065294535131e-05, + "loss": 0.06003108024597168, + "step": 72450 + }, + { + "epoch": 10.285308729595458, + "grad_norm": 0.3094750940799713, + "learning_rate": 8.971923349893542e-05, + "loss": 0.036252951622009276, + "step": 72460 + }, + { + "epoch": 10.286728176011355, + "grad_norm": 1.1412874460220337, + "learning_rate": 8.971781405251952e-05, + "loss": 0.02480601370334625, + "step": 72470 + }, + { + "epoch": 10.288147622427253, + "grad_norm": 3.882920265197754, + "learning_rate": 
8.971639460610363e-05, + "loss": 0.056429213285446166, + "step": 72480 + }, + { + "epoch": 10.289567068843152, + "grad_norm": 0.4824908673763275, + "learning_rate": 8.971497515968773e-05, + "loss": 0.04092736542224884, + "step": 72490 + }, + { + "epoch": 10.29098651525905, + "grad_norm": 0.3222559988498688, + "learning_rate": 8.971355571327183e-05, + "loss": 0.09585509300231934, + "step": 72500 + }, + { + "epoch": 10.29098651525905, + "eval_accuracy": 0.9814332040440008, + "eval_loss": 0.06251820921897888, + "eval_runtime": 33.2932, + "eval_samples_per_second": 472.378, + "eval_steps_per_second": 14.778, + "step": 72500 + }, + { + "epoch": 10.292405961674946, + "grad_norm": 11.763492584228516, + "learning_rate": 8.971213626685592e-05, + "loss": 0.01904451847076416, + "step": 72510 + }, + { + "epoch": 10.293825408090845, + "grad_norm": 0.5626010298728943, + "learning_rate": 8.971071682044003e-05, + "loss": 0.01258438527584076, + "step": 72520 + }, + { + "epoch": 10.295244854506743, + "grad_norm": 5.586578845977783, + "learning_rate": 8.970929737402413e-05, + "loss": 0.028937163949012756, + "step": 72530 + }, + { + "epoch": 10.29666430092264, + "grad_norm": 4.926784515380859, + "learning_rate": 8.970787792760824e-05, + "loss": 0.05953345894813537, + "step": 72540 + }, + { + "epoch": 10.298083747338538, + "grad_norm": 0.07617692649364471, + "learning_rate": 8.970645848119234e-05, + "loss": 0.024326160550117493, + "step": 72550 + }, + { + "epoch": 10.299503193754436, + "grad_norm": 0.09429468214511871, + "learning_rate": 8.970503903477644e-05, + "loss": 0.016668303310871123, + "step": 72560 + }, + { + "epoch": 10.300922640170334, + "grad_norm": 0.03485213965177536, + "learning_rate": 8.970361958836055e-05, + "loss": 0.02714379131793976, + "step": 72570 + }, + { + "epoch": 10.302342086586231, + "grad_norm": 0.046538855880498886, + "learning_rate": 8.970220014194465e-05, + "loss": 0.03082246482372284, + "step": 72580 + }, + { + "epoch": 10.30376153300213, + "grad_norm": 
0.529966413974762, + "learning_rate": 8.970078069552876e-05, + "loss": 0.010735002905130386, + "step": 72590 + }, + { + "epoch": 10.305180979418028, + "grad_norm": 1.1207096576690674, + "learning_rate": 8.969936124911284e-05, + "loss": 0.007675926387310028, + "step": 72600 + }, + { + "epoch": 10.306600425833924, + "grad_norm": 0.2528717517852783, + "learning_rate": 8.969794180269695e-05, + "loss": 0.027586179971694946, + "step": 72610 + }, + { + "epoch": 10.308019872249822, + "grad_norm": 2.0045723915100098, + "learning_rate": 8.969652235628105e-05, + "loss": 0.025220289826393127, + "step": 72620 + }, + { + "epoch": 10.30943931866572, + "grad_norm": 1.069236397743225, + "learning_rate": 8.969510290986516e-05, + "loss": 0.02478688657283783, + "step": 72630 + }, + { + "epoch": 10.310858765081619, + "grad_norm": 3.645983934402466, + "learning_rate": 8.969368346344926e-05, + "loss": 0.02384749799966812, + "step": 72640 + }, + { + "epoch": 10.312278211497516, + "grad_norm": 0.055776119232177734, + "learning_rate": 8.969226401703335e-05, + "loss": 0.025250345468521118, + "step": 72650 + }, + { + "epoch": 10.313697657913414, + "grad_norm": 1.7379151582717896, + "learning_rate": 8.969084457061747e-05, + "loss": 0.039242887496948244, + "step": 72660 + }, + { + "epoch": 10.315117104329312, + "grad_norm": 7.7128472328186035, + "learning_rate": 8.968942512420156e-05, + "loss": 0.04973468780517578, + "step": 72670 + }, + { + "epoch": 10.316536550745209, + "grad_norm": 0.4700365662574768, + "learning_rate": 8.968800567778567e-05, + "loss": 0.035573115944862364, + "step": 72680 + }, + { + "epoch": 10.317955997161107, + "grad_norm": 6.4476318359375, + "learning_rate": 8.968658623136977e-05, + "loss": 0.019644205272197724, + "step": 72690 + }, + { + "epoch": 10.319375443577005, + "grad_norm": 5.906562805175781, + "learning_rate": 8.968516678495387e-05, + "loss": 0.025435513257980345, + "step": 72700 + }, + { + "epoch": 10.320794889992904, + "grad_norm": 0.21683073043823242, + 
"learning_rate": 8.968374733853797e-05, + "loss": 0.06322475075721741, + "step": 72710 + }, + { + "epoch": 10.3222143364088, + "grad_norm": 1.5359755754470825, + "learning_rate": 8.968232789212208e-05, + "loss": 0.011539919674396515, + "step": 72720 + }, + { + "epoch": 10.323633782824698, + "grad_norm": 0.12440875917673111, + "learning_rate": 8.968090844570617e-05, + "loss": 0.043030500411987305, + "step": 72730 + }, + { + "epoch": 10.325053229240597, + "grad_norm": 4.936128616333008, + "learning_rate": 8.967948899929029e-05, + "loss": 0.018066060543060303, + "step": 72740 + }, + { + "epoch": 10.326472675656493, + "grad_norm": 0.04510660469532013, + "learning_rate": 8.967806955287438e-05, + "loss": 0.019599223136901857, + "step": 72750 + }, + { + "epoch": 10.327892122072392, + "grad_norm": 0.1886427402496338, + "learning_rate": 8.967665010645848e-05, + "loss": 0.04775834977626801, + "step": 72760 + }, + { + "epoch": 10.32931156848829, + "grad_norm": 0.006249363534152508, + "learning_rate": 8.967523066004259e-05, + "loss": 0.025642585754394532, + "step": 72770 + }, + { + "epoch": 10.330731014904188, + "grad_norm": 0.16167527437210083, + "learning_rate": 8.967381121362669e-05, + "loss": 0.024762631952762605, + "step": 72780 + }, + { + "epoch": 10.332150461320085, + "grad_norm": 1.3200987577438354, + "learning_rate": 8.96723917672108e-05, + "loss": 0.0683577299118042, + "step": 72790 + }, + { + "epoch": 10.333569907735983, + "grad_norm": 0.19871366024017334, + "learning_rate": 8.96709723207949e-05, + "loss": 0.026114186644554137, + "step": 72800 + }, + { + "epoch": 10.334989354151881, + "grad_norm": 3.256074905395508, + "learning_rate": 8.9669552874379e-05, + "loss": 0.020235490798950196, + "step": 72810 + }, + { + "epoch": 10.336408800567778, + "grad_norm": 4.438711643218994, + "learning_rate": 8.966813342796309e-05, + "loss": 0.025841870903968812, + "step": 72820 + }, + { + "epoch": 10.337828246983676, + "grad_norm": 6.780463695526123, + "learning_rate": 
8.96667139815472e-05, + "loss": 0.06629498600959778, + "step": 72830 + }, + { + "epoch": 10.339247693399575, + "grad_norm": 5.759703159332275, + "learning_rate": 8.96652945351313e-05, + "loss": 0.05249155759811401, + "step": 72840 + }, + { + "epoch": 10.340667139815473, + "grad_norm": 6.208014011383057, + "learning_rate": 8.966387508871541e-05, + "loss": 0.029995641112327574, + "step": 72850 + }, + { + "epoch": 10.34208658623137, + "grad_norm": 0.3385080099105835, + "learning_rate": 8.966245564229951e-05, + "loss": 0.027232617139816284, + "step": 72860 + }, + { + "epoch": 10.343506032647268, + "grad_norm": 0.02547670714557171, + "learning_rate": 8.96610361958836e-05, + "loss": 0.006107653677463532, + "step": 72870 + }, + { + "epoch": 10.344925479063166, + "grad_norm": 1.217483401298523, + "learning_rate": 8.965961674946772e-05, + "loss": 0.02872964143753052, + "step": 72880 + }, + { + "epoch": 10.346344925479062, + "grad_norm": 3.2974629402160645, + "learning_rate": 8.965819730305181e-05, + "loss": 0.030625206232070924, + "step": 72890 + }, + { + "epoch": 10.34776437189496, + "grad_norm": 1.686771273612976, + "learning_rate": 8.965677785663592e-05, + "loss": 0.03720858991146088, + "step": 72900 + }, + { + "epoch": 10.349183818310859, + "grad_norm": 0.4160674214363098, + "learning_rate": 8.965535841022001e-05, + "loss": 0.03108443021774292, + "step": 72910 + }, + { + "epoch": 10.350603264726757, + "grad_norm": 0.33811572194099426, + "learning_rate": 8.965393896380412e-05, + "loss": 0.07100291848182679, + "step": 72920 + }, + { + "epoch": 10.352022711142654, + "grad_norm": 3.747161388397217, + "learning_rate": 8.965251951738822e-05, + "loss": 0.024768516421318054, + "step": 72930 + }, + { + "epoch": 10.353442157558552, + "grad_norm": 11.699270248413086, + "learning_rate": 8.965110007097233e-05, + "loss": 0.03598522841930389, + "step": 72940 + }, + { + "epoch": 10.35486160397445, + "grad_norm": 0.11568176746368408, + "learning_rate": 8.964968062455643e-05, + "loss": 
0.030876615643501283, + "step": 72950 + }, + { + "epoch": 10.356281050390347, + "grad_norm": 0.1701243370771408, + "learning_rate": 8.964826117814052e-05, + "loss": 0.03053068518638611, + "step": 72960 + }, + { + "epoch": 10.357700496806245, + "grad_norm": 0.4134175777435303, + "learning_rate": 8.964684173172463e-05, + "loss": 0.022406129539012908, + "step": 72970 + }, + { + "epoch": 10.359119943222144, + "grad_norm": 9.19906997680664, + "learning_rate": 8.964542228530873e-05, + "loss": 0.05493233203887939, + "step": 72980 + }, + { + "epoch": 10.360539389638042, + "grad_norm": 11.903061866760254, + "learning_rate": 8.964400283889284e-05, + "loss": 0.07393259406089783, + "step": 72990 + }, + { + "epoch": 10.361958836053939, + "grad_norm": 3.575417995452881, + "learning_rate": 8.964258339247694e-05, + "loss": 0.06495519280433655, + "step": 73000 + }, + { + "epoch": 10.361958836053939, + "eval_accuracy": 0.9803522604438227, + "eval_loss": 0.06612089276313782, + "eval_runtime": 32.161, + "eval_samples_per_second": 489.009, + "eval_steps_per_second": 15.298, + "step": 73000 + }, + { + "epoch": 10.363378282469837, + "grad_norm": 0.9812063574790955, + "learning_rate": 8.964116394606104e-05, + "loss": 0.039226147532463077, + "step": 73010 + }, + { + "epoch": 10.364797728885735, + "grad_norm": 0.3453628718852997, + "learning_rate": 8.963974449964513e-05, + "loss": 0.03408626914024353, + "step": 73020 + }, + { + "epoch": 10.366217175301632, + "grad_norm": 5.1603899002075195, + "learning_rate": 8.963832505322924e-05, + "loss": 0.07247533798217773, + "step": 73030 + }, + { + "epoch": 10.36763662171753, + "grad_norm": 4.564572811126709, + "learning_rate": 8.963690560681334e-05, + "loss": 0.02123674303293228, + "step": 73040 + }, + { + "epoch": 10.369056068133428, + "grad_norm": 2.140882968902588, + "learning_rate": 8.963548616039745e-05, + "loss": 0.02011290192604065, + "step": 73050 + }, + { + "epoch": 10.370475514549327, + "grad_norm": 0.5259304046630859, + "learning_rate": 
8.963406671398155e-05, + "loss": 0.03429713249206543, + "step": 73060 + }, + { + "epoch": 10.371894960965223, + "grad_norm": 14.053521156311035, + "learning_rate": 8.963264726756565e-05, + "loss": 0.0851962685585022, + "step": 73070 + }, + { + "epoch": 10.373314407381121, + "grad_norm": 0.11945252865552902, + "learning_rate": 8.963122782114976e-05, + "loss": 0.02100173830986023, + "step": 73080 + }, + { + "epoch": 10.37473385379702, + "grad_norm": 1.8677468299865723, + "learning_rate": 8.962980837473386e-05, + "loss": 0.042025390267372134, + "step": 73090 + }, + { + "epoch": 10.376153300212916, + "grad_norm": 7.363965034484863, + "learning_rate": 8.962838892831797e-05, + "loss": 0.024972128868103027, + "step": 73100 + }, + { + "epoch": 10.377572746628815, + "grad_norm": 7.26507568359375, + "learning_rate": 8.962696948190206e-05, + "loss": 0.055237317085266115, + "step": 73110 + }, + { + "epoch": 10.378992193044713, + "grad_norm": 0.9234290719032288, + "learning_rate": 8.962555003548616e-05, + "loss": 0.049257388710975646, + "step": 73120 + }, + { + "epoch": 10.380411639460611, + "grad_norm": 7.842380523681641, + "learning_rate": 8.962413058907026e-05, + "loss": 0.04204971194267273, + "step": 73130 + }, + { + "epoch": 10.381831085876508, + "grad_norm": 0.5620039701461792, + "learning_rate": 8.962271114265437e-05, + "loss": 0.006001041084527969, + "step": 73140 + }, + { + "epoch": 10.383250532292406, + "grad_norm": 3.0934367179870605, + "learning_rate": 8.962129169623847e-05, + "loss": 0.02211282551288605, + "step": 73150 + }, + { + "epoch": 10.384669978708304, + "grad_norm": 0.1924249231815338, + "learning_rate": 8.961987224982258e-05, + "loss": 0.012744960188865662, + "step": 73160 + }, + { + "epoch": 10.3860894251242, + "grad_norm": 3.350724697113037, + "learning_rate": 8.961845280340668e-05, + "loss": 0.022318266332149506, + "step": 73170 + }, + { + "epoch": 10.3875088715401, + "grad_norm": 3.635645866394043, + "learning_rate": 8.961703335699077e-05, + "loss": 
0.029640212655067444, + "step": 73180 + }, + { + "epoch": 10.388928317955997, + "grad_norm": 0.46407830715179443, + "learning_rate": 8.961561391057488e-05, + "loss": 0.024921415746212004, + "step": 73190 + }, + { + "epoch": 10.390347764371896, + "grad_norm": 10.320816993713379, + "learning_rate": 8.961419446415898e-05, + "loss": 0.0269631564617157, + "step": 73200 + }, + { + "epoch": 10.391767210787792, + "grad_norm": 2.758819818496704, + "learning_rate": 8.961277501774309e-05, + "loss": 0.04611527621746063, + "step": 73210 + }, + { + "epoch": 10.39318665720369, + "grad_norm": 4.595924377441406, + "learning_rate": 8.961135557132718e-05, + "loss": 0.05312431454658508, + "step": 73220 + }, + { + "epoch": 10.394606103619589, + "grad_norm": 5.965487957000732, + "learning_rate": 8.960993612491129e-05, + "loss": 0.05250157713890076, + "step": 73230 + }, + { + "epoch": 10.396025550035485, + "grad_norm": 2.658355951309204, + "learning_rate": 8.960851667849538e-05, + "loss": 0.03350549340248108, + "step": 73240 + }, + { + "epoch": 10.397444996451384, + "grad_norm": 0.023311669006943703, + "learning_rate": 8.96070972320795e-05, + "loss": 0.01648566424846649, + "step": 73250 + }, + { + "epoch": 10.398864442867282, + "grad_norm": 10.702779769897461, + "learning_rate": 8.96056777856636e-05, + "loss": 0.04944937825202942, + "step": 73260 + }, + { + "epoch": 10.40028388928318, + "grad_norm": 0.25531113147735596, + "learning_rate": 8.960425833924769e-05, + "loss": 0.03268125057220459, + "step": 73270 + }, + { + "epoch": 10.401703335699077, + "grad_norm": 0.44327834248542786, + "learning_rate": 8.96028388928318e-05, + "loss": 0.025926288962364197, + "step": 73280 + }, + { + "epoch": 10.403122782114975, + "grad_norm": 0.04333629086613655, + "learning_rate": 8.96014194464159e-05, + "loss": 0.02328411638736725, + "step": 73290 + }, + { + "epoch": 10.404542228530874, + "grad_norm": 0.1707431674003601, + "learning_rate": 8.960000000000001e-05, + "loss": 0.008144380897283554, + "step": 
73300 + }, + { + "epoch": 10.40596167494677, + "grad_norm": 2.7800562381744385, + "learning_rate": 8.959858055358411e-05, + "loss": 0.030040925741195677, + "step": 73310 + }, + { + "epoch": 10.407381121362668, + "grad_norm": 6.127976417541504, + "learning_rate": 8.95971611071682e-05, + "loss": 0.034268587827682495, + "step": 73320 + }, + { + "epoch": 10.408800567778567, + "grad_norm": 0.9095988869667053, + "learning_rate": 8.95957416607523e-05, + "loss": 0.036425772309303286, + "step": 73330 + }, + { + "epoch": 10.410220014194465, + "grad_norm": 9.289791107177734, + "learning_rate": 8.959432221433641e-05, + "loss": 0.07397414445877075, + "step": 73340 + }, + { + "epoch": 10.411639460610361, + "grad_norm": 0.14718365669250488, + "learning_rate": 8.959290276792052e-05, + "loss": 0.06551390886306763, + "step": 73350 + }, + { + "epoch": 10.41305890702626, + "grad_norm": 0.5513947010040283, + "learning_rate": 8.959148332150462e-05, + "loss": 0.018647877871990202, + "step": 73360 + }, + { + "epoch": 10.414478353442158, + "grad_norm": 10.3717622756958, + "learning_rate": 8.959006387508872e-05, + "loss": 0.0698345422744751, + "step": 73370 + }, + { + "epoch": 10.415897799858055, + "grad_norm": 1.1711900234222412, + "learning_rate": 8.958864442867282e-05, + "loss": 0.01738281548023224, + "step": 73380 + }, + { + "epoch": 10.417317246273953, + "grad_norm": 0.29801109433174133, + "learning_rate": 8.958722498225693e-05, + "loss": 0.02697826027870178, + "step": 73390 + }, + { + "epoch": 10.418736692689851, + "grad_norm": 1.2740803956985474, + "learning_rate": 8.958580553584102e-05, + "loss": 0.015290048718452454, + "step": 73400 + }, + { + "epoch": 10.42015613910575, + "grad_norm": 0.2341892421245575, + "learning_rate": 8.958438608942513e-05, + "loss": 0.07527807354927063, + "step": 73410 + }, + { + "epoch": 10.421575585521646, + "grad_norm": 0.5258547067642212, + "learning_rate": 8.958296664300922e-05, + "loss": 0.048480254411697385, + "step": 73420 + }, + { + "epoch": 
10.422995031937544, + "grad_norm": 1.7956501245498657, + "learning_rate": 8.958154719659333e-05, + "loss": 0.02461201250553131, + "step": 73430 + }, + { + "epoch": 10.424414478353443, + "grad_norm": 4.295741081237793, + "learning_rate": 8.958012775017744e-05, + "loss": 0.02888575792312622, + "step": 73440 + }, + { + "epoch": 10.42583392476934, + "grad_norm": 0.2158520519733429, + "learning_rate": 8.957870830376154e-05, + "loss": 0.009636881202459336, + "step": 73450 + }, + { + "epoch": 10.427253371185238, + "grad_norm": 0.5940867066383362, + "learning_rate": 8.957728885734565e-05, + "loss": 0.02917068898677826, + "step": 73460 + }, + { + "epoch": 10.428672817601136, + "grad_norm": 3.378797769546509, + "learning_rate": 8.957586941092975e-05, + "loss": 0.01072699874639511, + "step": 73470 + }, + { + "epoch": 10.430092264017034, + "grad_norm": 9.421317100524902, + "learning_rate": 8.957444996451384e-05, + "loss": 0.01690336912870407, + "step": 73480 + }, + { + "epoch": 10.43151171043293, + "grad_norm": 3.34399676322937, + "learning_rate": 8.957303051809794e-05, + "loss": 0.011198329925537109, + "step": 73490 + }, + { + "epoch": 10.432931156848829, + "grad_norm": 2.572981119155884, + "learning_rate": 8.957161107168205e-05, + "loss": 0.00565105676651001, + "step": 73500 + }, + { + "epoch": 10.432931156848829, + "eval_accuracy": 0.9844216951739048, + "eval_loss": 0.05566272512078285, + "eval_runtime": 33.2646, + "eval_samples_per_second": 472.785, + "eval_steps_per_second": 14.791, + "step": 73500 + }, + { + "epoch": 10.434350603264727, + "grad_norm": 0.12054192274808884, + "learning_rate": 8.957019162526615e-05, + "loss": 0.05910096168518066, + "step": 73510 + }, + { + "epoch": 10.435770049680624, + "grad_norm": 0.16370511054992676, + "learning_rate": 8.956877217885026e-05, + "loss": 0.032081523537635805, + "step": 73520 + }, + { + "epoch": 10.437189496096522, + "grad_norm": 0.29961004853248596, + "learning_rate": 8.956735273243436e-05, + "loss": 0.007783429324626922, + 
"step": 73530 + }, + { + "epoch": 10.43860894251242, + "grad_norm": 2.4290831089019775, + "learning_rate": 8.956593328601845e-05, + "loss": 0.023958057165145874, + "step": 73540 + }, + { + "epoch": 10.440028388928319, + "grad_norm": 6.39838171005249, + "learning_rate": 8.956451383960257e-05, + "loss": 0.045057627558708194, + "step": 73550 + }, + { + "epoch": 10.441447835344215, + "grad_norm": 1.2620060443878174, + "learning_rate": 8.956309439318666e-05, + "loss": 0.010841131955385209, + "step": 73560 + }, + { + "epoch": 10.442867281760114, + "grad_norm": 0.03583799675107002, + "learning_rate": 8.956167494677077e-05, + "loss": 0.026120901107788086, + "step": 73570 + }, + { + "epoch": 10.444286728176012, + "grad_norm": 9.077432632446289, + "learning_rate": 8.956025550035486e-05, + "loss": 0.05460309386253357, + "step": 73580 + }, + { + "epoch": 10.445706174591908, + "grad_norm": 1.1113617420196533, + "learning_rate": 8.955883605393897e-05, + "loss": 0.04970341622829437, + "step": 73590 + }, + { + "epoch": 10.447125621007807, + "grad_norm": 2.5755014419555664, + "learning_rate": 8.955741660752307e-05, + "loss": 0.050418293476104735, + "step": 73600 + }, + { + "epoch": 10.448545067423705, + "grad_norm": 1.6536363363265991, + "learning_rate": 8.955599716110718e-05, + "loss": 0.03822358548641205, + "step": 73610 + }, + { + "epoch": 10.449964513839603, + "grad_norm": 15.947848320007324, + "learning_rate": 8.955457771469127e-05, + "loss": 0.051361745595932005, + "step": 73620 + }, + { + "epoch": 10.4513839602555, + "grad_norm": 5.572516441345215, + "learning_rate": 8.955315826827537e-05, + "loss": 0.024033012986183166, + "step": 73630 + }, + { + "epoch": 10.452803406671398, + "grad_norm": 4.491860866546631, + "learning_rate": 8.955173882185948e-05, + "loss": 0.05285232663154602, + "step": 73640 + }, + { + "epoch": 10.454222853087296, + "grad_norm": 2.209595203399658, + "learning_rate": 8.955031937544358e-05, + "loss": 0.010080764442682267, + "step": 73650 + }, + { + 
"epoch": 10.455642299503193, + "grad_norm": 7.413250923156738, + "learning_rate": 8.954889992902769e-05, + "loss": 0.05623313784599304, + "step": 73660 + }, + { + "epoch": 10.457061745919091, + "grad_norm": 5.551499366760254, + "learning_rate": 8.954748048261179e-05, + "loss": 0.022087934613227844, + "step": 73670 + }, + { + "epoch": 10.45848119233499, + "grad_norm": 0.6603949666023254, + "learning_rate": 8.954606103619589e-05, + "loss": 0.06099636554718017, + "step": 73680 + }, + { + "epoch": 10.459900638750888, + "grad_norm": 1.2046256065368652, + "learning_rate": 8.954464158977998e-05, + "loss": 0.04031319320201874, + "step": 73690 + }, + { + "epoch": 10.461320085166784, + "grad_norm": 2.295785427093506, + "learning_rate": 8.95432221433641e-05, + "loss": 0.0229206383228302, + "step": 73700 + }, + { + "epoch": 10.462739531582683, + "grad_norm": 1.1245261430740356, + "learning_rate": 8.954180269694819e-05, + "loss": 0.03432927429676056, + "step": 73710 + }, + { + "epoch": 10.464158977998581, + "grad_norm": 0.25341543555259705, + "learning_rate": 8.95403832505323e-05, + "loss": 0.011335819959640503, + "step": 73720 + }, + { + "epoch": 10.465578424414478, + "grad_norm": 1.2831389904022217, + "learning_rate": 8.95389638041164e-05, + "loss": 0.02653484046459198, + "step": 73730 + }, + { + "epoch": 10.466997870830376, + "grad_norm": 3.338085412979126, + "learning_rate": 8.95375443577005e-05, + "loss": 0.0467991441488266, + "step": 73740 + }, + { + "epoch": 10.468417317246274, + "grad_norm": 10.528369903564453, + "learning_rate": 8.953612491128461e-05, + "loss": 0.10263023376464844, + "step": 73750 + }, + { + "epoch": 10.469836763662173, + "grad_norm": 0.6761602759361267, + "learning_rate": 8.95347054648687e-05, + "loss": 0.010650408267974854, + "step": 73760 + }, + { + "epoch": 10.471256210078069, + "grad_norm": 0.20647557079792023, + "learning_rate": 8.953328601845282e-05, + "loss": 0.019610205292701723, + "step": 73770 + }, + { + "epoch": 10.472675656493967, + 
"grad_norm": 1.0002400875091553, + "learning_rate": 8.953186657203691e-05, + "loss": 0.018295831978321075, + "step": 73780 + }, + { + "epoch": 10.474095102909866, + "grad_norm": 0.8303980231285095, + "learning_rate": 8.953044712562101e-05, + "loss": 0.0178376168012619, + "step": 73790 + }, + { + "epoch": 10.475514549325762, + "grad_norm": 6.3407487869262695, + "learning_rate": 8.952902767920511e-05, + "loss": 0.06772493124008179, + "step": 73800 + }, + { + "epoch": 10.47693399574166, + "grad_norm": 4.017003536224365, + "learning_rate": 8.952760823278922e-05, + "loss": 0.04538638293743134, + "step": 73810 + }, + { + "epoch": 10.478353442157559, + "grad_norm": 8.43114185333252, + "learning_rate": 8.952618878637332e-05, + "loss": 0.03319251537322998, + "step": 73820 + }, + { + "epoch": 10.479772888573457, + "grad_norm": 2.7893521785736084, + "learning_rate": 8.952476933995743e-05, + "loss": 0.028853052854537965, + "step": 73830 + }, + { + "epoch": 10.481192334989354, + "grad_norm": 9.185164451599121, + "learning_rate": 8.952334989354153e-05, + "loss": 0.06574128866195679, + "step": 73840 + }, + { + "epoch": 10.482611781405252, + "grad_norm": 0.7617444396018982, + "learning_rate": 8.952193044712562e-05, + "loss": 0.04429103434085846, + "step": 73850 + }, + { + "epoch": 10.48403122782115, + "grad_norm": 2.2630093097686768, + "learning_rate": 8.952051100070973e-05, + "loss": 0.02493145763874054, + "step": 73860 + }, + { + "epoch": 10.485450674237047, + "grad_norm": 4.28364896774292, + "learning_rate": 8.951909155429383e-05, + "loss": 0.09974995851516724, + "step": 73870 + }, + { + "epoch": 10.486870120652945, + "grad_norm": 2.674414873123169, + "learning_rate": 8.951767210787794e-05, + "loss": 0.0344027042388916, + "step": 73880 + }, + { + "epoch": 10.488289567068843, + "grad_norm": 6.765301704406738, + "learning_rate": 8.951625266146203e-05, + "loss": 0.025649476051330566, + "step": 73890 + }, + { + "epoch": 10.489709013484742, + "grad_norm": 0.6529340744018555, + 
"learning_rate": 8.951483321504614e-05, + "loss": 0.00923079326748848, + "step": 73900 + }, + { + "epoch": 10.491128459900638, + "grad_norm": 0.058388613164424896, + "learning_rate": 8.951341376863023e-05, + "loss": 0.021965248882770537, + "step": 73910 + }, + { + "epoch": 10.492547906316537, + "grad_norm": 3.1133265495300293, + "learning_rate": 8.951199432221434e-05, + "loss": 0.014603239297866822, + "step": 73920 + }, + { + "epoch": 10.493967352732435, + "grad_norm": 4.840022087097168, + "learning_rate": 8.951057487579844e-05, + "loss": 0.04172016680240631, + "step": 73930 + }, + { + "epoch": 10.495386799148331, + "grad_norm": 3.6252007484436035, + "learning_rate": 8.950915542938254e-05, + "loss": 0.0229099839925766, + "step": 73940 + }, + { + "epoch": 10.49680624556423, + "grad_norm": 0.875785768032074, + "learning_rate": 8.950773598296665e-05, + "loss": 0.0125528022646904, + "step": 73950 + }, + { + "epoch": 10.498225691980128, + "grad_norm": 2.1686460971832275, + "learning_rate": 8.950631653655075e-05, + "loss": 0.01959478259086609, + "step": 73960 + }, + { + "epoch": 10.499645138396026, + "grad_norm": 2.5842530727386475, + "learning_rate": 8.950489709013486e-05, + "loss": 0.01581704616546631, + "step": 73970 + }, + { + "epoch": 10.501064584811923, + "grad_norm": 0.9878895878791809, + "learning_rate": 8.950347764371896e-05, + "loss": 0.029901912808418273, + "step": 73980 + }, + { + "epoch": 10.502484031227821, + "grad_norm": 0.5379632115364075, + "learning_rate": 8.950205819730305e-05, + "loss": 0.035832139849662784, + "step": 73990 + }, + { + "epoch": 10.50390347764372, + "grad_norm": 0.46223098039627075, + "learning_rate": 8.950063875088715e-05, + "loss": 0.02706916332244873, + "step": 74000 + }, + { + "epoch": 10.50390347764372, + "eval_accuracy": 0.9834679214090418, + "eval_loss": 0.05921155586838722, + "eval_runtime": 32.9761, + "eval_samples_per_second": 476.921, + "eval_steps_per_second": 14.92, + "step": 74000 + }, + { + "epoch": 10.505322924059616, + 
"grad_norm": 0.26483798027038574, + "learning_rate": 8.949921930447126e-05, + "loss": 0.01467936635017395, + "step": 74010 + }, + { + "epoch": 10.506742370475514, + "grad_norm": 0.147200345993042, + "learning_rate": 8.949779985805536e-05, + "loss": 0.012765195965766907, + "step": 74020 + }, + { + "epoch": 10.508161816891413, + "grad_norm": 1.744972586631775, + "learning_rate": 8.949638041163947e-05, + "loss": 0.06021806001663208, + "step": 74030 + }, + { + "epoch": 10.509581263307311, + "grad_norm": 8.56665325164795, + "learning_rate": 8.949496096522357e-05, + "loss": 0.05351743698120117, + "step": 74040 + }, + { + "epoch": 10.511000709723207, + "grad_norm": 0.1041315421462059, + "learning_rate": 8.949354151880767e-05, + "loss": 0.04573552906513214, + "step": 74050 + }, + { + "epoch": 10.512420156139106, + "grad_norm": 5.440675258636475, + "learning_rate": 8.949212207239178e-05, + "loss": 0.04139101505279541, + "step": 74060 + }, + { + "epoch": 10.513839602555004, + "grad_norm": 2.8024065494537354, + "learning_rate": 8.949070262597587e-05, + "loss": 0.06140284538269043, + "step": 74070 + }, + { + "epoch": 10.5152590489709, + "grad_norm": 10.5914945602417, + "learning_rate": 8.948928317955998e-05, + "loss": 0.07495509386062622, + "step": 74080 + }, + { + "epoch": 10.516678495386799, + "grad_norm": 1.458267331123352, + "learning_rate": 8.948786373314407e-05, + "loss": 0.0535413384437561, + "step": 74090 + }, + { + "epoch": 10.518097941802697, + "grad_norm": 2.3449082374572754, + "learning_rate": 8.948644428672818e-05, + "loss": 0.04176829755306244, + "step": 74100 + }, + { + "epoch": 10.519517388218595, + "grad_norm": 0.42052292823791504, + "learning_rate": 8.948502484031228e-05, + "loss": 0.011891970038414001, + "step": 74110 + }, + { + "epoch": 10.520936834634492, + "grad_norm": 7.065118312835693, + "learning_rate": 8.948360539389639e-05, + "loss": 0.052879738807678225, + "step": 74120 + }, + { + "epoch": 10.52235628105039, + "grad_norm": 0.9390953183174133, + 
"learning_rate": 8.948218594748048e-05, + "loss": 0.04085269868373871, + "step": 74130 + }, + { + "epoch": 10.523775727466289, + "grad_norm": 0.5752598643302917, + "learning_rate": 8.94807665010646e-05, + "loss": 0.025446805357933044, + "step": 74140 + }, + { + "epoch": 10.525195173882185, + "grad_norm": 5.54420804977417, + "learning_rate": 8.947934705464869e-05, + "loss": 0.07761828303337097, + "step": 74150 + }, + { + "epoch": 10.526614620298083, + "grad_norm": 8.482158660888672, + "learning_rate": 8.947792760823279e-05, + "loss": 0.015085341036319732, + "step": 74160 + }, + { + "epoch": 10.528034066713982, + "grad_norm": 0.253826767206192, + "learning_rate": 8.94765081618169e-05, + "loss": 0.05122783184051514, + "step": 74170 + }, + { + "epoch": 10.52945351312988, + "grad_norm": 10.227400779724121, + "learning_rate": 8.9475088715401e-05, + "loss": 0.06506451368331909, + "step": 74180 + }, + { + "epoch": 10.530872959545777, + "grad_norm": 6.364181995391846, + "learning_rate": 8.947366926898511e-05, + "loss": 0.012909649312496186, + "step": 74190 + }, + { + "epoch": 10.532292405961675, + "grad_norm": 1.6474579572677612, + "learning_rate": 8.94722498225692e-05, + "loss": 0.058185654878616336, + "step": 74200 + }, + { + "epoch": 10.533711852377573, + "grad_norm": 0.05176066979765892, + "learning_rate": 8.94708303761533e-05, + "loss": 0.024706798791885375, + "step": 74210 + }, + { + "epoch": 10.53513129879347, + "grad_norm": 0.15353232622146606, + "learning_rate": 8.94694109297374e-05, + "loss": 0.010836786031723023, + "step": 74220 + }, + { + "epoch": 10.536550745209368, + "grad_norm": 1.6580612659454346, + "learning_rate": 8.946799148332151e-05, + "loss": 0.018438754975795744, + "step": 74230 + }, + { + "epoch": 10.537970191625266, + "grad_norm": 0.47383174300193787, + "learning_rate": 8.946657203690561e-05, + "loss": 0.019079934060573577, + "step": 74240 + }, + { + "epoch": 10.539389638041165, + "grad_norm": 0.06103862076997757, + "learning_rate": 
8.946515259048971e-05, + "loss": 0.017451618611812592, + "step": 74250 + }, + { + "epoch": 10.540809084457061, + "grad_norm": 0.028250914067029953, + "learning_rate": 8.946373314407382e-05, + "loss": 0.018591858446598053, + "step": 74260 + }, + { + "epoch": 10.54222853087296, + "grad_norm": 0.04944806545972824, + "learning_rate": 8.946231369765792e-05, + "loss": 0.026098889112472535, + "step": 74270 + }, + { + "epoch": 10.543647977288858, + "grad_norm": 0.3458721935749054, + "learning_rate": 8.946089425124203e-05, + "loss": 0.046861696243286136, + "step": 74280 + }, + { + "epoch": 10.545067423704754, + "grad_norm": 1.2632274627685547, + "learning_rate": 8.945947480482612e-05, + "loss": 0.017438746988773346, + "step": 74290 + }, + { + "epoch": 10.546486870120653, + "grad_norm": 1.1462650299072266, + "learning_rate": 8.945805535841022e-05, + "loss": 0.018086281418800355, + "step": 74300 + }, + { + "epoch": 10.547906316536551, + "grad_norm": 2.7778356075286865, + "learning_rate": 8.945663591199432e-05, + "loss": 0.04267987906932831, + "step": 74310 + }, + { + "epoch": 10.54932576295245, + "grad_norm": 1.8293638229370117, + "learning_rate": 8.945521646557843e-05, + "loss": 0.059062355756759645, + "step": 74320 + }, + { + "epoch": 10.550745209368346, + "grad_norm": 7.1552019119262695, + "learning_rate": 8.945379701916253e-05, + "loss": 0.04298066794872284, + "step": 74330 + }, + { + "epoch": 10.552164655784244, + "grad_norm": 6.903208255767822, + "learning_rate": 8.945237757274664e-05, + "loss": 0.020708955824375153, + "step": 74340 + }, + { + "epoch": 10.553584102200142, + "grad_norm": 0.5696495175361633, + "learning_rate": 8.945095812633074e-05, + "loss": 0.027511507272720337, + "step": 74350 + }, + { + "epoch": 10.555003548616039, + "grad_norm": 1.9177380800247192, + "learning_rate": 8.944953867991483e-05, + "loss": 0.016340428590774538, + "step": 74360 + }, + { + "epoch": 10.556422995031937, + "grad_norm": 0.17848406732082367, + "learning_rate": 
8.944811923349894e-05, + "loss": 0.04354447424411774, + "step": 74370 + }, + { + "epoch": 10.557842441447836, + "grad_norm": 5.381374835968018, + "learning_rate": 8.944669978708304e-05, + "loss": 0.03182124495506287, + "step": 74380 + }, + { + "epoch": 10.559261887863734, + "grad_norm": 4.705573558807373, + "learning_rate": 8.944528034066715e-05, + "loss": 0.029999750852584838, + "step": 74390 + }, + { + "epoch": 10.56068133427963, + "grad_norm": 0.5373235940933228, + "learning_rate": 8.944386089425124e-05, + "loss": 0.03394618630409241, + "step": 74400 + }, + { + "epoch": 10.562100780695529, + "grad_norm": 0.5422061681747437, + "learning_rate": 8.944244144783535e-05, + "loss": 0.01418820470571518, + "step": 74410 + }, + { + "epoch": 10.563520227111427, + "grad_norm": 5.353031635284424, + "learning_rate": 8.944102200141944e-05, + "loss": 0.017679476737976076, + "step": 74420 + }, + { + "epoch": 10.564939673527324, + "grad_norm": 0.4547473192214966, + "learning_rate": 8.943960255500356e-05, + "loss": 0.029423204064369202, + "step": 74430 + }, + { + "epoch": 10.566359119943222, + "grad_norm": 6.3978166580200195, + "learning_rate": 8.943818310858765e-05, + "loss": 0.07245333194732666, + "step": 74440 + }, + { + "epoch": 10.56777856635912, + "grad_norm": 0.7407926321029663, + "learning_rate": 8.943676366217175e-05, + "loss": 0.03220914006233215, + "step": 74450 + }, + { + "epoch": 10.569198012775018, + "grad_norm": 6.225142478942871, + "learning_rate": 8.943548616039745e-05, + "loss": 0.021983492374420165, + "step": 74460 + }, + { + "epoch": 10.570617459190915, + "grad_norm": 0.22022587060928345, + "learning_rate": 8.943406671398156e-05, + "loss": 0.017795734107494354, + "step": 74470 + }, + { + "epoch": 10.572036905606813, + "grad_norm": 5.597069263458252, + "learning_rate": 8.943264726756566e-05, + "loss": 0.01740722209215164, + "step": 74480 + }, + { + "epoch": 10.573456352022712, + "grad_norm": 1.1836518049240112, + "learning_rate": 8.943122782114975e-05, + "loss": 
0.03079001307487488, + "step": 74490 + }, + { + "epoch": 10.574875798438608, + "grad_norm": 15.553241729736328, + "learning_rate": 8.942980837473386e-05, + "loss": 0.07134093046188354, + "step": 74500 + }, + { + "epoch": 10.574875798438608, + "eval_accuracy": 0.9783175430787817, + "eval_loss": 0.07355938851833344, + "eval_runtime": 33.2908, + "eval_samples_per_second": 472.412, + "eval_steps_per_second": 14.779, + "step": 74500 + }, + { + "epoch": 10.576295244854506, + "grad_norm": 13.643026351928711, + "learning_rate": 8.942838892831796e-05, + "loss": 0.09327298998832703, + "step": 74510 + }, + { + "epoch": 10.577714691270405, + "grad_norm": 9.596179008483887, + "learning_rate": 8.942696948190207e-05, + "loss": 0.04536471366882324, + "step": 74520 + }, + { + "epoch": 10.579134137686303, + "grad_norm": 1.495818853378296, + "learning_rate": 8.942555003548616e-05, + "loss": 0.019459769129753113, + "step": 74530 + }, + { + "epoch": 10.5805535841022, + "grad_norm": 7.002575397491455, + "learning_rate": 8.942413058907027e-05, + "loss": 0.02139394134283066, + "step": 74540 + }, + { + "epoch": 10.581973030518098, + "grad_norm": 0.691582977771759, + "learning_rate": 8.942271114265437e-05, + "loss": 0.04519861042499542, + "step": 74550 + }, + { + "epoch": 10.583392476933996, + "grad_norm": 4.451111316680908, + "learning_rate": 8.942129169623848e-05, + "loss": 0.05654475688934326, + "step": 74560 + }, + { + "epoch": 10.584811923349893, + "grad_norm": 1.640137791633606, + "learning_rate": 8.941987224982257e-05, + "loss": 0.06334338784217834, + "step": 74570 + }, + { + "epoch": 10.586231369765791, + "grad_norm": 0.7545045614242554, + "learning_rate": 8.941845280340667e-05, + "loss": 0.02559314966201782, + "step": 74580 + }, + { + "epoch": 10.58765081618169, + "grad_norm": 2.474801540374756, + "learning_rate": 8.941703335699078e-05, + "loss": 0.048035275936126706, + "step": 74590 + }, + { + "epoch": 10.589070262597588, + "grad_norm": 0.6320196986198425, + "learning_rate": 
8.941561391057488e-05, + "loss": 0.01585846096277237, + "step": 74600 + }, + { + "epoch": 10.590489709013484, + "grad_norm": 1.918588399887085, + "learning_rate": 8.941419446415899e-05, + "loss": 0.020170879364013673, + "step": 74610 + }, + { + "epoch": 10.591909155429382, + "grad_norm": 1.1940841674804688, + "learning_rate": 8.941277501774309e-05, + "loss": 0.06623907089233398, + "step": 74620 + }, + { + "epoch": 10.59332860184528, + "grad_norm": 8.052373886108398, + "learning_rate": 8.941135557132718e-05, + "loss": 0.03799133002758026, + "step": 74630 + }, + { + "epoch": 10.594748048261177, + "grad_norm": 0.05093760788440704, + "learning_rate": 8.940993612491128e-05, + "loss": 0.058805429935455324, + "step": 74640 + }, + { + "epoch": 10.596167494677076, + "grad_norm": 1.0790239572525024, + "learning_rate": 8.94085166784954e-05, + "loss": 0.02620888352394104, + "step": 74650 + }, + { + "epoch": 10.597586941092974, + "grad_norm": 0.16304579377174377, + "learning_rate": 8.940709723207949e-05, + "loss": 0.0440018504858017, + "step": 74660 + }, + { + "epoch": 10.599006387508872, + "grad_norm": 9.762938499450684, + "learning_rate": 8.94056777856636e-05, + "loss": 0.01490759253501892, + "step": 74670 + }, + { + "epoch": 10.600425833924769, + "grad_norm": 6.83378267288208, + "learning_rate": 8.94042583392477e-05, + "loss": 0.022882431745529175, + "step": 74680 + }, + { + "epoch": 10.601845280340667, + "grad_norm": 5.488767147064209, + "learning_rate": 8.94028388928318e-05, + "loss": 0.01500108242034912, + "step": 74690 + }, + { + "epoch": 10.603264726756565, + "grad_norm": 3.6321401596069336, + "learning_rate": 8.940141944641591e-05, + "loss": 0.017778295278549194, + "step": 74700 + }, + { + "epoch": 10.604684173172462, + "grad_norm": 0.013210363686084747, + "learning_rate": 8.94e-05, + "loss": 0.007808870077133179, + "step": 74710 + }, + { + "epoch": 10.60610361958836, + "grad_norm": 1.6923620700836182, + "learning_rate": 8.939858055358412e-05, + "loss": 
0.03966827690601349, + "step": 74720 + }, + { + "epoch": 10.607523066004259, + "grad_norm": 1.4342869520187378, + "learning_rate": 8.93971611071682e-05, + "loss": 0.013448211550712585, + "step": 74730 + }, + { + "epoch": 10.608942512420157, + "grad_norm": 0.07803019136190414, + "learning_rate": 8.939574166075231e-05, + "loss": 0.03317614197731018, + "step": 74740 + }, + { + "epoch": 10.610361958836053, + "grad_norm": 0.46355992555618286, + "learning_rate": 8.939432221433641e-05, + "loss": 0.01559600830078125, + "step": 74750 + }, + { + "epoch": 10.611781405251952, + "grad_norm": 18.4334659576416, + "learning_rate": 8.939290276792052e-05, + "loss": 0.06782117486000061, + "step": 74760 + }, + { + "epoch": 10.61320085166785, + "grad_norm": 1.2076555490493774, + "learning_rate": 8.939148332150462e-05, + "loss": 0.020013023912906647, + "step": 74770 + }, + { + "epoch": 10.614620298083747, + "grad_norm": 0.34689363837242126, + "learning_rate": 8.939006387508871e-05, + "loss": 0.040333092212677, + "step": 74780 + }, + { + "epoch": 10.616039744499645, + "grad_norm": 3.217885971069336, + "learning_rate": 8.938864442867282e-05, + "loss": 0.038464948534965515, + "step": 74790 + }, + { + "epoch": 10.617459190915543, + "grad_norm": 0.19117842614650726, + "learning_rate": 8.938722498225692e-05, + "loss": 0.025781697034835814, + "step": 74800 + }, + { + "epoch": 10.618878637331441, + "grad_norm": 0.020094774663448334, + "learning_rate": 8.938580553584103e-05, + "loss": 0.035686278343200685, + "step": 74810 + }, + { + "epoch": 10.620298083747338, + "grad_norm": 0.07451222836971283, + "learning_rate": 8.938438608942513e-05, + "loss": 0.06001535654067993, + "step": 74820 + }, + { + "epoch": 10.621717530163236, + "grad_norm": 2.8348300457000732, + "learning_rate": 8.938296664300924e-05, + "loss": 0.014533805847167968, + "step": 74830 + }, + { + "epoch": 10.623136976579135, + "grad_norm": 1.1169466972351074, + "learning_rate": 8.938154719659332e-05, + "loss": 0.028077208995819093, + 
"step": 74840 + }, + { + "epoch": 10.624556422995031, + "grad_norm": 0.6225524544715881, + "learning_rate": 8.938012775017744e-05, + "loss": 0.012209897488355636, + "step": 74850 + }, + { + "epoch": 10.62597586941093, + "grad_norm": 0.010887812823057175, + "learning_rate": 8.937870830376153e-05, + "loss": 0.03707170486450195, + "step": 74860 + }, + { + "epoch": 10.627395315826828, + "grad_norm": 0.1151101142168045, + "learning_rate": 8.937728885734564e-05, + "loss": 0.03600144684314728, + "step": 74870 + }, + { + "epoch": 10.628814762242726, + "grad_norm": 8.231071472167969, + "learning_rate": 8.937586941092974e-05, + "loss": 0.053849917650222776, + "step": 74880 + }, + { + "epoch": 10.630234208658623, + "grad_norm": 1.1308414936065674, + "learning_rate": 8.937444996451384e-05, + "loss": 0.02963399589061737, + "step": 74890 + }, + { + "epoch": 10.63165365507452, + "grad_norm": 0.0716499611735344, + "learning_rate": 8.937303051809795e-05, + "loss": 0.018058374524116516, + "step": 74900 + }, + { + "epoch": 10.63307310149042, + "grad_norm": 3.5925307273864746, + "learning_rate": 8.937161107168205e-05, + "loss": 0.02541220486164093, + "step": 74910 + }, + { + "epoch": 10.634492547906316, + "grad_norm": 0.12991972267627716, + "learning_rate": 8.937019162526616e-05, + "loss": 0.03352646231651306, + "step": 74920 + }, + { + "epoch": 10.635911994322214, + "grad_norm": 9.830734252929688, + "learning_rate": 8.936877217885026e-05, + "loss": 0.06148951053619385, + "step": 74930 + }, + { + "epoch": 10.637331440738112, + "grad_norm": 1.4155148267745972, + "learning_rate": 8.936735273243435e-05, + "loss": 0.015182647109031677, + "step": 74940 + }, + { + "epoch": 10.63875088715401, + "grad_norm": 0.11837710440158844, + "learning_rate": 8.936593328601845e-05, + "loss": 0.016064786911010744, + "step": 74950 + }, + { + "epoch": 10.640170333569907, + "grad_norm": 1.7777584791183472, + "learning_rate": 8.936451383960256e-05, + "loss": 0.040622872114181516, + "step": 74960 + }, + { + 
"epoch": 10.641589779985805, + "grad_norm": 0.8232346773147583, + "learning_rate": 8.936309439318666e-05, + "loss": 0.02240632474422455, + "step": 74970 + }, + { + "epoch": 10.643009226401704, + "grad_norm": 0.6091140508651733, + "learning_rate": 8.936167494677077e-05, + "loss": 0.01667594611644745, + "step": 74980 + }, + { + "epoch": 10.6444286728176, + "grad_norm": 0.2595471441745758, + "learning_rate": 8.936025550035487e-05, + "loss": 0.022316190600395202, + "step": 74990 + }, + { + "epoch": 10.645848119233499, + "grad_norm": 0.0824456438422203, + "learning_rate": 8.935883605393896e-05, + "loss": 0.04005849361419678, + "step": 75000 + }, + { + "epoch": 10.645848119233499, + "eval_accuracy": 0.9822598079735487, + "eval_loss": 0.05868366360664368, + "eval_runtime": 35.5358, + "eval_samples_per_second": 442.568, + "eval_steps_per_second": 13.845, + "step": 75000 + }, + { + "epoch": 10.647267565649397, + "grad_norm": 0.06293433904647827, + "learning_rate": 8.935741660752307e-05, + "loss": 0.01167212575674057, + "step": 75010 + }, + { + "epoch": 10.648687012065295, + "grad_norm": 3.401526927947998, + "learning_rate": 8.935599716110717e-05, + "loss": 0.02366384267807007, + "step": 75020 + }, + { + "epoch": 10.650106458481192, + "grad_norm": 1.3368604183197021, + "learning_rate": 8.935457771469128e-05, + "loss": 0.030054858326911925, + "step": 75030 + }, + { + "epoch": 10.65152590489709, + "grad_norm": 0.2567216157913208, + "learning_rate": 8.935315826827537e-05, + "loss": 0.009073252975940704, + "step": 75040 + }, + { + "epoch": 10.652945351312988, + "grad_norm": 6.668602466583252, + "learning_rate": 8.935173882185948e-05, + "loss": 0.02904718816280365, + "step": 75050 + }, + { + "epoch": 10.654364797728885, + "grad_norm": 0.33078089356422424, + "learning_rate": 8.935031937544358e-05, + "loss": 0.006366993486881256, + "step": 75060 + }, + { + "epoch": 10.655784244144783, + "grad_norm": 0.9511492252349854, + "learning_rate": 8.934889992902769e-05, + "loss": 
0.05331156253814697, + "step": 75070 + }, + { + "epoch": 10.657203690560682, + "grad_norm": 9.841053009033203, + "learning_rate": 8.934748048261178e-05, + "loss": 0.04374881982803345, + "step": 75080 + }, + { + "epoch": 10.65862313697658, + "grad_norm": 1.8360716104507446, + "learning_rate": 8.934606103619588e-05, + "loss": 0.05705385804176331, + "step": 75090 + }, + { + "epoch": 10.660042583392476, + "grad_norm": 6.725709915161133, + "learning_rate": 8.934464158977999e-05, + "loss": 0.023094192147254944, + "step": 75100 + }, + { + "epoch": 10.661462029808375, + "grad_norm": 0.10598300397396088, + "learning_rate": 8.934322214336409e-05, + "loss": 0.011065666377544404, + "step": 75110 + }, + { + "epoch": 10.662881476224273, + "grad_norm": 0.052498895674943924, + "learning_rate": 8.93418026969482e-05, + "loss": 0.00460980124771595, + "step": 75120 + }, + { + "epoch": 10.66430092264017, + "grad_norm": 0.47180619835853577, + "learning_rate": 8.93403832505323e-05, + "loss": 0.02264900803565979, + "step": 75130 + }, + { + "epoch": 10.665720369056068, + "grad_norm": 1.3694490194320679, + "learning_rate": 8.93389638041164e-05, + "loss": 0.05580574870109558, + "step": 75140 + }, + { + "epoch": 10.667139815471966, + "grad_norm": 1.9044922590255737, + "learning_rate": 8.933754435770049e-05, + "loss": 0.012723922729492188, + "step": 75150 + }, + { + "epoch": 10.668559261887864, + "grad_norm": 0.10306615382432938, + "learning_rate": 8.93361249112846e-05, + "loss": 0.03403681516647339, + "step": 75160 + }, + { + "epoch": 10.669978708303761, + "grad_norm": 0.0405077300965786, + "learning_rate": 8.93347054648687e-05, + "loss": 0.047677081823348996, + "step": 75170 + }, + { + "epoch": 10.67139815471966, + "grad_norm": 1.0609707832336426, + "learning_rate": 8.933328601845281e-05, + "loss": 0.015549467504024505, + "step": 75180 + }, + { + "epoch": 10.672817601135558, + "grad_norm": 3.3458545207977295, + "learning_rate": 8.933186657203691e-05, + "loss": 0.016075128316879274, + "step": 
75190 + }, + { + "epoch": 10.674237047551454, + "grad_norm": 3.9271702766418457, + "learning_rate": 8.9330447125621e-05, + "loss": 0.044463536143302916, + "step": 75200 + }, + { + "epoch": 10.675656493967352, + "grad_norm": 0.6383125185966492, + "learning_rate": 8.932902767920512e-05, + "loss": 0.059473490715026854, + "step": 75210 + }, + { + "epoch": 10.67707594038325, + "grad_norm": 0.0646110400557518, + "learning_rate": 8.932760823278921e-05, + "loss": 0.005915617942810059, + "step": 75220 + }, + { + "epoch": 10.678495386799149, + "grad_norm": 1.6763479709625244, + "learning_rate": 8.932618878637333e-05, + "loss": 0.08584545850753784, + "step": 75230 + }, + { + "epoch": 10.679914833215046, + "grad_norm": 2.803025484085083, + "learning_rate": 8.932476933995742e-05, + "loss": 0.012907765805721283, + "step": 75240 + }, + { + "epoch": 10.681334279630944, + "grad_norm": 4.714158535003662, + "learning_rate": 8.932334989354152e-05, + "loss": 0.014616544544696807, + "step": 75250 + }, + { + "epoch": 10.682753726046842, + "grad_norm": 6.278298854827881, + "learning_rate": 8.932193044712562e-05, + "loss": 0.08344311714172363, + "step": 75260 + }, + { + "epoch": 10.684173172462739, + "grad_norm": 0.2862883508205414, + "learning_rate": 8.932051100070973e-05, + "loss": 0.0512550950050354, + "step": 75270 + }, + { + "epoch": 10.685592618878637, + "grad_norm": 1.354404091835022, + "learning_rate": 8.931909155429383e-05, + "loss": 0.0363610714673996, + "step": 75280 + }, + { + "epoch": 10.687012065294535, + "grad_norm": 2.309035062789917, + "learning_rate": 8.931767210787794e-05, + "loss": 0.033329719305038454, + "step": 75290 + }, + { + "epoch": 10.688431511710434, + "grad_norm": 0.763126015663147, + "learning_rate": 8.931625266146203e-05, + "loss": 0.00558549165725708, + "step": 75300 + }, + { + "epoch": 10.68985095812633, + "grad_norm": 2.8885669708251953, + "learning_rate": 8.931483321504613e-05, + "loss": 0.022483193874359132, + "step": 75310 + }, + { + "epoch": 
10.691270404542228, + "grad_norm": 1.614470362663269, + "learning_rate": 8.931341376863024e-05, + "loss": 0.046516886353492735, + "step": 75320 + }, + { + "epoch": 10.692689850958127, + "grad_norm": 0.4518176019191742, + "learning_rate": 8.931199432221434e-05, + "loss": 0.016953733563423157, + "step": 75330 + }, + { + "epoch": 10.694109297374023, + "grad_norm": 0.07498191297054291, + "learning_rate": 8.931057487579845e-05, + "loss": 0.0493558257818222, + "step": 75340 + }, + { + "epoch": 10.695528743789922, + "grad_norm": 1.2771762609481812, + "learning_rate": 8.930915542938253e-05, + "loss": 0.033792906999588014, + "step": 75350 + }, + { + "epoch": 10.69694819020582, + "grad_norm": 5.1456403732299805, + "learning_rate": 8.930773598296665e-05, + "loss": 0.0450606644153595, + "step": 75360 + }, + { + "epoch": 10.698367636621718, + "grad_norm": 16.016571044921875, + "learning_rate": 8.930631653655074e-05, + "loss": 0.060096734762191774, + "step": 75370 + }, + { + "epoch": 10.699787083037615, + "grad_norm": 0.5739765167236328, + "learning_rate": 8.930489709013485e-05, + "loss": 0.09587665796279907, + "step": 75380 + }, + { + "epoch": 10.701206529453513, + "grad_norm": 2.421649217605591, + "learning_rate": 8.930347764371895e-05, + "loss": 0.01479882448911667, + "step": 75390 + }, + { + "epoch": 10.702625975869411, + "grad_norm": 0.6501554846763611, + "learning_rate": 8.930205819730305e-05, + "loss": 0.02887186110019684, + "step": 75400 + }, + { + "epoch": 10.704045422285308, + "grad_norm": 0.08935364335775375, + "learning_rate": 8.930063875088716e-05, + "loss": 0.018288043141365052, + "step": 75410 + }, + { + "epoch": 10.705464868701206, + "grad_norm": 9.896994590759277, + "learning_rate": 8.929921930447126e-05, + "loss": 0.01711161434650421, + "step": 75420 + }, + { + "epoch": 10.706884315117104, + "grad_norm": 4.586911678314209, + "learning_rate": 8.929779985805537e-05, + "loss": 0.004791490733623505, + "step": 75430 + }, + { + "epoch": 10.708303761533003, + 
"grad_norm": 2.6947715282440186, + "learning_rate": 8.929638041163947e-05, + "loss": 0.04513532817363739, + "step": 75440 + }, + { + "epoch": 10.7097232079489, + "grad_norm": 0.541801393032074, + "learning_rate": 8.929496096522356e-05, + "loss": 0.03101123869419098, + "step": 75450 + }, + { + "epoch": 10.711142654364798, + "grad_norm": 1.2876341342926025, + "learning_rate": 8.929354151880766e-05, + "loss": 0.01341709792613983, + "step": 75460 + }, + { + "epoch": 10.712562100780696, + "grad_norm": 4.885845184326172, + "learning_rate": 8.929212207239177e-05, + "loss": 0.007390654087066651, + "step": 75470 + }, + { + "epoch": 10.713981547196592, + "grad_norm": 0.015270525589585304, + "learning_rate": 8.929070262597587e-05, + "loss": 0.015379874408245087, + "step": 75480 + }, + { + "epoch": 10.71540099361249, + "grad_norm": 3.687673568725586, + "learning_rate": 8.928928317955998e-05, + "loss": 0.005444001033902168, + "step": 75490 + }, + { + "epoch": 10.716820440028389, + "grad_norm": 10.40943717956543, + "learning_rate": 8.928786373314408e-05, + "loss": 0.035025835037231445, + "step": 75500 + }, + { + "epoch": 10.716820440028389, + "eval_accuracy": 0.9825777325618363, + "eval_loss": 0.05798032879829407, + "eval_runtime": 35.3118, + "eval_samples_per_second": 445.375, + "eval_steps_per_second": 13.933, + "step": 75500 + }, + { + "epoch": 10.718239886444287, + "grad_norm": 0.14924176037311554, + "learning_rate": 8.928644428672817e-05, + "loss": 0.02595347762107849, + "step": 75510 + }, + { + "epoch": 10.719659332860184, + "grad_norm": 7.112300872802734, + "learning_rate": 8.928502484031229e-05, + "loss": 0.04068517088890076, + "step": 75520 + }, + { + "epoch": 10.721078779276082, + "grad_norm": 0.46678414940834045, + "learning_rate": 8.928360539389638e-05, + "loss": 0.06199964880943298, + "step": 75530 + }, + { + "epoch": 10.72249822569198, + "grad_norm": 7.660800457000732, + "learning_rate": 8.92821859474805e-05, + "loss": 0.06672753095626831, + "step": 75540 + }, + { 
+ "epoch": 10.723917672107877, + "grad_norm": 0.13635525107383728, + "learning_rate": 8.928076650106459e-05, + "loss": 0.0058893729001283646, + "step": 75550 + }, + { + "epoch": 10.725337118523775, + "grad_norm": 0.050406236201524734, + "learning_rate": 8.927934705464869e-05, + "loss": 0.02629355192184448, + "step": 75560 + }, + { + "epoch": 10.726756564939674, + "grad_norm": 2.4663329124450684, + "learning_rate": 8.927792760823279e-05, + "loss": 0.02295355051755905, + "step": 75570 + }, + { + "epoch": 10.728176011355572, + "grad_norm": 1.0391991138458252, + "learning_rate": 8.92765081618169e-05, + "loss": 0.023616223037242888, + "step": 75580 + }, + { + "epoch": 10.729595457771469, + "grad_norm": 13.274736404418945, + "learning_rate": 8.927508871540101e-05, + "loss": 0.07250704765319824, + "step": 75590 + }, + { + "epoch": 10.731014904187367, + "grad_norm": 1.5516211986541748, + "learning_rate": 8.92736692689851e-05, + "loss": 0.027104687690734864, + "step": 75600 + }, + { + "epoch": 10.732434350603265, + "grad_norm": 0.705222487449646, + "learning_rate": 8.92722498225692e-05, + "loss": 0.04327774345874787, + "step": 75610 + }, + { + "epoch": 10.733853797019162, + "grad_norm": 0.7016850709915161, + "learning_rate": 8.92708303761533e-05, + "loss": 0.023375515639781953, + "step": 75620 + }, + { + "epoch": 10.73527324343506, + "grad_norm": 7.4501776695251465, + "learning_rate": 8.926941092973741e-05, + "loss": 0.06602050065994262, + "step": 75630 + }, + { + "epoch": 10.736692689850958, + "grad_norm": 0.02406780607998371, + "learning_rate": 8.926799148332151e-05, + "loss": 0.052145916223526004, + "step": 75640 + }, + { + "epoch": 10.738112136266857, + "grad_norm": 7.1097331047058105, + "learning_rate": 8.926657203690562e-05, + "loss": 0.04959434568881989, + "step": 75650 + }, + { + "epoch": 10.739531582682753, + "grad_norm": 0.3886570632457733, + "learning_rate": 8.92651525904897e-05, + "loss": 0.01884925365447998, + "step": 75660 + }, + { + "epoch": 
10.740951029098651, + "grad_norm": 13.658404350280762, + "learning_rate": 8.926373314407381e-05, + "loss": 0.03599739670753479, + "step": 75670 + }, + { + "epoch": 10.74237047551455, + "grad_norm": 4.648027420043945, + "learning_rate": 8.926231369765792e-05, + "loss": 0.041656050086021426, + "step": 75680 + }, + { + "epoch": 10.743789921930446, + "grad_norm": 0.9928642511367798, + "learning_rate": 8.926089425124202e-05, + "loss": 0.03774539828300476, + "step": 75690 + }, + { + "epoch": 10.745209368346345, + "grad_norm": 0.8404362797737122, + "learning_rate": 8.925947480482613e-05, + "loss": 0.04949051737785339, + "step": 75700 + }, + { + "epoch": 10.746628814762243, + "grad_norm": 0.06839687377214432, + "learning_rate": 8.925805535841022e-05, + "loss": 0.017173881828784942, + "step": 75710 + }, + { + "epoch": 10.748048261178141, + "grad_norm": 0.04905456304550171, + "learning_rate": 8.925663591199433e-05, + "loss": 0.07567955255508423, + "step": 75720 + }, + { + "epoch": 10.749467707594038, + "grad_norm": 0.1964964121580124, + "learning_rate": 8.925521646557842e-05, + "loss": 0.019594097137451173, + "step": 75730 + }, + { + "epoch": 10.750887154009936, + "grad_norm": 2.3201723098754883, + "learning_rate": 8.925379701916254e-05, + "loss": 0.02814412713050842, + "step": 75740 + }, + { + "epoch": 10.752306600425834, + "grad_norm": 9.624185562133789, + "learning_rate": 8.925237757274663e-05, + "loss": 0.02252240777015686, + "step": 75750 + }, + { + "epoch": 10.75372604684173, + "grad_norm": 0.9482161402702332, + "learning_rate": 8.925095812633073e-05, + "loss": 0.011075331270694733, + "step": 75760 + }, + { + "epoch": 10.75514549325763, + "grad_norm": 4.261680603027344, + "learning_rate": 8.924953867991484e-05, + "loss": 0.012068639695644378, + "step": 75770 + }, + { + "epoch": 10.756564939673527, + "grad_norm": 2.8619332313537598, + "learning_rate": 8.924811923349894e-05, + "loss": 0.04735492467880249, + "step": 75780 + }, + { + "epoch": 10.757984386089426, + 
"grad_norm": 0.07442318648099899, + "learning_rate": 8.924669978708305e-05, + "loss": 0.01848388910293579, + "step": 75790 + }, + { + "epoch": 10.759403832505322, + "grad_norm": 0.33886000514030457, + "learning_rate": 8.924528034066715e-05, + "loss": 0.03830481171607971, + "step": 75800 + }, + { + "epoch": 10.76082327892122, + "grad_norm": 4.740257263183594, + "learning_rate": 8.924386089425124e-05, + "loss": 0.01993533968925476, + "step": 75810 + }, + { + "epoch": 10.762242725337119, + "grad_norm": 4.805813789367676, + "learning_rate": 8.924244144783534e-05, + "loss": 0.016506880521774292, + "step": 75820 + }, + { + "epoch": 10.763662171753015, + "grad_norm": 0.7994409203529358, + "learning_rate": 8.924102200141945e-05, + "loss": 0.020026545226573943, + "step": 75830 + }, + { + "epoch": 10.765081618168914, + "grad_norm": 0.1343299001455307, + "learning_rate": 8.923960255500355e-05, + "loss": 0.01588977873325348, + "step": 75840 + }, + { + "epoch": 10.766501064584812, + "grad_norm": 0.33635425567626953, + "learning_rate": 8.923818310858766e-05, + "loss": 0.015015700459480285, + "step": 75850 + }, + { + "epoch": 10.76792051100071, + "grad_norm": 3.677292585372925, + "learning_rate": 8.923676366217176e-05, + "loss": 0.020980848371982573, + "step": 75860 + }, + { + "epoch": 10.769339957416607, + "grad_norm": 1.9598472118377686, + "learning_rate": 8.923534421575586e-05, + "loss": 0.03935782611370087, + "step": 75870 + }, + { + "epoch": 10.770759403832505, + "grad_norm": 6.468889236450195, + "learning_rate": 8.923392476933997e-05, + "loss": 0.038150209188461306, + "step": 75880 + }, + { + "epoch": 10.772178850248403, + "grad_norm": 0.1789119988679886, + "learning_rate": 8.923250532292406e-05, + "loss": 0.03437398672103882, + "step": 75890 + }, + { + "epoch": 10.7735982966643, + "grad_norm": 4.859720230102539, + "learning_rate": 8.923108587650818e-05, + "loss": 0.04559687972068786, + "step": 75900 + }, + { + "epoch": 10.775017743080198, + "grad_norm": 0.1075957715511322, 
+ "learning_rate": 8.922966643009227e-05, + "loss": 0.022505611181259155, + "step": 75910 + }, + { + "epoch": 10.776437189496097, + "grad_norm": 0.7738917469978333, + "learning_rate": 8.922824698367637e-05, + "loss": 0.02040761262178421, + "step": 75920 + }, + { + "epoch": 10.777856635911995, + "grad_norm": 0.00945677887648344, + "learning_rate": 8.922682753726047e-05, + "loss": 0.07506464719772339, + "step": 75930 + }, + { + "epoch": 10.779276082327891, + "grad_norm": 0.06521529704332352, + "learning_rate": 8.922540809084458e-05, + "loss": 0.017863285541534425, + "step": 75940 + }, + { + "epoch": 10.78069552874379, + "grad_norm": 0.05482591688632965, + "learning_rate": 8.922398864442868e-05, + "loss": 0.009694677591323853, + "step": 75950 + }, + { + "epoch": 10.782114975159688, + "grad_norm": 0.3654983341693878, + "learning_rate": 8.922256919801279e-05, + "loss": 0.05228215456008911, + "step": 75960 + }, + { + "epoch": 10.783534421575585, + "grad_norm": 9.811040878295898, + "learning_rate": 8.922114975159688e-05, + "loss": 0.04277914464473724, + "step": 75970 + }, + { + "epoch": 10.784953867991483, + "grad_norm": 8.080113410949707, + "learning_rate": 8.921973030518098e-05, + "loss": 0.06942117214202881, + "step": 75980 + }, + { + "epoch": 10.786373314407381, + "grad_norm": 3.861632823944092, + "learning_rate": 8.921831085876509e-05, + "loss": 0.04788309037685394, + "step": 75990 + }, + { + "epoch": 10.78779276082328, + "grad_norm": 2.738668203353882, + "learning_rate": 8.921689141234919e-05, + "loss": 0.012685440480709076, + "step": 76000 + }, + { + "epoch": 10.78779276082328, + "eval_accuracy": 0.9816875437146309, + "eval_loss": 0.062259022146463394, + "eval_runtime": 33.1136, + "eval_samples_per_second": 474.94, + "eval_steps_per_second": 14.858, + "step": 76000 + }, + { + "epoch": 10.789212207239176, + "grad_norm": 5.251237869262695, + "learning_rate": 8.92154719659333e-05, + "loss": 0.051242268085479735, + "step": 76010 + }, + { + "epoch": 10.790631653655074, 
+ "grad_norm": 1.256506085395813, + "learning_rate": 8.921405251951738e-05, + "loss": 0.02849005162715912, + "step": 76020 + }, + { + "epoch": 10.792051100070973, + "grad_norm": 8.649620056152344, + "learning_rate": 8.92126330731015e-05, + "loss": 0.021818137168884276, + "step": 76030 + }, + { + "epoch": 10.79347054648687, + "grad_norm": 9.754542350769043, + "learning_rate": 8.921121362668559e-05, + "loss": 0.02822069525718689, + "step": 76040 + }, + { + "epoch": 10.794889992902768, + "grad_norm": 1.492602825164795, + "learning_rate": 8.92097941802697e-05, + "loss": 0.007633008062839508, + "step": 76050 + }, + { + "epoch": 10.796309439318666, + "grad_norm": 0.14724503457546234, + "learning_rate": 8.92083747338538e-05, + "loss": 0.03960501551628113, + "step": 76060 + }, + { + "epoch": 10.797728885734564, + "grad_norm": 10.349737167358398, + "learning_rate": 8.92069552874379e-05, + "loss": 0.034658610820770264, + "step": 76070 + }, + { + "epoch": 10.79914833215046, + "grad_norm": 0.13395637273788452, + "learning_rate": 8.920553584102201e-05, + "loss": 0.021459685266017915, + "step": 76080 + }, + { + "epoch": 10.800567778566359, + "grad_norm": 0.9659720063209534, + "learning_rate": 8.92041163946061e-05, + "loss": 0.0231198787689209, + "step": 76090 + }, + { + "epoch": 10.801987224982257, + "grad_norm": 10.902729034423828, + "learning_rate": 8.920269694819022e-05, + "loss": 0.03658841252326965, + "step": 76100 + }, + { + "epoch": 10.803406671398154, + "grad_norm": 0.12141856551170349, + "learning_rate": 8.920127750177431e-05, + "loss": 0.030691704154014586, + "step": 76110 + }, + { + "epoch": 10.804826117814052, + "grad_norm": 0.9635015726089478, + "learning_rate": 8.919985805535841e-05, + "loss": 0.026519355177879334, + "step": 76120 + }, + { + "epoch": 10.80624556422995, + "grad_norm": 0.17087894678115845, + "learning_rate": 8.919843860894251e-05, + "loss": 0.008873078972101212, + "step": 76130 + }, + { + "epoch": 10.807665010645849, + "grad_norm": 
1.8956565856933594, + "learning_rate": 8.919701916252662e-05, + "loss": 0.0233942449092865, + "step": 76140 + }, + { + "epoch": 10.809084457061745, + "grad_norm": 0.9512972235679626, + "learning_rate": 8.919559971611072e-05, + "loss": 0.05020730495452881, + "step": 76150 + }, + { + "epoch": 10.810503903477644, + "grad_norm": 2.1291935443878174, + "learning_rate": 8.919418026969483e-05, + "loss": 0.06957405805587769, + "step": 76160 + }, + { + "epoch": 10.811923349893542, + "grad_norm": 5.251128196716309, + "learning_rate": 8.919276082327893e-05, + "loss": 0.030325111746788026, + "step": 76170 + }, + { + "epoch": 10.813342796309438, + "grad_norm": 0.18351991474628448, + "learning_rate": 8.919134137686302e-05, + "loss": 0.05944993495941162, + "step": 76180 + }, + { + "epoch": 10.814762242725337, + "grad_norm": 3.2230257987976074, + "learning_rate": 8.918992193044713e-05, + "loss": 0.027699339389801025, + "step": 76190 + }, + { + "epoch": 10.816181689141235, + "grad_norm": 9.277687072753906, + "learning_rate": 8.918850248403123e-05, + "loss": 0.030685320496559143, + "step": 76200 + }, + { + "epoch": 10.817601135557133, + "grad_norm": 8.766088485717773, + "learning_rate": 8.918708303761534e-05, + "loss": 0.018734395503997803, + "step": 76210 + }, + { + "epoch": 10.81902058197303, + "grad_norm": 4.725594997406006, + "learning_rate": 8.918566359119944e-05, + "loss": 0.04566576480865479, + "step": 76220 + }, + { + "epoch": 10.820440028388928, + "grad_norm": 0.19386504590511322, + "learning_rate": 8.918424414478354e-05, + "loss": 0.012554574012756347, + "step": 76230 + }, + { + "epoch": 10.821859474804826, + "grad_norm": 0.6063259243965149, + "learning_rate": 8.918282469836763e-05, + "loss": 0.053751158714294436, + "step": 76240 + }, + { + "epoch": 10.823278921220723, + "grad_norm": 0.2790958285331726, + "learning_rate": 8.918140525195175e-05, + "loss": 0.08109488487243652, + "step": 76250 + }, + { + "epoch": 10.824698367636621, + "grad_norm": 2.9445583820343018, + 
"learning_rate": 8.917998580553584e-05, + "loss": 0.046107107400894166, + "step": 76260 + }, + { + "epoch": 10.82611781405252, + "grad_norm": 1.1821379661560059, + "learning_rate": 8.917856635911995e-05, + "loss": 0.044123581051826476, + "step": 76270 + }, + { + "epoch": 10.827537260468418, + "grad_norm": 0.30668017268180847, + "learning_rate": 8.917714691270405e-05, + "loss": 0.028360003232955934, + "step": 76280 + }, + { + "epoch": 10.828956706884314, + "grad_norm": 0.2103549838066101, + "learning_rate": 8.917572746628815e-05, + "loss": 0.05053573250770569, + "step": 76290 + }, + { + "epoch": 10.830376153300213, + "grad_norm": 0.880430281162262, + "learning_rate": 8.917430801987226e-05, + "loss": 0.01300375759601593, + "step": 76300 + }, + { + "epoch": 10.831795599716111, + "grad_norm": 11.212428092956543, + "learning_rate": 8.917288857345636e-05, + "loss": 0.07399303913116455, + "step": 76310 + }, + { + "epoch": 10.833215046132008, + "grad_norm": 0.44175541400909424, + "learning_rate": 8.917146912704047e-05, + "loss": 0.05799729228019714, + "step": 76320 + }, + { + "epoch": 10.834634492547906, + "grad_norm": 4.154860973358154, + "learning_rate": 8.917004968062455e-05, + "loss": 0.03559871315956116, + "step": 76330 + }, + { + "epoch": 10.836053938963804, + "grad_norm": 0.07235050946474075, + "learning_rate": 8.916863023420866e-05, + "loss": 0.039110025763511656, + "step": 76340 + }, + { + "epoch": 10.837473385379703, + "grad_norm": 0.8463187217712402, + "learning_rate": 8.916721078779276e-05, + "loss": 0.05355388522148132, + "step": 76350 + }, + { + "epoch": 10.838892831795599, + "grad_norm": 2.4119479656219482, + "learning_rate": 8.916579134137687e-05, + "loss": 0.038125574588775635, + "step": 76360 + }, + { + "epoch": 10.840312278211497, + "grad_norm": 0.2792767286300659, + "learning_rate": 8.916437189496097e-05, + "loss": 0.03929415941238403, + "step": 76370 + }, + { + "epoch": 10.841731724627396, + "grad_norm": 2.039494752883911, + "learning_rate": 
8.916295244854507e-05, + "loss": 0.017711035907268524, + "step": 76380 + }, + { + "epoch": 10.843151171043292, + "grad_norm": 2.766622543334961, + "learning_rate": 8.916153300212918e-05, + "loss": 0.02152646780014038, + "step": 76390 + }, + { + "epoch": 10.84457061745919, + "grad_norm": 0.2604852318763733, + "learning_rate": 8.916011355571327e-05, + "loss": 0.05204763412475586, + "step": 76400 + }, + { + "epoch": 10.845990063875089, + "grad_norm": 0.023158498108386993, + "learning_rate": 8.915869410929739e-05, + "loss": 0.015346193313598632, + "step": 76410 + }, + { + "epoch": 10.847409510290987, + "grad_norm": 0.13105705380439758, + "learning_rate": 8.915727466288148e-05, + "loss": 0.022280707955360413, + "step": 76420 + }, + { + "epoch": 10.848828956706884, + "grad_norm": 5.168460845947266, + "learning_rate": 8.915585521646558e-05, + "loss": 0.016544350981712343, + "step": 76430 + }, + { + "epoch": 10.850248403122782, + "grad_norm": 0.03163724020123482, + "learning_rate": 8.915443577004968e-05, + "loss": 0.029596129059791566, + "step": 76440 + }, + { + "epoch": 10.85166784953868, + "grad_norm": 10.255916595458984, + "learning_rate": 8.915301632363379e-05, + "loss": 0.01805151104927063, + "step": 76450 + }, + { + "epoch": 10.853087295954577, + "grad_norm": 0.2659781277179718, + "learning_rate": 8.915159687721789e-05, + "loss": 0.03853174448013306, + "step": 76460 + }, + { + "epoch": 10.854506742370475, + "grad_norm": 12.415229797363281, + "learning_rate": 8.9150177430802e-05, + "loss": 0.0705374002456665, + "step": 76470 + }, + { + "epoch": 10.855926188786373, + "grad_norm": 6.229217529296875, + "learning_rate": 8.91487579843861e-05, + "loss": 0.04124915301799774, + "step": 76480 + }, + { + "epoch": 10.857345635202272, + "grad_norm": 0.02586524747312069, + "learning_rate": 8.914733853797019e-05, + "loss": 0.015211585164070129, + "step": 76490 + }, + { + "epoch": 10.858765081618168, + "grad_norm": 4.167519569396973, + "learning_rate": 8.91459190915543e-05, + 
"loss": 0.05534020662307739, + "step": 76500 + }, + { + "epoch": 10.858765081618168, + "eval_accuracy": 0.9865835823742608, + "eval_loss": 0.040406033396720886, + "eval_runtime": 33.5985, + "eval_samples_per_second": 468.086, + "eval_steps_per_second": 14.644, + "step": 76500 + }, + { + "epoch": 10.860184528034067, + "grad_norm": 0.2459697723388672, + "learning_rate": 8.91444996451384e-05, + "loss": 0.02314354032278061, + "step": 76510 + }, + { + "epoch": 10.861603974449965, + "grad_norm": 0.3980192244052887, + "learning_rate": 8.914308019872251e-05, + "loss": 0.020578236877918245, + "step": 76520 + }, + { + "epoch": 10.863023420865863, + "grad_norm": 1.053245186805725, + "learning_rate": 8.91416607523066e-05, + "loss": 0.05024528503417969, + "step": 76530 + }, + { + "epoch": 10.86444286728176, + "grad_norm": 13.911527633666992, + "learning_rate": 8.91402413058907e-05, + "loss": 0.08677828907966614, + "step": 76540 + }, + { + "epoch": 10.865862313697658, + "grad_norm": 0.045699913054704666, + "learning_rate": 8.91388218594748e-05, + "loss": 0.021171575784683226, + "step": 76550 + }, + { + "epoch": 10.867281760113556, + "grad_norm": 0.06769008189439774, + "learning_rate": 8.913740241305891e-05, + "loss": 0.05880331993103027, + "step": 76560 + }, + { + "epoch": 10.868701206529453, + "grad_norm": 15.066309928894043, + "learning_rate": 8.913598296664301e-05, + "loss": 0.09242464900016785, + "step": 76570 + }, + { + "epoch": 10.870120652945351, + "grad_norm": 10.06954288482666, + "learning_rate": 8.913456352022712e-05, + "loss": 0.06237143278121948, + "step": 76580 + }, + { + "epoch": 10.87154009936125, + "grad_norm": 5.549213886260986, + "learning_rate": 8.913314407381122e-05, + "loss": 0.01747702956199646, + "step": 76590 + }, + { + "epoch": 10.872959545777148, + "grad_norm": 0.04882671684026718, + "learning_rate": 8.913172462739532e-05, + "loss": 0.029430165886878967, + "step": 76600 + }, + { + "epoch": 10.874378992193044, + "grad_norm": 3.903329372406006, + 
"learning_rate": 8.913030518097943e-05, + "loss": 0.01609521061182022, + "step": 76610 + }, + { + "epoch": 10.875798438608943, + "grad_norm": 4.08885383605957, + "learning_rate": 8.912888573456352e-05, + "loss": 0.014848698675632478, + "step": 76620 + }, + { + "epoch": 10.87721788502484, + "grad_norm": 5.423572540283203, + "learning_rate": 8.912746628814764e-05, + "loss": 0.0083135724067688, + "step": 76630 + }, + { + "epoch": 10.878637331440737, + "grad_norm": 11.061261177062988, + "learning_rate": 8.912604684173172e-05, + "loss": 0.07412885427474976, + "step": 76640 + }, + { + "epoch": 10.880056777856636, + "grad_norm": 0.16122563183307648, + "learning_rate": 8.912462739531583e-05, + "loss": 0.021539703011512756, + "step": 76650 + }, + { + "epoch": 10.881476224272534, + "grad_norm": 0.2158479243516922, + "learning_rate": 8.912320794889993e-05, + "loss": 0.026331749558448792, + "step": 76660 + }, + { + "epoch": 10.882895670688432, + "grad_norm": 1.3243156671524048, + "learning_rate": 8.912178850248404e-05, + "loss": 0.02839702069759369, + "step": 76670 + }, + { + "epoch": 10.884315117104329, + "grad_norm": 8.35107707977295, + "learning_rate": 8.912036905606814e-05, + "loss": 0.05367217659950256, + "step": 76680 + }, + { + "epoch": 10.885734563520227, + "grad_norm": 2.349330425262451, + "learning_rate": 8.911894960965223e-05, + "loss": 0.02370249629020691, + "step": 76690 + }, + { + "epoch": 10.887154009936125, + "grad_norm": 0.0545620433986187, + "learning_rate": 8.911753016323634e-05, + "loss": 0.021818794310092926, + "step": 76700 + }, + { + "epoch": 10.888573456352022, + "grad_norm": 0.1560916304588318, + "learning_rate": 8.911611071682044e-05, + "loss": 0.014035370945930482, + "step": 76710 + }, + { + "epoch": 10.88999290276792, + "grad_norm": 0.5020081400871277, + "learning_rate": 8.911469127040455e-05, + "loss": 0.03221254944801331, + "step": 76720 + }, + { + "epoch": 10.891412349183819, + "grad_norm": 0.08128255605697632, + "learning_rate": 
8.911327182398865e-05, + "loss": 0.028797352313995363, + "step": 76730 + }, + { + "epoch": 10.892831795599717, + "grad_norm": 14.98328685760498, + "learning_rate": 8.911185237757275e-05, + "loss": 0.0881187915802002, + "step": 76740 + }, + { + "epoch": 10.894251242015613, + "grad_norm": 9.377324104309082, + "learning_rate": 8.911043293115685e-05, + "loss": 0.043282487988471986, + "step": 76750 + }, + { + "epoch": 10.895670688431512, + "grad_norm": 7.479063510894775, + "learning_rate": 8.910901348474096e-05, + "loss": 0.05147362947463989, + "step": 76760 + }, + { + "epoch": 10.89709013484741, + "grad_norm": 1.4757670164108276, + "learning_rate": 8.910759403832505e-05, + "loss": 0.021717017889022826, + "step": 76770 + }, + { + "epoch": 10.898509581263307, + "grad_norm": 2.818730354309082, + "learning_rate": 8.910617459190916e-05, + "loss": 0.005148597434163094, + "step": 76780 + }, + { + "epoch": 10.899929027679205, + "grad_norm": 0.34069201350212097, + "learning_rate": 8.910475514549326e-05, + "loss": 0.02241356670856476, + "step": 76790 + }, + { + "epoch": 10.901348474095103, + "grad_norm": 0.8318196535110474, + "learning_rate": 8.910333569907736e-05, + "loss": 0.022158035635948183, + "step": 76800 + }, + { + "epoch": 10.902767920511002, + "grad_norm": 0.2929559350013733, + "learning_rate": 8.910191625266147e-05, + "loss": 0.026474547386169434, + "step": 76810 + }, + { + "epoch": 10.904187366926898, + "grad_norm": 9.473808288574219, + "learning_rate": 8.910049680624557e-05, + "loss": 0.017038679122924803, + "step": 76820 + }, + { + "epoch": 10.905606813342796, + "grad_norm": 12.349149703979492, + "learning_rate": 8.909907735982968e-05, + "loss": 0.01714346408843994, + "step": 76830 + }, + { + "epoch": 10.907026259758695, + "grad_norm": 0.05128764733672142, + "learning_rate": 8.909765791341376e-05, + "loss": 0.012290577590465545, + "step": 76840 + }, + { + "epoch": 10.908445706174591, + "grad_norm": 1.4182794094085693, + "learning_rate": 8.909623846699787e-05, + 
"loss": 0.045222300291061404, + "step": 76850 + }, + { + "epoch": 10.90986515259049, + "grad_norm": 6.337584972381592, + "learning_rate": 8.909481902058197e-05, + "loss": 0.033546441793441774, + "step": 76860 + }, + { + "epoch": 10.911284599006388, + "grad_norm": 0.1292036771774292, + "learning_rate": 8.909339957416608e-05, + "loss": 0.022637271881103517, + "step": 76870 + }, + { + "epoch": 10.912704045422286, + "grad_norm": 3.266705274581909, + "learning_rate": 8.909198012775018e-05, + "loss": 0.028116098046302794, + "step": 76880 + }, + { + "epoch": 10.914123491838183, + "grad_norm": 12.7665376663208, + "learning_rate": 8.909056068133428e-05, + "loss": 0.06570132970809936, + "step": 76890 + }, + { + "epoch": 10.915542938254081, + "grad_norm": 3.6279942989349365, + "learning_rate": 8.908914123491839e-05, + "loss": 0.04003820419311523, + "step": 76900 + }, + { + "epoch": 10.91696238466998, + "grad_norm": 0.5584992170333862, + "learning_rate": 8.908772178850248e-05, + "loss": 0.05849987268447876, + "step": 76910 + }, + { + "epoch": 10.918381831085876, + "grad_norm": 6.1313629150390625, + "learning_rate": 8.90863023420866e-05, + "loss": 0.024083325266838075, + "step": 76920 + }, + { + "epoch": 10.919801277501774, + "grad_norm": 7.750735282897949, + "learning_rate": 8.908488289567069e-05, + "loss": 0.024571770429611207, + "step": 76930 + }, + { + "epoch": 10.921220723917672, + "grad_norm": 0.34011775255203247, + "learning_rate": 8.90834634492548e-05, + "loss": 0.04700807929039001, + "step": 76940 + }, + { + "epoch": 10.92264017033357, + "grad_norm": 0.03488544002175331, + "learning_rate": 8.908204400283889e-05, + "loss": 0.037614253163337705, + "step": 76950 + }, + { + "epoch": 10.924059616749467, + "grad_norm": 8.300019264221191, + "learning_rate": 8.9080624556423e-05, + "loss": 0.0336763858795166, + "step": 76960 + }, + { + "epoch": 10.925479063165366, + "grad_norm": 3.3698670864105225, + "learning_rate": 8.90792051100071e-05, + "loss": 0.04678107500076294, + 
"step": 76970 + }, + { + "epoch": 10.926898509581264, + "grad_norm": 0.13484854996204376, + "learning_rate": 8.90777856635912e-05, + "loss": 0.03645238280296326, + "step": 76980 + }, + { + "epoch": 10.92831795599716, + "grad_norm": 2.7829763889312744, + "learning_rate": 8.907636621717532e-05, + "loss": 0.03553824722766876, + "step": 76990 + }, + { + "epoch": 10.929737402413059, + "grad_norm": 0.27800244092941284, + "learning_rate": 8.90749467707594e-05, + "loss": 0.04834374189376831, + "step": 77000 + }, + { + "epoch": 10.929737402413059, + "eval_accuracy": 0.9841673555032746, + "eval_loss": 0.04912128299474716, + "eval_runtime": 32.9222, + "eval_samples_per_second": 477.702, + "eval_steps_per_second": 14.944, + "step": 77000 + }, + { + "epoch": 10.931156848828957, + "grad_norm": 2.2467801570892334, + "learning_rate": 8.907352732434351e-05, + "loss": 0.051036447286605835, + "step": 77010 + }, + { + "epoch": 10.932576295244855, + "grad_norm": 0.80274498462677, + "learning_rate": 8.907210787792761e-05, + "loss": 0.020682474970817565, + "step": 77020 + }, + { + "epoch": 10.933995741660752, + "grad_norm": 5.00384521484375, + "learning_rate": 8.907068843151172e-05, + "loss": 0.020825859904289246, + "step": 77030 + }, + { + "epoch": 10.93541518807665, + "grad_norm": 1.326248049736023, + "learning_rate": 8.906926898509582e-05, + "loss": 0.0339616596698761, + "step": 77040 + }, + { + "epoch": 10.936834634492548, + "grad_norm": 0.44627857208251953, + "learning_rate": 8.906784953867992e-05, + "loss": 0.01936686933040619, + "step": 77050 + }, + { + "epoch": 10.938254080908445, + "grad_norm": 2.1883907318115234, + "learning_rate": 8.906643009226401e-05, + "loss": 0.023100431263446807, + "step": 77060 + }, + { + "epoch": 10.939673527324343, + "grad_norm": 4.381327152252197, + "learning_rate": 8.906501064584812e-05, + "loss": 0.051467007398605345, + "step": 77070 + }, + { + "epoch": 10.941092973740242, + "grad_norm": 0.08656803518533707, + "learning_rate": 8.906359119943223e-05, 
+ "loss": 0.07422645092010498, + "step": 77080 + }, + { + "epoch": 10.94251242015614, + "grad_norm": 1.2520685195922852, + "learning_rate": 8.906217175301633e-05, + "loss": 0.023004311323165893, + "step": 77090 + }, + { + "epoch": 10.943931866572036, + "grad_norm": 0.6970699429512024, + "learning_rate": 8.906075230660043e-05, + "loss": 0.04289089739322662, + "step": 77100 + }, + { + "epoch": 10.945351312987935, + "grad_norm": 1.45811927318573, + "learning_rate": 8.905933286018453e-05, + "loss": 0.050782245397567746, + "step": 77110 + }, + { + "epoch": 10.946770759403833, + "grad_norm": 0.21888741850852966, + "learning_rate": 8.905791341376864e-05, + "loss": 0.005652286112308502, + "step": 77120 + }, + { + "epoch": 10.94819020581973, + "grad_norm": 1.4828088283538818, + "learning_rate": 8.905663591199432e-05, + "loss": 0.037698855996131896, + "step": 77130 + }, + { + "epoch": 10.949609652235628, + "grad_norm": 0.18355588614940643, + "learning_rate": 8.905521646557843e-05, + "loss": 0.02868366539478302, + "step": 77140 + }, + { + "epoch": 10.951029098651526, + "grad_norm": 3.372123956680298, + "learning_rate": 8.905379701916253e-05, + "loss": 0.03374863862991333, + "step": 77150 + }, + { + "epoch": 10.952448545067424, + "grad_norm": 4.836116790771484, + "learning_rate": 8.905237757274664e-05, + "loss": 0.05168530941009521, + "step": 77160 + }, + { + "epoch": 10.953867991483321, + "grad_norm": 5.375553131103516, + "learning_rate": 8.905095812633073e-05, + "loss": 0.11784226894378662, + "step": 77170 + }, + { + "epoch": 10.95528743789922, + "grad_norm": 0.08931706100702286, + "learning_rate": 8.904953867991484e-05, + "loss": 0.0237982839345932, + "step": 77180 + }, + { + "epoch": 10.956706884315118, + "grad_norm": 0.5286836624145508, + "learning_rate": 8.904811923349893e-05, + "loss": 0.031697696447372435, + "step": 77190 + }, + { + "epoch": 10.958126330731014, + "grad_norm": 0.030751101672649384, + "learning_rate": 8.904669978708304e-05, + "loss": 
0.038225984573364256, + "step": 77200 + }, + { + "epoch": 10.959545777146912, + "grad_norm": 0.2199520617723465, + "learning_rate": 8.904528034066714e-05, + "loss": 0.014718365669250489, + "step": 77210 + }, + { + "epoch": 10.96096522356281, + "grad_norm": 0.08655641973018646, + "learning_rate": 8.904386089425124e-05, + "loss": 0.03180850744247436, + "step": 77220 + }, + { + "epoch": 10.962384669978709, + "grad_norm": 7.156330585479736, + "learning_rate": 8.904244144783535e-05, + "loss": 0.02334403395652771, + "step": 77230 + }, + { + "epoch": 10.963804116394606, + "grad_norm": 9.984768867492676, + "learning_rate": 8.904102200141945e-05, + "loss": 0.040103816986083986, + "step": 77240 + }, + { + "epoch": 10.965223562810504, + "grad_norm": 7.9748640060424805, + "learning_rate": 8.903960255500356e-05, + "loss": 0.04069978296756745, + "step": 77250 + }, + { + "epoch": 10.966643009226402, + "grad_norm": 7.155030727386475, + "learning_rate": 8.903818310858766e-05, + "loss": 0.022116178274154664, + "step": 77260 + }, + { + "epoch": 10.968062455642299, + "grad_norm": 0.7471809387207031, + "learning_rate": 8.903676366217177e-05, + "loss": 0.01585453748703003, + "step": 77270 + }, + { + "epoch": 10.969481902058197, + "grad_norm": 7.733248233795166, + "learning_rate": 8.903534421575585e-05, + "loss": 0.06696531176567078, + "step": 77280 + }, + { + "epoch": 10.970901348474095, + "grad_norm": 0.08299662172794342, + "learning_rate": 8.903392476933996e-05, + "loss": 0.012213122844696046, + "step": 77290 + }, + { + "epoch": 10.972320794889994, + "grad_norm": 4.243612766265869, + "learning_rate": 8.903250532292406e-05, + "loss": 0.018444839119911193, + "step": 77300 + }, + { + "epoch": 10.97374024130589, + "grad_norm": 4.067358016967773, + "learning_rate": 8.903108587650817e-05, + "loss": 0.0302010178565979, + "step": 77310 + }, + { + "epoch": 10.975159687721789, + "grad_norm": 3.4205238819122314, + "learning_rate": 8.902966643009227e-05, + "loss": 0.011153788864612579, + "step": 
77320 + }, + { + "epoch": 10.976579134137687, + "grad_norm": 0.927170991897583, + "learning_rate": 8.902824698367636e-05, + "loss": 0.025216352939605714, + "step": 77330 + }, + { + "epoch": 10.977998580553583, + "grad_norm": 1.6161059141159058, + "learning_rate": 8.902682753726048e-05, + "loss": 0.05900951623916626, + "step": 77340 + }, + { + "epoch": 10.979418026969482, + "grad_norm": 0.03316309675574303, + "learning_rate": 8.902540809084457e-05, + "loss": 0.08623931407928467, + "step": 77350 + }, + { + "epoch": 10.98083747338538, + "grad_norm": 0.952528178691864, + "learning_rate": 8.902398864442868e-05, + "loss": 0.0170826256275177, + "step": 77360 + }, + { + "epoch": 10.982256919801278, + "grad_norm": 2.0128047466278076, + "learning_rate": 8.902256919801278e-05, + "loss": 0.03770047724246979, + "step": 77370 + }, + { + "epoch": 10.983676366217175, + "grad_norm": 0.32875731587409973, + "learning_rate": 8.902114975159688e-05, + "loss": 0.03144612908363342, + "step": 77380 + }, + { + "epoch": 10.985095812633073, + "grad_norm": 1.2215029001235962, + "learning_rate": 8.901973030518098e-05, + "loss": 0.04773979187011719, + "step": 77390 + }, + { + "epoch": 10.986515259048971, + "grad_norm": 6.523492336273193, + "learning_rate": 8.901831085876509e-05, + "loss": 0.035741502046585084, + "step": 77400 + }, + { + "epoch": 10.987934705464868, + "grad_norm": 11.67943000793457, + "learning_rate": 8.901689141234918e-05, + "loss": 0.022857260704040528, + "step": 77410 + }, + { + "epoch": 10.989354151880766, + "grad_norm": 0.12591129541397095, + "learning_rate": 8.90154719659333e-05, + "loss": 0.03355528712272644, + "step": 77420 + }, + { + "epoch": 10.990773598296665, + "grad_norm": 7.616695880889893, + "learning_rate": 8.901405251951739e-05, + "loss": 0.08523765802383423, + "step": 77430 + }, + { + "epoch": 10.992193044712563, + "grad_norm": 19.298355102539062, + "learning_rate": 8.901263307310149e-05, + "loss": 0.06812095642089844, + "step": 77440 + }, + { + "epoch": 
10.99361249112846, + "grad_norm": 7.188718795776367, + "learning_rate": 8.90112136266856e-05, + "loss": 0.05898793935775757, + "step": 77450 + }, + { + "epoch": 10.995031937544358, + "grad_norm": 0.2391853779554367, + "learning_rate": 8.90097941802697e-05, + "loss": 0.012519893050193787, + "step": 77460 + }, + { + "epoch": 10.996451383960256, + "grad_norm": 0.12945179641246796, + "learning_rate": 8.900837473385381e-05, + "loss": 0.014145855605602265, + "step": 77470 + }, + { + "epoch": 10.997870830376153, + "grad_norm": 0.19085891544818878, + "learning_rate": 8.90069552874379e-05, + "loss": 0.015647728741168977, + "step": 77480 + }, + { + "epoch": 10.99929027679205, + "grad_norm": 0.04224420338869095, + "learning_rate": 8.9005535841022e-05, + "loss": 0.01942497193813324, + "step": 77490 + }, + { + "epoch": 11.00070972320795, + "grad_norm": 0.024194825440645218, + "learning_rate": 8.90041163946061e-05, + "loss": 0.01477721482515335, + "step": 77500 + }, + { + "epoch": 11.00070972320795, + "eval_accuracy": 0.9854390538564253, + "eval_loss": 0.04882017523050308, + "eval_runtime": 32.218, + "eval_samples_per_second": 488.143, + "eval_steps_per_second": 15.271, + "step": 77500 + }, + { + "epoch": 11.002129169623847, + "grad_norm": 1.208798885345459, + "learning_rate": 8.900269694819021e-05, + "loss": 0.016632518172264098, + "step": 77510 + }, + { + "epoch": 11.003548616039744, + "grad_norm": 5.06233024597168, + "learning_rate": 8.900127750177431e-05, + "loss": 0.03450791537761688, + "step": 77520 + }, + { + "epoch": 11.004968062455642, + "grad_norm": 5.087305545806885, + "learning_rate": 8.899985805535841e-05, + "loss": 0.04986914396286011, + "step": 77530 + }, + { + "epoch": 11.00638750887154, + "grad_norm": 10.168726921081543, + "learning_rate": 8.899843860894252e-05, + "loss": 0.040247094631195066, + "step": 77540 + }, + { + "epoch": 11.007806955287437, + "grad_norm": 0.219258114695549, + "learning_rate": 8.899701916252662e-05, + "loss": 0.006717376410961151, + 
"step": 77550 + }, + { + "epoch": 11.009226401703335, + "grad_norm": 0.42845240235328674, + "learning_rate": 8.899559971611073e-05, + "loss": 0.022232869267463685, + "step": 77560 + }, + { + "epoch": 11.010645848119234, + "grad_norm": 1.3312067985534668, + "learning_rate": 8.899418026969482e-05, + "loss": 0.020168834924697877, + "step": 77570 + }, + { + "epoch": 11.012065294535132, + "grad_norm": 0.6087434887886047, + "learning_rate": 8.899276082327892e-05, + "loss": 0.012365716695785522, + "step": 77580 + }, + { + "epoch": 11.013484740951029, + "grad_norm": 1.740855097770691, + "learning_rate": 8.899134137686302e-05, + "loss": 0.00474800281226635, + "step": 77590 + }, + { + "epoch": 11.014904187366927, + "grad_norm": 2.4797630310058594, + "learning_rate": 8.898992193044713e-05, + "loss": 0.021309559047222138, + "step": 77600 + }, + { + "epoch": 11.016323633782825, + "grad_norm": 0.2835226058959961, + "learning_rate": 8.898850248403123e-05, + "loss": 0.004374232143163681, + "step": 77610 + }, + { + "epoch": 11.017743080198722, + "grad_norm": 0.2921813130378723, + "learning_rate": 8.898708303761534e-05, + "loss": 0.01224185824394226, + "step": 77620 + }, + { + "epoch": 11.01916252661462, + "grad_norm": 0.12308903783559799, + "learning_rate": 8.898566359119944e-05, + "loss": 0.014115402102470398, + "step": 77630 + }, + { + "epoch": 11.020581973030518, + "grad_norm": 1.0105446577072144, + "learning_rate": 8.898424414478353e-05, + "loss": 0.02689531445503235, + "step": 77640 + }, + { + "epoch": 11.022001419446417, + "grad_norm": 1.195181965827942, + "learning_rate": 8.898282469836764e-05, + "loss": 0.041376516222953796, + "step": 77650 + }, + { + "epoch": 11.023420865862313, + "grad_norm": 2.656174659729004, + "learning_rate": 8.898140525195174e-05, + "loss": 0.025141558051109313, + "step": 77660 + }, + { + "epoch": 11.024840312278211, + "grad_norm": 0.2554796040058136, + "learning_rate": 8.897998580553585e-05, + "loss": 0.030996525287628175, + "step": 77670 + }, + { + 
"epoch": 11.02625975869411, + "grad_norm": 2.055675506591797, + "learning_rate": 8.897856635911995e-05, + "loss": 0.00680890753865242, + "step": 77680 + }, + { + "epoch": 11.027679205110006, + "grad_norm": 5.830803871154785, + "learning_rate": 8.897714691270405e-05, + "loss": 0.09050383567810058, + "step": 77690 + }, + { + "epoch": 11.029098651525905, + "grad_norm": 2.9373791217803955, + "learning_rate": 8.897572746628814e-05, + "loss": 0.031220877170562746, + "step": 77700 + }, + { + "epoch": 11.030518097941803, + "grad_norm": 0.3388267159461975, + "learning_rate": 8.897430801987225e-05, + "loss": 0.026630723476409913, + "step": 77710 + }, + { + "epoch": 11.031937544357701, + "grad_norm": 0.21325798332691193, + "learning_rate": 8.897288857345635e-05, + "loss": 0.062087488174438474, + "step": 77720 + }, + { + "epoch": 11.033356990773598, + "grad_norm": 0.05454428121447563, + "learning_rate": 8.897146912704046e-05, + "loss": 0.08088703751564026, + "step": 77730 + }, + { + "epoch": 11.034776437189496, + "grad_norm": 2.0780720710754395, + "learning_rate": 8.897004968062456e-05, + "loss": 0.029281583428382874, + "step": 77740 + }, + { + "epoch": 11.036195883605394, + "grad_norm": 0.6183001399040222, + "learning_rate": 8.896863023420866e-05, + "loss": 0.027968209981918336, + "step": 77750 + }, + { + "epoch": 11.037615330021291, + "grad_norm": 1.7325291633605957, + "learning_rate": 8.896721078779277e-05, + "loss": 0.04854607284069061, + "step": 77760 + }, + { + "epoch": 11.03903477643719, + "grad_norm": 3.145956516265869, + "learning_rate": 8.896579134137687e-05, + "loss": 0.0039848946034908295, + "step": 77770 + }, + { + "epoch": 11.040454222853088, + "grad_norm": 0.12728923559188843, + "learning_rate": 8.896437189496098e-05, + "loss": 0.019792550802230836, + "step": 77780 + }, + { + "epoch": 11.041873669268986, + "grad_norm": 1.0950226783752441, + "learning_rate": 8.896295244854506e-05, + "loss": 0.033737349510192874, + "step": 77790 + }, + { + "epoch": 
11.043293115684882, + "grad_norm": 3.4214136600494385, + "learning_rate": 8.896153300212917e-05, + "loss": 0.009793543070554734, + "step": 77800 + }, + { + "epoch": 11.04471256210078, + "grad_norm": 0.49779754877090454, + "learning_rate": 8.896011355571327e-05, + "loss": 0.009086959809064866, + "step": 77810 + }, + { + "epoch": 11.046132008516679, + "grad_norm": 0.01435944065451622, + "learning_rate": 8.895869410929738e-05, + "loss": 0.02067076861858368, + "step": 77820 + }, + { + "epoch": 11.047551454932576, + "grad_norm": 5.94788122177124, + "learning_rate": 8.895727466288149e-05, + "loss": 0.011896288394927979, + "step": 77830 + }, + { + "epoch": 11.048970901348474, + "grad_norm": 0.29182007908821106, + "learning_rate": 8.895585521646558e-05, + "loss": 0.026392003893852232, + "step": 77840 + }, + { + "epoch": 11.050390347764372, + "grad_norm": 0.21065226197242737, + "learning_rate": 8.895443577004969e-05, + "loss": 0.028645521402359007, + "step": 77850 + }, + { + "epoch": 11.05180979418027, + "grad_norm": 0.05954265221953392, + "learning_rate": 8.895301632363378e-05, + "loss": 0.010045936703681946, + "step": 77860 + }, + { + "epoch": 11.053229240596167, + "grad_norm": 0.0884803906083107, + "learning_rate": 8.89515968772179e-05, + "loss": 0.011941583454608917, + "step": 77870 + }, + { + "epoch": 11.054648687012065, + "grad_norm": 0.1329718977212906, + "learning_rate": 8.895017743080199e-05, + "loss": 0.010423028469085693, + "step": 77880 + }, + { + "epoch": 11.056068133427964, + "grad_norm": 0.2441541701555252, + "learning_rate": 8.894875798438609e-05, + "loss": 0.002589988708496094, + "step": 77890 + }, + { + "epoch": 11.05748757984386, + "grad_norm": 0.7030231356620789, + "learning_rate": 8.894733853797019e-05, + "loss": 0.005089676007628441, + "step": 77900 + }, + { + "epoch": 11.058907026259758, + "grad_norm": 0.0806090235710144, + "learning_rate": 8.89459190915543e-05, + "loss": 0.007579062879085541, + "step": 77910 + }, + { + "epoch": 11.060326472675657, + 
"grad_norm": 3.0792415142059326, + "learning_rate": 8.894449964513841e-05, + "loss": 0.004580720514059067, + "step": 77920 + }, + { + "epoch": 11.061745919091555, + "grad_norm": 0.10120502859354019, + "learning_rate": 8.89430801987225e-05, + "loss": 0.007818933576345444, + "step": 77930 + }, + { + "epoch": 11.063165365507452, + "grad_norm": 0.13891956210136414, + "learning_rate": 8.89416607523066e-05, + "loss": 0.020057034492492676, + "step": 77940 + }, + { + "epoch": 11.06458481192335, + "grad_norm": 0.9457455277442932, + "learning_rate": 8.89402413058907e-05, + "loss": 0.03999852836132049, + "step": 77950 + }, + { + "epoch": 11.066004258339248, + "grad_norm": 0.043018776923418045, + "learning_rate": 8.893882185947481e-05, + "loss": 0.02448986768722534, + "step": 77960 + }, + { + "epoch": 11.067423704755145, + "grad_norm": 0.0917859748005867, + "learning_rate": 8.893740241305891e-05, + "loss": 0.04967025220394135, + "step": 77970 + }, + { + "epoch": 11.068843151171043, + "grad_norm": 0.42049574851989746, + "learning_rate": 8.893598296664302e-05, + "loss": 0.02361696809530258, + "step": 77980 + }, + { + "epoch": 11.070262597586941, + "grad_norm": 0.10781079530715942, + "learning_rate": 8.893456352022712e-05, + "loss": 0.04162313342094422, + "step": 77990 + }, + { + "epoch": 11.07168204400284, + "grad_norm": 4.091378688812256, + "learning_rate": 8.893314407381121e-05, + "loss": 0.006732091307640076, + "step": 78000 + }, + { + "epoch": 11.07168204400284, + "eval_accuracy": 0.9848032046798499, + "eval_loss": 0.05146849900484085, + "eval_runtime": 34.0197, + "eval_samples_per_second": 462.291, + "eval_steps_per_second": 14.462, + "step": 78000 + }, + { + "epoch": 11.073101490418736, + "grad_norm": 0.03186691179871559, + "learning_rate": 8.893172462739533e-05, + "loss": 0.020208078622817992, + "step": 78010 + }, + { + "epoch": 11.074520936834634, + "grad_norm": 0.0380316786468029, + "learning_rate": 8.893030518097942e-05, + "loss": 0.01988564133644104, + "step": 78020 + 
}, + { + "epoch": 11.075940383250533, + "grad_norm": 0.009216410107910633, + "learning_rate": 8.892888573456353e-05, + "loss": 0.03737284243106842, + "step": 78030 + }, + { + "epoch": 11.07735982966643, + "grad_norm": 0.7872573733329773, + "learning_rate": 8.892746628814763e-05, + "loss": 0.025534018874168396, + "step": 78040 + }, + { + "epoch": 11.078779276082328, + "grad_norm": 0.015327519737184048, + "learning_rate": 8.892604684173173e-05, + "loss": 0.0050436832010746, + "step": 78050 + }, + { + "epoch": 11.080198722498226, + "grad_norm": 10.065001487731934, + "learning_rate": 8.892462739531583e-05, + "loss": 0.022689974308013915, + "step": 78060 + }, + { + "epoch": 11.081618168914124, + "grad_norm": 14.562976837158203, + "learning_rate": 8.892320794889994e-05, + "loss": 0.024745500087738036, + "step": 78070 + }, + { + "epoch": 11.08303761533002, + "grad_norm": 5.8700737953186035, + "learning_rate": 8.892178850248403e-05, + "loss": 0.03570305109024048, + "step": 78080 + }, + { + "epoch": 11.084457061745919, + "grad_norm": 0.8238467574119568, + "learning_rate": 8.892036905606814e-05, + "loss": 0.025160768628120424, + "step": 78090 + }, + { + "epoch": 11.085876508161817, + "grad_norm": 3.493910312652588, + "learning_rate": 8.891894960965224e-05, + "loss": 0.08694150447845458, + "step": 78100 + }, + { + "epoch": 11.087295954577714, + "grad_norm": 3.4150233268737793, + "learning_rate": 8.891753016323634e-05, + "loss": 0.020310258865356444, + "step": 78110 + }, + { + "epoch": 11.088715400993612, + "grad_norm": 14.740667343139648, + "learning_rate": 8.891611071682045e-05, + "loss": 0.030592340230941772, + "step": 78120 + }, + { + "epoch": 11.09013484740951, + "grad_norm": 7.057147026062012, + "learning_rate": 8.891469127040455e-05, + "loss": 0.026824843883514405, + "step": 78130 + }, + { + "epoch": 11.091554293825409, + "grad_norm": 0.21818548440933228, + "learning_rate": 8.891327182398866e-05, + "loss": 0.05211725831031799, + "step": 78140 + }, + { + "epoch": 
11.092973740241305, + "grad_norm": 7.997917175292969, + "learning_rate": 8.891185237757274e-05, + "loss": 0.04818741977214813, + "step": 78150 + }, + { + "epoch": 11.094393186657204, + "grad_norm": 0.17914900183677673, + "learning_rate": 8.891043293115685e-05, + "loss": 0.018161210417747497, + "step": 78160 + }, + { + "epoch": 11.095812633073102, + "grad_norm": 0.2530238926410675, + "learning_rate": 8.890901348474095e-05, + "loss": 0.008942224830389024, + "step": 78170 + }, + { + "epoch": 11.097232079488998, + "grad_norm": 0.2684612274169922, + "learning_rate": 8.890759403832506e-05, + "loss": 0.016461795568466185, + "step": 78180 + }, + { + "epoch": 11.098651525904897, + "grad_norm": 7.170384883880615, + "learning_rate": 8.890617459190916e-05, + "loss": 0.031466320157051086, + "step": 78190 + }, + { + "epoch": 11.100070972320795, + "grad_norm": 0.10846813768148422, + "learning_rate": 8.890475514549326e-05, + "loss": 0.04786921739578247, + "step": 78200 + }, + { + "epoch": 11.101490418736693, + "grad_norm": 3.167357921600342, + "learning_rate": 8.890333569907737e-05, + "loss": 0.023188070952892305, + "step": 78210 + }, + { + "epoch": 11.10290986515259, + "grad_norm": 0.05991131067276001, + "learning_rate": 8.890191625266147e-05, + "loss": 0.047985118627548215, + "step": 78220 + }, + { + "epoch": 11.104329311568488, + "grad_norm": 9.863299369812012, + "learning_rate": 8.890049680624558e-05, + "loss": 0.0429348349571228, + "step": 78230 + }, + { + "epoch": 11.105748757984387, + "grad_norm": 12.251611709594727, + "learning_rate": 8.889907735982967e-05, + "loss": 0.035148638486862185, + "step": 78240 + }, + { + "epoch": 11.107168204400283, + "grad_norm": 0.5501163601875305, + "learning_rate": 8.889765791341377e-05, + "loss": 0.005860616266727447, + "step": 78250 + }, + { + "epoch": 11.108587650816181, + "grad_norm": 7.741093158721924, + "learning_rate": 8.889623846699787e-05, + "loss": 0.01935284584760666, + "step": 78260 + }, + { + "epoch": 11.11000709723208, + 
"grad_norm": 0.04390472546219826, + "learning_rate": 8.889481902058198e-05, + "loss": 0.03044103682041168, + "step": 78270 + }, + { + "epoch": 11.111426543647978, + "grad_norm": 0.007192371413111687, + "learning_rate": 8.889339957416608e-05, + "loss": 0.018576528131961822, + "step": 78280 + }, + { + "epoch": 11.112845990063875, + "grad_norm": 0.30019840598106384, + "learning_rate": 8.889198012775019e-05, + "loss": 0.05336242914199829, + "step": 78290 + }, + { + "epoch": 11.114265436479773, + "grad_norm": 0.11804159730672836, + "learning_rate": 8.889056068133428e-05, + "loss": 0.054386216402053836, + "step": 78300 + }, + { + "epoch": 11.115684882895671, + "grad_norm": 1.0424158573150635, + "learning_rate": 8.888914123491838e-05, + "loss": 0.019771167635917665, + "step": 78310 + }, + { + "epoch": 11.117104329311568, + "grad_norm": 3.4843974113464355, + "learning_rate": 8.888772178850249e-05, + "loss": 0.014606226980686188, + "step": 78320 + }, + { + "epoch": 11.118523775727466, + "grad_norm": 0.32944443821907043, + "learning_rate": 8.888630234208659e-05, + "loss": 0.048432594537734984, + "step": 78330 + }, + { + "epoch": 11.119943222143364, + "grad_norm": 10.374595642089844, + "learning_rate": 8.88848828956707e-05, + "loss": 0.055147755146026614, + "step": 78340 + }, + { + "epoch": 11.121362668559263, + "grad_norm": 4.440117835998535, + "learning_rate": 8.88834634492548e-05, + "loss": 0.0403089314699173, + "step": 78350 + }, + { + "epoch": 11.12278211497516, + "grad_norm": 0.6370695233345032, + "learning_rate": 8.88820440028389e-05, + "loss": 0.021577592194080352, + "step": 78360 + }, + { + "epoch": 11.124201561391057, + "grad_norm": 0.009489811025559902, + "learning_rate": 8.8880624556423e-05, + "loss": 0.0039813734591007234, + "step": 78370 + }, + { + "epoch": 11.125621007806956, + "grad_norm": 10.565815925598145, + "learning_rate": 8.88792051100071e-05, + "loss": 0.03302794098854065, + "step": 78380 + }, + { + "epoch": 11.127040454222852, + "grad_norm": 
0.8729338645935059, + "learning_rate": 8.88777856635912e-05, + "loss": 0.004103229567408562, + "step": 78390 + }, + { + "epoch": 11.12845990063875, + "grad_norm": 0.12913577258586884, + "learning_rate": 8.887636621717531e-05, + "loss": 0.023288239538669587, + "step": 78400 + }, + { + "epoch": 11.129879347054649, + "grad_norm": 0.22171258926391602, + "learning_rate": 8.887494677075941e-05, + "loss": 0.013404671847820283, + "step": 78410 + }, + { + "epoch": 11.131298793470547, + "grad_norm": 0.15066424012184143, + "learning_rate": 8.887352732434351e-05, + "loss": 0.009888184815645218, + "step": 78420 + }, + { + "epoch": 11.132718239886444, + "grad_norm": 0.3783220052719116, + "learning_rate": 8.887210787792762e-05, + "loss": 0.02776823043823242, + "step": 78430 + }, + { + "epoch": 11.134137686302342, + "grad_norm": 1.576729416847229, + "learning_rate": 8.887068843151172e-05, + "loss": 0.023171845078468322, + "step": 78440 + }, + { + "epoch": 11.13555713271824, + "grad_norm": 0.04876582324504852, + "learning_rate": 8.886926898509583e-05, + "loss": 0.05620192885398865, + "step": 78450 + }, + { + "epoch": 11.136976579134137, + "grad_norm": 4.442808628082275, + "learning_rate": 8.886784953867991e-05, + "loss": 0.02469184249639511, + "step": 78460 + }, + { + "epoch": 11.138396025550035, + "grad_norm": 1.914604663848877, + "learning_rate": 8.886643009226402e-05, + "loss": 0.008845466375350951, + "step": 78470 + }, + { + "epoch": 11.139815471965933, + "grad_norm": 0.0457342192530632, + "learning_rate": 8.886501064584812e-05, + "loss": 0.039513444900512694, + "step": 78480 + }, + { + "epoch": 11.141234918381832, + "grad_norm": 0.19912536442279816, + "learning_rate": 8.886359119943223e-05, + "loss": 0.08067465424537659, + "step": 78490 + }, + { + "epoch": 11.142654364797728, + "grad_norm": 6.148006439208984, + "learning_rate": 8.886217175301633e-05, + "loss": 0.036643752455711366, + "step": 78500 + }, + { + "epoch": 11.142654364797728, + "eval_accuracy": 0.9822598079735487, + 
"eval_loss": 0.057572051882743835, + "eval_runtime": 33.7716, + "eval_samples_per_second": 465.687, + "eval_steps_per_second": 14.568, + "step": 78500 + }, + { + "epoch": 11.144073811213627, + "grad_norm": 1.614670991897583, + "learning_rate": 8.886075230660042e-05, + "loss": 0.01014809012413025, + "step": 78510 + }, + { + "epoch": 11.145493257629525, + "grad_norm": 0.3039005696773529, + "learning_rate": 8.885933286018454e-05, + "loss": 0.03332527279853821, + "step": 78520 + }, + { + "epoch": 11.146912704045421, + "grad_norm": 10.968552589416504, + "learning_rate": 8.885791341376863e-05, + "loss": 0.06424198150634766, + "step": 78530 + }, + { + "epoch": 11.14833215046132, + "grad_norm": 0.9706463813781738, + "learning_rate": 8.885649396735274e-05, + "loss": 0.02765073478221893, + "step": 78540 + }, + { + "epoch": 11.149751596877218, + "grad_norm": 1.7395691871643066, + "learning_rate": 8.885507452093684e-05, + "loss": 0.023248912394046785, + "step": 78550 + }, + { + "epoch": 11.151171043293116, + "grad_norm": 0.9122004508972168, + "learning_rate": 8.885365507452094e-05, + "loss": 0.012182090431451797, + "step": 78560 + }, + { + "epoch": 11.152590489709013, + "grad_norm": 2.4857709407806396, + "learning_rate": 8.885223562810504e-05, + "loss": 0.052464467287063596, + "step": 78570 + }, + { + "epoch": 11.154009936124911, + "grad_norm": 0.9637348055839539, + "learning_rate": 8.885081618168915e-05, + "loss": 0.04379349946975708, + "step": 78580 + }, + { + "epoch": 11.15542938254081, + "grad_norm": 13.401801109313965, + "learning_rate": 8.884939673527324e-05, + "loss": 0.08828679919242859, + "step": 78590 + }, + { + "epoch": 11.156848828956706, + "grad_norm": 8.565984725952148, + "learning_rate": 8.884797728885736e-05, + "loss": 0.059217429161071776, + "step": 78600 + }, + { + "epoch": 11.158268275372604, + "grad_norm": 1.7600605487823486, + "learning_rate": 8.884655784244145e-05, + "loss": 0.010065761208534241, + "step": 78610 + }, + { + "epoch": 11.159687721788503, + 
"grad_norm": 0.9549081325531006, + "learning_rate": 8.884513839602555e-05, + "loss": 0.019970996677875517, + "step": 78620 + }, + { + "epoch": 11.161107168204401, + "grad_norm": 0.03332928195595741, + "learning_rate": 8.884371894960966e-05, + "loss": 0.009172002971172332, + "step": 78630 + }, + { + "epoch": 11.162526614620297, + "grad_norm": 0.23457317054271698, + "learning_rate": 8.884229950319376e-05, + "loss": 0.03987969756126404, + "step": 78640 + }, + { + "epoch": 11.163946061036196, + "grad_norm": 0.09130463004112244, + "learning_rate": 8.884088005677787e-05, + "loss": 0.005275613814592362, + "step": 78650 + }, + { + "epoch": 11.165365507452094, + "grad_norm": 0.07525503635406494, + "learning_rate": 8.883946061036195e-05, + "loss": 0.03904627561569214, + "step": 78660 + }, + { + "epoch": 11.16678495386799, + "grad_norm": 1.4086534976959229, + "learning_rate": 8.883804116394606e-05, + "loss": 0.01731160581111908, + "step": 78670 + }, + { + "epoch": 11.168204400283889, + "grad_norm": 4.701272487640381, + "learning_rate": 8.883662171753016e-05, + "loss": 0.022493749856948853, + "step": 78680 + }, + { + "epoch": 11.169623846699787, + "grad_norm": 0.021906519308686256, + "learning_rate": 8.883520227111427e-05, + "loss": 0.0666272759437561, + "step": 78690 + }, + { + "epoch": 11.171043293115686, + "grad_norm": 3.216141700744629, + "learning_rate": 8.883378282469837e-05, + "loss": 0.0164662629365921, + "step": 78700 + }, + { + "epoch": 11.172462739531582, + "grad_norm": 10.842391014099121, + "learning_rate": 8.883236337828248e-05, + "loss": 0.030035099387168883, + "step": 78710 + }, + { + "epoch": 11.17388218594748, + "grad_norm": 7.16567325592041, + "learning_rate": 8.883094393186658e-05, + "loss": 0.013143278658390045, + "step": 78720 + }, + { + "epoch": 11.175301632363379, + "grad_norm": 0.0488220676779747, + "learning_rate": 8.882952448545068e-05, + "loss": 0.049586433172225955, + "step": 78730 + }, + { + "epoch": 11.176721078779275, + "grad_norm": 
0.45880597829818726, + "learning_rate": 8.882810503903479e-05, + "loss": 0.004569170251488686, + "step": 78740 + }, + { + "epoch": 11.178140525195174, + "grad_norm": 13.114766120910645, + "learning_rate": 8.882668559261888e-05, + "loss": 0.02692474126815796, + "step": 78750 + }, + { + "epoch": 11.179559971611072, + "grad_norm": 8.099347114562988, + "learning_rate": 8.8825266146203e-05, + "loss": 0.042691168189048764, + "step": 78760 + }, + { + "epoch": 11.18097941802697, + "grad_norm": 0.07002486288547516, + "learning_rate": 8.882384669978708e-05, + "loss": 0.00397140234708786, + "step": 78770 + }, + { + "epoch": 11.182398864442867, + "grad_norm": 0.8963044881820679, + "learning_rate": 8.882242725337119e-05, + "loss": 0.034300926327705386, + "step": 78780 + }, + { + "epoch": 11.183818310858765, + "grad_norm": 0.19894537329673767, + "learning_rate": 8.882100780695529e-05, + "loss": 0.010814374685287476, + "step": 78790 + }, + { + "epoch": 11.185237757274663, + "grad_norm": 7.048020839691162, + "learning_rate": 8.88195883605394e-05, + "loss": 0.03981360495090484, + "step": 78800 + }, + { + "epoch": 11.18665720369056, + "grad_norm": 7.870205879211426, + "learning_rate": 8.88181689141235e-05, + "loss": 0.020268261432647705, + "step": 78810 + }, + { + "epoch": 11.188076650106458, + "grad_norm": 7.8461594581604, + "learning_rate": 8.881674946770759e-05, + "loss": 0.06974129676818848, + "step": 78820 + }, + { + "epoch": 11.189496096522356, + "grad_norm": 2.0994327068328857, + "learning_rate": 8.88153300212917e-05, + "loss": 0.008805645257234573, + "step": 78830 + }, + { + "epoch": 11.190915542938255, + "grad_norm": 7.246284484863281, + "learning_rate": 8.88139105748758e-05, + "loss": 0.041193538904190065, + "step": 78840 + }, + { + "epoch": 11.192334989354151, + "grad_norm": 3.468059778213501, + "learning_rate": 8.881249112845991e-05, + "loss": 0.029273611307144166, + "step": 78850 + }, + { + "epoch": 11.19375443577005, + "grad_norm": 11.098454475402832, + 
"learning_rate": 8.881107168204401e-05, + "loss": 0.039662298560142514, + "step": 78860 + }, + { + "epoch": 11.195173882185948, + "grad_norm": 11.667226791381836, + "learning_rate": 8.88096522356281e-05, + "loss": 0.043052345514297485, + "step": 78870 + }, + { + "epoch": 11.196593328601844, + "grad_norm": 0.6782079339027405, + "learning_rate": 8.88082327892122e-05, + "loss": 0.03640368580818176, + "step": 78880 + }, + { + "epoch": 11.198012775017743, + "grad_norm": 3.632553815841675, + "learning_rate": 8.880681334279631e-05, + "loss": 0.011744405329227447, + "step": 78890 + }, + { + "epoch": 11.199432221433641, + "grad_norm": 0.102350153028965, + "learning_rate": 8.880539389638041e-05, + "loss": 0.036199426651000975, + "step": 78900 + }, + { + "epoch": 11.20085166784954, + "grad_norm": 0.1365147978067398, + "learning_rate": 8.880397444996452e-05, + "loss": 0.024364030361175536, + "step": 78910 + }, + { + "epoch": 11.202271114265436, + "grad_norm": 0.05941876396536827, + "learning_rate": 8.880255500354862e-05, + "loss": 0.02454724460840225, + "step": 78920 + }, + { + "epoch": 11.203690560681334, + "grad_norm": 2.1816346645355225, + "learning_rate": 8.880113555713272e-05, + "loss": 0.017804595828056335, + "step": 78930 + }, + { + "epoch": 11.205110007097232, + "grad_norm": 0.7614143490791321, + "learning_rate": 8.879971611071683e-05, + "loss": 0.008576758205890656, + "step": 78940 + }, + { + "epoch": 11.206529453513129, + "grad_norm": 0.6037997007369995, + "learning_rate": 8.879829666430093e-05, + "loss": 0.044572693109512326, + "step": 78950 + }, + { + "epoch": 11.207948899929027, + "grad_norm": 0.44765669107437134, + "learning_rate": 8.879687721788504e-05, + "loss": 0.07759880423545837, + "step": 78960 + }, + { + "epoch": 11.209368346344926, + "grad_norm": 3.6659607887268066, + "learning_rate": 8.879545777146912e-05, + "loss": 0.045743897557258606, + "step": 78970 + }, + { + "epoch": 11.210787792760824, + "grad_norm": 12.800979614257812, + "learning_rate": 
8.879403832505323e-05, + "loss": 0.03383589088916779, + "step": 78980 + }, + { + "epoch": 11.21220723917672, + "grad_norm": 0.5182911157608032, + "learning_rate": 8.879261887863733e-05, + "loss": 0.020882833003997802, + "step": 78990 + }, + { + "epoch": 11.213626685592619, + "grad_norm": 3.3220176696777344, + "learning_rate": 8.879119943222144e-05, + "loss": 0.010641470551490784, + "step": 79000 + }, + { + "epoch": 11.213626685592619, + "eval_accuracy": 0.9826413174794939, + "eval_loss": 0.06146110221743584, + "eval_runtime": 34.5996, + "eval_samples_per_second": 454.542, + "eval_steps_per_second": 14.22, + "step": 79000 + }, + { + "epoch": 11.215046132008517, + "grad_norm": 0.5559946298599243, + "learning_rate": 8.878977998580554e-05, + "loss": 0.032634681463241576, + "step": 79010 + }, + { + "epoch": 11.216465578424414, + "grad_norm": 1.8470778465270996, + "learning_rate": 8.878836053938965e-05, + "loss": 0.04523923397064209, + "step": 79020 + }, + { + "epoch": 11.217885024840312, + "grad_norm": 3.1267638206481934, + "learning_rate": 8.878694109297375e-05, + "loss": 0.04637001752853394, + "step": 79030 + }, + { + "epoch": 11.21930447125621, + "grad_norm": 8.157069206237793, + "learning_rate": 8.878552164655784e-05, + "loss": 0.016615946590900422, + "step": 79040 + }, + { + "epoch": 11.220723917672109, + "grad_norm": 0.06618022173643112, + "learning_rate": 8.878410220014195e-05, + "loss": 0.07373695969581603, + "step": 79050 + }, + { + "epoch": 11.222143364088005, + "grad_norm": 0.6748453378677368, + "learning_rate": 8.878268275372605e-05, + "loss": 0.05090689063072205, + "step": 79060 + }, + { + "epoch": 11.223562810503903, + "grad_norm": 0.33752214908599854, + "learning_rate": 8.878126330731016e-05, + "loss": 0.016496339440345766, + "step": 79070 + }, + { + "epoch": 11.224982256919802, + "grad_norm": 1.0742039680480957, + "learning_rate": 8.877984386089425e-05, + "loss": 0.035834723711013795, + "step": 79080 + }, + { + "epoch": 11.2264017033357, + "grad_norm": 
1.6405409574508667, + "learning_rate": 8.877842441447836e-05, + "loss": 0.014355912804603577, + "step": 79090 + }, + { + "epoch": 11.227821149751597, + "grad_norm": 2.370652437210083, + "learning_rate": 8.877700496806245e-05, + "loss": 0.02448154389858246, + "step": 79100 + }, + { + "epoch": 11.229240596167495, + "grad_norm": 0.8242597579956055, + "learning_rate": 8.877558552164657e-05, + "loss": 0.04506869912147522, + "step": 79110 + }, + { + "epoch": 11.230660042583393, + "grad_norm": 2.0276906490325928, + "learning_rate": 8.877416607523066e-05, + "loss": 0.05213066339492798, + "step": 79120 + }, + { + "epoch": 11.23207948899929, + "grad_norm": 2.8450264930725098, + "learning_rate": 8.877274662881476e-05, + "loss": 0.020429591834545135, + "step": 79130 + }, + { + "epoch": 11.233498935415188, + "grad_norm": 0.10804232209920883, + "learning_rate": 8.877132718239887e-05, + "loss": 0.022500227391719817, + "step": 79140 + }, + { + "epoch": 11.234918381831086, + "grad_norm": 7.79222297668457, + "learning_rate": 8.876990773598297e-05, + "loss": 0.07455472350120544, + "step": 79150 + }, + { + "epoch": 11.236337828246985, + "grad_norm": 13.84041690826416, + "learning_rate": 8.876848828956708e-05, + "loss": 0.043375393748283385, + "step": 79160 + }, + { + "epoch": 11.237757274662881, + "grad_norm": 0.08527079224586487, + "learning_rate": 8.876706884315118e-05, + "loss": 0.04165436625480652, + "step": 79170 + }, + { + "epoch": 11.23917672107878, + "grad_norm": 0.046399861574172974, + "learning_rate": 8.876564939673527e-05, + "loss": 0.043243306875228885, + "step": 79180 + }, + { + "epoch": 11.240596167494678, + "grad_norm": 0.0947565957903862, + "learning_rate": 8.876422995031937e-05, + "loss": 0.011657755821943283, + "step": 79190 + }, + { + "epoch": 11.242015613910574, + "grad_norm": 0.4945715367794037, + "learning_rate": 8.876281050390348e-05, + "loss": 0.026895123720169067, + "step": 79200 + }, + { + "epoch": 11.243435060326473, + "grad_norm": 0.6884742975234985, + 
"learning_rate": 8.876139105748758e-05, + "loss": 0.019331425428390503, + "step": 79210 + }, + { + "epoch": 11.24485450674237, + "grad_norm": 3.9027488231658936, + "learning_rate": 8.875997161107169e-05, + "loss": 0.00956488996744156, + "step": 79220 + }, + { + "epoch": 11.24627395315827, + "grad_norm": 0.17311543226242065, + "learning_rate": 8.875855216465579e-05, + "loss": 0.03562336564064026, + "step": 79230 + }, + { + "epoch": 11.247693399574166, + "grad_norm": 0.2896651029586792, + "learning_rate": 8.875713271823989e-05, + "loss": 0.013322600722312927, + "step": 79240 + }, + { + "epoch": 11.249112845990064, + "grad_norm": 0.5805850625038147, + "learning_rate": 8.8755713271824e-05, + "loss": 0.017601439356803895, + "step": 79250 + }, + { + "epoch": 11.250532292405962, + "grad_norm": 0.29199790954589844, + "learning_rate": 8.87542938254081e-05, + "loss": 0.032927751541137695, + "step": 79260 + }, + { + "epoch": 11.251951738821859, + "grad_norm": 6.623978614807129, + "learning_rate": 8.87528743789922e-05, + "loss": 0.02440749704837799, + "step": 79270 + }, + { + "epoch": 11.253371185237757, + "grad_norm": 2.0030710697174072, + "learning_rate": 8.875145493257629e-05, + "loss": 0.010758772492408752, + "step": 79280 + }, + { + "epoch": 11.254790631653655, + "grad_norm": 8.13338565826416, + "learning_rate": 8.87500354861604e-05, + "loss": 0.01573694348335266, + "step": 79290 + }, + { + "epoch": 11.256210078069554, + "grad_norm": 0.5603359937667847, + "learning_rate": 8.87486160397445e-05, + "loss": 0.0071515366435050964, + "step": 79300 + }, + { + "epoch": 11.25762952448545, + "grad_norm": 0.025435080751776695, + "learning_rate": 8.874719659332861e-05, + "loss": 0.05067628026008606, + "step": 79310 + }, + { + "epoch": 11.259048970901349, + "grad_norm": 0.007434059400111437, + "learning_rate": 8.874577714691272e-05, + "loss": 0.009691541641950607, + "step": 79320 + }, + { + "epoch": 11.260468417317247, + "grad_norm": 0.15233097970485687, + "learning_rate": 
8.87443577004968e-05, + "loss": 0.008891449868679046, + "step": 79330 + }, + { + "epoch": 11.261887863733143, + "grad_norm": 0.013036555610597134, + "learning_rate": 8.874293825408091e-05, + "loss": 0.01533883363008499, + "step": 79340 + }, + { + "epoch": 11.263307310149042, + "grad_norm": 2.5614285469055176, + "learning_rate": 8.874151880766501e-05, + "loss": 0.007508398592472076, + "step": 79350 + }, + { + "epoch": 11.26472675656494, + "grad_norm": 0.06737040728330612, + "learning_rate": 8.874009936124912e-05, + "loss": 0.032192906737327574, + "step": 79360 + }, + { + "epoch": 11.266146202980838, + "grad_norm": 0.9063697457313538, + "learning_rate": 8.873867991483322e-05, + "loss": 0.01731267273426056, + "step": 79370 + }, + { + "epoch": 11.267565649396735, + "grad_norm": 0.378262996673584, + "learning_rate": 8.873726046841733e-05, + "loss": 0.001695651188492775, + "step": 79380 + }, + { + "epoch": 11.268985095812633, + "grad_norm": 11.412947654724121, + "learning_rate": 8.873584102200141e-05, + "loss": 0.054120153188705444, + "step": 79390 + }, + { + "epoch": 11.270404542228531, + "grad_norm": 1.0254113674163818, + "learning_rate": 8.873442157558552e-05, + "loss": 0.002478482201695442, + "step": 79400 + }, + { + "epoch": 11.271823988644428, + "grad_norm": 0.222798153758049, + "learning_rate": 8.873300212916964e-05, + "loss": 0.011906647682189941, + "step": 79410 + }, + { + "epoch": 11.273243435060326, + "grad_norm": 0.8719398379325867, + "learning_rate": 8.873158268275373e-05, + "loss": 0.0305912584066391, + "step": 79420 + }, + { + "epoch": 11.274662881476225, + "grad_norm": 0.04235215485095978, + "learning_rate": 8.873016323633784e-05, + "loss": 0.030136144161224364, + "step": 79430 + }, + { + "epoch": 11.276082327892123, + "grad_norm": 6.246129035949707, + "learning_rate": 8.872874378992193e-05, + "loss": 0.03775702714920044, + "step": 79440 + }, + { + "epoch": 11.27750177430802, + "grad_norm": 0.19702669978141785, + "learning_rate": 8.872732434350604e-05, + 
"loss": 0.04544393420219421, + "step": 79450 + }, + { + "epoch": 11.278921220723918, + "grad_norm": 1.5803076028823853, + "learning_rate": 8.872590489709014e-05, + "loss": 0.01723621040582657, + "step": 79460 + }, + { + "epoch": 11.280340667139816, + "grad_norm": 0.902417778968811, + "learning_rate": 8.872448545067425e-05, + "loss": 0.009972374886274338, + "step": 79470 + }, + { + "epoch": 11.281760113555713, + "grad_norm": 0.10034406930208206, + "learning_rate": 8.872306600425834e-05, + "loss": 0.02509826123714447, + "step": 79480 + }, + { + "epoch": 11.283179559971611, + "grad_norm": 1.11604905128479, + "learning_rate": 8.872164655784244e-05, + "loss": 0.020654731988906862, + "step": 79490 + }, + { + "epoch": 11.28459900638751, + "grad_norm": 0.7076891660690308, + "learning_rate": 8.872022711142655e-05, + "loss": 0.01148422509431839, + "step": 79500 + }, + { + "epoch": 11.28459900638751, + "eval_accuracy": 0.9832771666560692, + "eval_loss": 0.05689075216650963, + "eval_runtime": 33.9551, + "eval_samples_per_second": 463.171, + "eval_steps_per_second": 14.49, + "step": 79500 + }, + { + "epoch": 11.286018452803408, + "grad_norm": 1.7435786724090576, + "learning_rate": 8.871880766501065e-05, + "loss": 0.025782716274261475, + "step": 79510 + }, + { + "epoch": 11.287437899219304, + "grad_norm": 0.5907371044158936, + "learning_rate": 8.871738821859476e-05, + "loss": 0.019414816796779633, + "step": 79520 + }, + { + "epoch": 11.288857345635202, + "grad_norm": 0.14381571114063263, + "learning_rate": 8.871596877217886e-05, + "loss": 0.05670464038848877, + "step": 79530 + }, + { + "epoch": 11.2902767920511, + "grad_norm": 6.07039737701416, + "learning_rate": 8.871469127040454e-05, + "loss": 0.0341320812702179, + "step": 79540 + }, + { + "epoch": 11.291696238466997, + "grad_norm": 1.4728301763534546, + "learning_rate": 8.871327182398865e-05, + "loss": 0.019729208946228028, + "step": 79550 + }, + { + "epoch": 11.293115684882896, + "grad_norm": 8.61385726928711, + 
"learning_rate": 8.871185237757275e-05, + "loss": 0.03510661125183105, + "step": 79560 + }, + { + "epoch": 11.294535131298794, + "grad_norm": 0.5472472906112671, + "learning_rate": 8.871043293115685e-05, + "loss": 0.008604159206151962, + "step": 79570 + }, + { + "epoch": 11.295954577714692, + "grad_norm": 1.1503679752349854, + "learning_rate": 8.870901348474096e-05, + "loss": 0.03208954930305481, + "step": 79580 + }, + { + "epoch": 11.297374024130589, + "grad_norm": 0.0036283473018556833, + "learning_rate": 8.870759403832506e-05, + "loss": 0.002127677947282791, + "step": 79590 + }, + { + "epoch": 11.298793470546487, + "grad_norm": 2.7503178119659424, + "learning_rate": 8.870617459190917e-05, + "loss": 0.008547821640968322, + "step": 79600 + }, + { + "epoch": 11.300212916962385, + "grad_norm": 0.21989195048809052, + "learning_rate": 8.870475514549325e-05, + "loss": 0.013670270144939423, + "step": 79610 + }, + { + "epoch": 11.301632363378282, + "grad_norm": 0.0767766609787941, + "learning_rate": 8.870333569907736e-05, + "loss": 0.03806124329566955, + "step": 79620 + }, + { + "epoch": 11.30305180979418, + "grad_norm": 0.07631035149097443, + "learning_rate": 8.870191625266146e-05, + "loss": 0.023271280527114867, + "step": 79630 + }, + { + "epoch": 11.304471256210078, + "grad_norm": 4.039665699005127, + "learning_rate": 8.870049680624557e-05, + "loss": 0.03466680645942688, + "step": 79640 + }, + { + "epoch": 11.305890702625977, + "grad_norm": 0.5322751998901367, + "learning_rate": 8.869907735982967e-05, + "loss": 0.021226316690444946, + "step": 79650 + }, + { + "epoch": 11.307310149041873, + "grad_norm": 0.2919057011604309, + "learning_rate": 8.869765791341377e-05, + "loss": 0.02747492790222168, + "step": 79660 + }, + { + "epoch": 11.308729595457772, + "grad_norm": 1.9962308406829834, + "learning_rate": 8.869623846699788e-05, + "loss": 0.022367829084396364, + "step": 79670 + }, + { + "epoch": 11.31014904187367, + "grad_norm": 0.6660625338554382, + "learning_rate": 
8.869481902058197e-05, + "loss": 0.032745882868766785, + "step": 79680 + }, + { + "epoch": 11.311568488289566, + "grad_norm": 0.26197269558906555, + "learning_rate": 8.869339957416609e-05, + "loss": 0.04385312795639038, + "step": 79690 + }, + { + "epoch": 11.312987934705465, + "grad_norm": 4.215728282928467, + "learning_rate": 8.869198012775018e-05, + "loss": 0.020120292901992798, + "step": 79700 + }, + { + "epoch": 11.314407381121363, + "grad_norm": 8.843235969543457, + "learning_rate": 8.86905606813343e-05, + "loss": 0.019011446833610536, + "step": 79710 + }, + { + "epoch": 11.315826827537261, + "grad_norm": 17.59849739074707, + "learning_rate": 8.868914123491838e-05, + "loss": 0.030032917857170105, + "step": 79720 + }, + { + "epoch": 11.317246273953158, + "grad_norm": 0.15840287506580353, + "learning_rate": 8.868772178850249e-05, + "loss": 0.023678889870643614, + "step": 79730 + }, + { + "epoch": 11.318665720369056, + "grad_norm": 0.020314080640673637, + "learning_rate": 8.868630234208659e-05, + "loss": 0.026168987154960632, + "step": 79740 + }, + { + "epoch": 11.320085166784954, + "grad_norm": 0.6125651001930237, + "learning_rate": 8.86848828956707e-05, + "loss": 0.023059825599193572, + "step": 79750 + }, + { + "epoch": 11.321504613200851, + "grad_norm": 15.21810245513916, + "learning_rate": 8.86834634492548e-05, + "loss": 0.03762706518173218, + "step": 79760 + }, + { + "epoch": 11.32292405961675, + "grad_norm": 5.474844455718994, + "learning_rate": 8.868204400283889e-05, + "loss": 0.049691683053970336, + "step": 79770 + }, + { + "epoch": 11.324343506032648, + "grad_norm": 1.5685131549835205, + "learning_rate": 8.8680624556423e-05, + "loss": 0.012304575741291046, + "step": 79780 + }, + { + "epoch": 11.325762952448546, + "grad_norm": 1.084864616394043, + "learning_rate": 8.86792051100071e-05, + "loss": 0.031174218654632567, + "step": 79790 + }, + { + "epoch": 11.327182398864442, + "grad_norm": 8.637118339538574, + "learning_rate": 8.867778566359121e-05, + 
"loss": 0.01682147979736328, + "step": 79800 + }, + { + "epoch": 11.32860184528034, + "grad_norm": 0.6938183307647705, + "learning_rate": 8.867636621717531e-05, + "loss": 0.0065284594893455505, + "step": 79810 + }, + { + "epoch": 11.330021291696239, + "grad_norm": 0.4806843101978302, + "learning_rate": 8.86749467707594e-05, + "loss": 0.04211854636669159, + "step": 79820 + }, + { + "epoch": 11.331440738112136, + "grad_norm": 3.1211087703704834, + "learning_rate": 8.86735273243435e-05, + "loss": 0.017286308109760284, + "step": 79830 + }, + { + "epoch": 11.332860184528034, + "grad_norm": 0.42606401443481445, + "learning_rate": 8.867210787792761e-05, + "loss": 0.012424388527870178, + "step": 79840 + }, + { + "epoch": 11.334279630943932, + "grad_norm": 0.31800413131713867, + "learning_rate": 8.867068843151171e-05, + "loss": 0.015106824040412904, + "step": 79850 + }, + { + "epoch": 11.33569907735983, + "grad_norm": 8.675800323486328, + "learning_rate": 8.866926898509582e-05, + "loss": 0.06075004935264587, + "step": 79860 + }, + { + "epoch": 11.337118523775727, + "grad_norm": 0.16496433317661285, + "learning_rate": 8.866784953867992e-05, + "loss": 0.011707325279712678, + "step": 79870 + }, + { + "epoch": 11.338537970191625, + "grad_norm": 0.20603810250759125, + "learning_rate": 8.866643009226402e-05, + "loss": 0.03473392426967621, + "step": 79880 + }, + { + "epoch": 11.339957416607524, + "grad_norm": 0.07010184228420258, + "learning_rate": 8.866501064584813e-05, + "loss": 0.022533835470676424, + "step": 79890 + }, + { + "epoch": 11.34137686302342, + "grad_norm": 1.6609989404678345, + "learning_rate": 8.866359119943222e-05, + "loss": 0.05829617381095886, + "step": 79900 + }, + { + "epoch": 11.342796309439318, + "grad_norm": 3.047590970993042, + "learning_rate": 8.866217175301634e-05, + "loss": 0.04038935005664825, + "step": 79910 + }, + { + "epoch": 11.344215755855217, + "grad_norm": 0.25923168659210205, + "learning_rate": 8.866075230660042e-05, + "loss": 
0.047234049439430235, + "step": 79920 + }, + { + "epoch": 11.345635202271115, + "grad_norm": 3.83129620552063, + "learning_rate": 8.865933286018453e-05, + "loss": 0.03659171462059021, + "step": 79930 + }, + { + "epoch": 11.347054648687012, + "grad_norm": 6.185966968536377, + "learning_rate": 8.865791341376863e-05, + "loss": 0.03016130328178406, + "step": 79940 + }, + { + "epoch": 11.34847409510291, + "grad_norm": 0.1661355197429657, + "learning_rate": 8.865649396735274e-05, + "loss": 0.016263149678707123, + "step": 79950 + }, + { + "epoch": 11.349893541518808, + "grad_norm": 0.04041367396712303, + "learning_rate": 8.865507452093684e-05, + "loss": 0.022622223198413848, + "step": 79960 + }, + { + "epoch": 11.351312987934705, + "grad_norm": 13.005172729492188, + "learning_rate": 8.865365507452093e-05, + "loss": 0.03981154561042786, + "step": 79970 + }, + { + "epoch": 11.352732434350603, + "grad_norm": 0.0274747796356678, + "learning_rate": 8.865223562810504e-05, + "loss": 0.02526825964450836, + "step": 79980 + }, + { + "epoch": 11.354151880766501, + "grad_norm": 5.506161212921143, + "learning_rate": 8.865081618168914e-05, + "loss": 0.013444554805755616, + "step": 79990 + }, + { + "epoch": 11.3555713271824, + "grad_norm": 2.2358124256134033, + "learning_rate": 8.864939673527325e-05, + "loss": 0.06353100538253784, + "step": 80000 + }, + { + "epoch": 11.3555713271824, + "eval_accuracy": 0.9811152794557131, + "eval_loss": 0.0651044249534607, + "eval_runtime": 33.7254, + "eval_samples_per_second": 466.325, + "eval_steps_per_second": 14.588, + "step": 80000 + }, + { + "epoch": 11.356990773598296, + "grad_norm": 11.643047332763672, + "learning_rate": 8.864797728885735e-05, + "loss": 0.034117665886878965, + "step": 80010 + }, + { + "epoch": 11.358410220014195, + "grad_norm": 0.7074921727180481, + "learning_rate": 8.864655784244145e-05, + "loss": 0.02698880434036255, + "step": 80020 + }, + { + "epoch": 11.359829666430093, + "grad_norm": 5.939971446990967, + "learning_rate": 
8.864513839602554e-05, + "loss": 0.050882387161254886, + "step": 80030 + }, + { + "epoch": 11.36124911284599, + "grad_norm": 6.972652912139893, + "learning_rate": 8.864371894960966e-05, + "loss": 0.0220290869474411, + "step": 80040 + }, + { + "epoch": 11.362668559261888, + "grad_norm": 0.12394507229328156, + "learning_rate": 8.864229950319375e-05, + "loss": 0.011210134625434876, + "step": 80050 + }, + { + "epoch": 11.364088005677786, + "grad_norm": 0.2570814788341522, + "learning_rate": 8.864088005677786e-05, + "loss": 0.021895354986190795, + "step": 80060 + }, + { + "epoch": 11.365507452093684, + "grad_norm": 1.4537698030471802, + "learning_rate": 8.863946061036198e-05, + "loss": 0.04261449277400971, + "step": 80070 + }, + { + "epoch": 11.36692689850958, + "grad_norm": 0.11611072719097137, + "learning_rate": 8.863804116394606e-05, + "loss": 0.0075361371040344235, + "step": 80080 + }, + { + "epoch": 11.36834634492548, + "grad_norm": 8.89481258392334, + "learning_rate": 8.863662171753017e-05, + "loss": 0.010879594087600707, + "step": 80090 + }, + { + "epoch": 11.369765791341377, + "grad_norm": 1.2125955820083618, + "learning_rate": 8.863520227111427e-05, + "loss": 0.009992797672748566, + "step": 80100 + }, + { + "epoch": 11.371185237757274, + "grad_norm": 0.028409961611032486, + "learning_rate": 8.863378282469838e-05, + "loss": 0.014760425686836243, + "step": 80110 + }, + { + "epoch": 11.372604684173172, + "grad_norm": 1.1265093088150024, + "learning_rate": 8.863236337828248e-05, + "loss": 0.01140914335846901, + "step": 80120 + }, + { + "epoch": 11.37402413058907, + "grad_norm": 0.033151112496852875, + "learning_rate": 8.863094393186657e-05, + "loss": 0.01173935979604721, + "step": 80130 + }, + { + "epoch": 11.375443577004969, + "grad_norm": 0.9151448011398315, + "learning_rate": 8.862952448545067e-05, + "loss": 0.011046409606933594, + "step": 80140 + }, + { + "epoch": 11.376863023420865, + "grad_norm": 0.5126664638519287, + "learning_rate": 8.862810503903478e-05, + 
"loss": 0.03162610232830047, + "step": 80150 + }, + { + "epoch": 11.378282469836764, + "grad_norm": 0.37595346570014954, + "learning_rate": 8.862668559261889e-05, + "loss": 0.04972442090511322, + "step": 80160 + }, + { + "epoch": 11.379701916252662, + "grad_norm": 0.044639065861701965, + "learning_rate": 8.862526614620299e-05, + "loss": 0.032768523693084715, + "step": 80170 + }, + { + "epoch": 11.381121362668559, + "grad_norm": 5.298313617706299, + "learning_rate": 8.862384669978709e-05, + "loss": 0.01692819893360138, + "step": 80180 + }, + { + "epoch": 11.382540809084457, + "grad_norm": 10.989527702331543, + "learning_rate": 8.862242725337118e-05, + "loss": 0.011271566897630692, + "step": 80190 + }, + { + "epoch": 11.383960255500355, + "grad_norm": 18.672773361206055, + "learning_rate": 8.86210078069553e-05, + "loss": 0.0333732008934021, + "step": 80200 + }, + { + "epoch": 11.385379701916253, + "grad_norm": 0.6371679902076721, + "learning_rate": 8.861958836053939e-05, + "loss": 0.03899551033973694, + "step": 80210 + }, + { + "epoch": 11.38679914833215, + "grad_norm": 0.4237102270126343, + "learning_rate": 8.86181689141235e-05, + "loss": 0.01527671068906784, + "step": 80220 + }, + { + "epoch": 11.388218594748048, + "grad_norm": 2.081489324569702, + "learning_rate": 8.861674946770759e-05, + "loss": 0.007568246126174927, + "step": 80230 + }, + { + "epoch": 11.389638041163947, + "grad_norm": 4.329316139221191, + "learning_rate": 8.86153300212917e-05, + "loss": 0.012238572537899017, + "step": 80240 + }, + { + "epoch": 11.391057487579843, + "grad_norm": 0.23751749098300934, + "learning_rate": 8.861391057487581e-05, + "loss": 0.023023539781570436, + "step": 80250 + }, + { + "epoch": 11.392476933995741, + "grad_norm": 3.3496580123901367, + "learning_rate": 8.86124911284599e-05, + "loss": 0.04121274352073669, + "step": 80260 + }, + { + "epoch": 11.39389638041164, + "grad_norm": 10.261673927307129, + "learning_rate": 8.861107168204402e-05, + "loss": 0.04002052247524261, + 
"step": 80270 + }, + { + "epoch": 11.395315826827538, + "grad_norm": 0.14845259487628937, + "learning_rate": 8.86096522356281e-05, + "loss": 0.028948825597763062, + "step": 80280 + }, + { + "epoch": 11.396735273243435, + "grad_norm": 2.195971965789795, + "learning_rate": 8.860823278921221e-05, + "loss": 0.030800750851631163, + "step": 80290 + }, + { + "epoch": 11.398154719659333, + "grad_norm": 1.8944286108016968, + "learning_rate": 8.860681334279631e-05, + "loss": 0.049839770793914794, + "step": 80300 + }, + { + "epoch": 11.399574166075231, + "grad_norm": 2.8817431926727295, + "learning_rate": 8.860539389638042e-05, + "loss": 0.03240306973457337, + "step": 80310 + }, + { + "epoch": 11.400993612491128, + "grad_norm": 13.485665321350098, + "learning_rate": 8.860397444996452e-05, + "loss": 0.03296520709991455, + "step": 80320 + }, + { + "epoch": 11.402413058907026, + "grad_norm": 0.11741983145475388, + "learning_rate": 8.860255500354862e-05, + "loss": 0.06958463191986083, + "step": 80330 + }, + { + "epoch": 11.403832505322924, + "grad_norm": 5.842444896697998, + "learning_rate": 8.860113555713273e-05, + "loss": 0.03531903624534607, + "step": 80340 + }, + { + "epoch": 11.405251951738823, + "grad_norm": 5.8268327713012695, + "learning_rate": 8.859971611071682e-05, + "loss": 0.03155616819858551, + "step": 80350 + }, + { + "epoch": 11.40667139815472, + "grad_norm": 0.37767449021339417, + "learning_rate": 8.859829666430093e-05, + "loss": 0.06218478083610535, + "step": 80360 + }, + { + "epoch": 11.408090844570618, + "grad_norm": 0.15035152435302734, + "learning_rate": 8.859687721788503e-05, + "loss": 0.028446558117866515, + "step": 80370 + }, + { + "epoch": 11.409510290986516, + "grad_norm": 11.59382152557373, + "learning_rate": 8.859545777146913e-05, + "loss": 0.026360827684402465, + "step": 80380 + }, + { + "epoch": 11.410929737402412, + "grad_norm": 2.5553624629974365, + "learning_rate": 8.859403832505323e-05, + "loss": 0.00743272751569748, + "step": 80390 + }, + { + 
"epoch": 11.41234918381831, + "grad_norm": 3.6909255981445312, + "learning_rate": 8.859261887863734e-05, + "loss": 0.015209051966667175, + "step": 80400 + }, + { + "epoch": 11.413768630234209, + "grad_norm": 0.004849972203373909, + "learning_rate": 8.859119943222143e-05, + "loss": 0.04423539340496063, + "step": 80410 + }, + { + "epoch": 11.415188076650107, + "grad_norm": 6.711785316467285, + "learning_rate": 8.858977998580555e-05, + "loss": 0.036108124256134036, + "step": 80420 + }, + { + "epoch": 11.416607523066004, + "grad_norm": 3.5907931327819824, + "learning_rate": 8.858836053938964e-05, + "loss": 0.035010167956352235, + "step": 80430 + }, + { + "epoch": 11.418026969481902, + "grad_norm": 3.7037017345428467, + "learning_rate": 8.858694109297374e-05, + "loss": 0.05554625988006592, + "step": 80440 + }, + { + "epoch": 11.4194464158978, + "grad_norm": 2.50538969039917, + "learning_rate": 8.858552164655785e-05, + "loss": 0.039789438247680664, + "step": 80450 + }, + { + "epoch": 11.420865862313697, + "grad_norm": 0.15071162581443787, + "learning_rate": 8.858410220014195e-05, + "loss": 0.030578255653381348, + "step": 80460 + }, + { + "epoch": 11.422285308729595, + "grad_norm": 8.107895851135254, + "learning_rate": 8.858268275372606e-05, + "loss": 0.06292965412139892, + "step": 80470 + }, + { + "epoch": 11.423704755145494, + "grad_norm": 4.886325836181641, + "learning_rate": 8.858126330731016e-05, + "loss": 0.05497379899024964, + "step": 80480 + }, + { + "epoch": 11.425124201561392, + "grad_norm": 0.04609977453947067, + "learning_rate": 8.857984386089425e-05, + "loss": 0.031700408458709715, + "step": 80490 + }, + { + "epoch": 11.426543647977288, + "grad_norm": 5.431364059448242, + "learning_rate": 8.857842441447835e-05, + "loss": 0.0376334011554718, + "step": 80500 + }, + { + "epoch": 11.426543647977288, + "eval_accuracy": 0.981941883385261, + "eval_loss": 0.06595078110694885, + "eval_runtime": 33.4544, + "eval_samples_per_second": 470.102, + "eval_steps_per_second": 
14.707, + "step": 80500 + }, + { + "epoch": 11.427963094393187, + "grad_norm": 1.621141791343689, + "learning_rate": 8.857700496806246e-05, + "loss": 0.043695205450057985, + "step": 80510 + }, + { + "epoch": 11.429382540809085, + "grad_norm": 2.8783621788024902, + "learning_rate": 8.857558552164656e-05, + "loss": 0.01851654052734375, + "step": 80520 + }, + { + "epoch": 11.430801987224982, + "grad_norm": 1.7176443338394165, + "learning_rate": 8.857416607523067e-05, + "loss": 0.047656843066215517, + "step": 80530 + }, + { + "epoch": 11.43222143364088, + "grad_norm": 0.0061423284932971, + "learning_rate": 8.857274662881477e-05, + "loss": 0.06792023777961731, + "step": 80540 + }, + { + "epoch": 11.433640880056778, + "grad_norm": 0.18173955380916595, + "learning_rate": 8.857132718239887e-05, + "loss": 0.0309945285320282, + "step": 80550 + }, + { + "epoch": 11.435060326472676, + "grad_norm": 7.272887229919434, + "learning_rate": 8.856990773598298e-05, + "loss": 0.06236481070518494, + "step": 80560 + }, + { + "epoch": 11.436479772888573, + "grad_norm": 6.205739974975586, + "learning_rate": 8.856848828956707e-05, + "loss": 0.04723560214042664, + "step": 80570 + }, + { + "epoch": 11.437899219304471, + "grad_norm": 5.547316074371338, + "learning_rate": 8.856706884315119e-05, + "loss": 0.028773194551467894, + "step": 80580 + }, + { + "epoch": 11.43931866572037, + "grad_norm": 0.14390479028224945, + "learning_rate": 8.856564939673527e-05, + "loss": 0.01777784675359726, + "step": 80590 + }, + { + "epoch": 11.440738112136266, + "grad_norm": 0.3887863755226135, + "learning_rate": 8.856422995031938e-05, + "loss": 0.040037679672241214, + "step": 80600 + }, + { + "epoch": 11.442157558552164, + "grad_norm": 4.7135748863220215, + "learning_rate": 8.856281050390348e-05, + "loss": 0.01964379847049713, + "step": 80610 + }, + { + "epoch": 11.443577004968063, + "grad_norm": 4.665762424468994, + "learning_rate": 8.856139105748759e-05, + "loss": 0.06711235046386718, + "step": 80620 + }, + { 
+ "epoch": 11.444996451383961, + "grad_norm": 7.4186248779296875, + "learning_rate": 8.855997161107169e-05, + "loss": 0.04224056899547577, + "step": 80630 + }, + { + "epoch": 11.446415897799858, + "grad_norm": 1.482706069946289, + "learning_rate": 8.855855216465578e-05, + "loss": 0.02505524754524231, + "step": 80640 + }, + { + "epoch": 11.447835344215756, + "grad_norm": 0.08347708731889725, + "learning_rate": 8.85571327182399e-05, + "loss": 0.033928149938583375, + "step": 80650 + }, + { + "epoch": 11.449254790631654, + "grad_norm": 1.5397976636886597, + "learning_rate": 8.855571327182399e-05, + "loss": 0.0054555382579565045, + "step": 80660 + }, + { + "epoch": 11.45067423704755, + "grad_norm": 0.7289921045303345, + "learning_rate": 8.85542938254081e-05, + "loss": 0.03309193551540375, + "step": 80670 + }, + { + "epoch": 11.452093683463449, + "grad_norm": 4.813479900360107, + "learning_rate": 8.85528743789922e-05, + "loss": 0.09139240980148315, + "step": 80680 + }, + { + "epoch": 11.453513129879347, + "grad_norm": 10.499625205993652, + "learning_rate": 8.85514549325763e-05, + "loss": 0.040348267555236815, + "step": 80690 + }, + { + "epoch": 11.454932576295246, + "grad_norm": 1.5510531663894653, + "learning_rate": 8.85500354861604e-05, + "loss": 0.04443258345127106, + "step": 80700 + }, + { + "epoch": 11.456352022711142, + "grad_norm": 5.092698097229004, + "learning_rate": 8.85486160397445e-05, + "loss": 0.05160248279571533, + "step": 80710 + }, + { + "epoch": 11.45777146912704, + "grad_norm": 6.294242858886719, + "learning_rate": 8.85471965933286e-05, + "loss": 0.02735318839550018, + "step": 80720 + }, + { + "epoch": 11.459190915542939, + "grad_norm": 0.024253351613879204, + "learning_rate": 8.854577714691271e-05, + "loss": 0.03131797909736633, + "step": 80730 + }, + { + "epoch": 11.460610361958835, + "grad_norm": 4.466399192810059, + "learning_rate": 8.854435770049681e-05, + "loss": 0.03240303099155426, + "step": 80740 + }, + { + "epoch": 11.462029808374734, + 
"grad_norm": 0.31110042333602905, + "learning_rate": 8.854293825408091e-05, + "loss": 0.03598732352256775, + "step": 80750 + }, + { + "epoch": 11.463449254790632, + "grad_norm": 0.1252836138010025, + "learning_rate": 8.854151880766502e-05, + "loss": 0.03730970919132233, + "step": 80760 + }, + { + "epoch": 11.46486870120653, + "grad_norm": 0.3589157462120056, + "learning_rate": 8.854009936124912e-05, + "loss": 0.00836140513420105, + "step": 80770 + }, + { + "epoch": 11.466288147622427, + "grad_norm": 0.5227982401847839, + "learning_rate": 8.853867991483323e-05, + "loss": 0.021766206622123717, + "step": 80780 + }, + { + "epoch": 11.467707594038325, + "grad_norm": 0.19833236932754517, + "learning_rate": 8.853726046841733e-05, + "loss": 0.00756232813000679, + "step": 80790 + }, + { + "epoch": 11.469127040454223, + "grad_norm": 0.6320415139198303, + "learning_rate": 8.853584102200142e-05, + "loss": 0.058226609230041505, + "step": 80800 + }, + { + "epoch": 11.47054648687012, + "grad_norm": 1.0651955604553223, + "learning_rate": 8.853442157558552e-05, + "loss": 0.04778565466403961, + "step": 80810 + }, + { + "epoch": 11.471965933286018, + "grad_norm": 5.570420265197754, + "learning_rate": 8.853300212916963e-05, + "loss": 0.0344421774148941, + "step": 80820 + }, + { + "epoch": 11.473385379701917, + "grad_norm": 0.3369903266429901, + "learning_rate": 8.853158268275373e-05, + "loss": 0.03234374821186066, + "step": 80830 + }, + { + "epoch": 11.474804826117815, + "grad_norm": 0.012835043482482433, + "learning_rate": 8.853016323633784e-05, + "loss": 0.02318093180656433, + "step": 80840 + }, + { + "epoch": 11.476224272533711, + "grad_norm": 1.5728015899658203, + "learning_rate": 8.852874378992194e-05, + "loss": 0.023737967014312744, + "step": 80850 + }, + { + "epoch": 11.47764371894961, + "grad_norm": 0.11739411950111389, + "learning_rate": 8.852732434350603e-05, + "loss": 0.05025478005409241, + "step": 80860 + }, + { + "epoch": 11.479063165365508, + "grad_norm": 
0.9397228360176086, + "learning_rate": 8.852590489709014e-05, + "loss": 0.04425714910030365, + "step": 80870 + }, + { + "epoch": 11.480482611781405, + "grad_norm": 6.053242206573486, + "learning_rate": 8.852448545067424e-05, + "loss": 0.07350468635559082, + "step": 80880 + }, + { + "epoch": 11.481902058197303, + "grad_norm": 0.6401218175888062, + "learning_rate": 8.852306600425835e-05, + "loss": 0.04295220375061035, + "step": 80890 + }, + { + "epoch": 11.483321504613201, + "grad_norm": 1.2638099193572998, + "learning_rate": 8.852164655784244e-05, + "loss": 0.06553231477737427, + "step": 80900 + }, + { + "epoch": 11.4847409510291, + "grad_norm": 0.16520237922668457, + "learning_rate": 8.852022711142655e-05, + "loss": 0.022380702197551727, + "step": 80910 + }, + { + "epoch": 11.486160397444996, + "grad_norm": 0.17308127880096436, + "learning_rate": 8.851880766501065e-05, + "loss": 0.03869318962097168, + "step": 80920 + }, + { + "epoch": 11.487579843860894, + "grad_norm": 4.885634422302246, + "learning_rate": 8.851738821859476e-05, + "loss": 0.037957805395126346, + "step": 80930 + }, + { + "epoch": 11.488999290276793, + "grad_norm": 7.335200786590576, + "learning_rate": 8.851596877217885e-05, + "loss": 0.030105233192443848, + "step": 80940 + }, + { + "epoch": 11.490418736692689, + "grad_norm": 5.766997814178467, + "learning_rate": 8.851454932576295e-05, + "loss": 0.04862077534198761, + "step": 80950 + }, + { + "epoch": 11.491838183108587, + "grad_norm": 0.8015795350074768, + "learning_rate": 8.851312987934706e-05, + "loss": 0.032071438431739804, + "step": 80960 + }, + { + "epoch": 11.493257629524486, + "grad_norm": 2.2064619064331055, + "learning_rate": 8.851171043293116e-05, + "loss": 0.02999696135520935, + "step": 80970 + }, + { + "epoch": 11.494677075940384, + "grad_norm": 8.180752754211426, + "learning_rate": 8.851029098651527e-05, + "loss": 0.032754439115524295, + "step": 80980 + }, + { + "epoch": 11.49609652235628, + "grad_norm": 8.906973838806152, + 
"learning_rate": 8.850887154009937e-05, + "loss": 0.014296115934848785, + "step": 80990 + }, + { + "epoch": 11.497515968772179, + "grad_norm": 4.91297721862793, + "learning_rate": 8.850745209368346e-05, + "loss": 0.022449912130832674, + "step": 81000 + }, + { + "epoch": 11.497515968772179, + "eval_accuracy": 0.9774909391492338, + "eval_loss": 0.07550395280122757, + "eval_runtime": 32.707, + "eval_samples_per_second": 480.844, + "eval_steps_per_second": 15.043, + "step": 81000 + }, + { + "epoch": 11.498935415188077, + "grad_norm": 1.925844430923462, + "learning_rate": 8.850603264726756e-05, + "loss": 0.10111439228057861, + "step": 81010 + }, + { + "epoch": 11.500354861603974, + "grad_norm": 0.3102894425392151, + "learning_rate": 8.850461320085167e-05, + "loss": 0.02656802237033844, + "step": 81020 + }, + { + "epoch": 11.501774308019872, + "grad_norm": 0.8817775845527649, + "learning_rate": 8.850319375443577e-05, + "loss": 0.009132151305675507, + "step": 81030 + }, + { + "epoch": 11.50319375443577, + "grad_norm": 7.1372809410095215, + "learning_rate": 8.850177430801988e-05, + "loss": 0.023169676959514617, + "step": 81040 + }, + { + "epoch": 11.504613200851669, + "grad_norm": 0.17022785544395447, + "learning_rate": 8.850035486160398e-05, + "loss": 0.010735802352428436, + "step": 81050 + }, + { + "epoch": 11.506032647267565, + "grad_norm": 0.09135201573371887, + "learning_rate": 8.849893541518808e-05, + "loss": 0.02542368769645691, + "step": 81060 + }, + { + "epoch": 11.507452093683463, + "grad_norm": 1.0567549467086792, + "learning_rate": 8.849751596877219e-05, + "loss": 0.02125450372695923, + "step": 81070 + }, + { + "epoch": 11.508871540099362, + "grad_norm": 0.1266782283782959, + "learning_rate": 8.849609652235628e-05, + "loss": 0.015872204303741456, + "step": 81080 + }, + { + "epoch": 11.510290986515258, + "grad_norm": 6.861550807952881, + "learning_rate": 8.84946770759404e-05, + "loss": 0.032646551728248596, + "step": 81090 + }, + { + "epoch": 11.511710432931157, 
+ "grad_norm": 0.036920398473739624, + "learning_rate": 8.849325762952448e-05, + "loss": 0.01691252291202545, + "step": 81100 + }, + { + "epoch": 11.513129879347055, + "grad_norm": 0.8152676224708557, + "learning_rate": 8.849183818310859e-05, + "loss": 0.011697210371494293, + "step": 81110 + }, + { + "epoch": 11.514549325762953, + "grad_norm": 7.33953332901001, + "learning_rate": 8.849041873669269e-05, + "loss": 0.02431018799543381, + "step": 81120 + }, + { + "epoch": 11.51596877217885, + "grad_norm": 0.4651520550251007, + "learning_rate": 8.84889992902768e-05, + "loss": 0.023129910230636597, + "step": 81130 + }, + { + "epoch": 11.517388218594748, + "grad_norm": 0.4674939811229706, + "learning_rate": 8.84875798438609e-05, + "loss": 0.03972046971321106, + "step": 81140 + }, + { + "epoch": 11.518807665010646, + "grad_norm": 8.952187538146973, + "learning_rate": 8.8486160397445e-05, + "loss": 0.05522555708885193, + "step": 81150 + }, + { + "epoch": 11.520227111426543, + "grad_norm": 0.10738738626241684, + "learning_rate": 8.84847409510291e-05, + "loss": 0.012743067741394044, + "step": 81160 + }, + { + "epoch": 11.521646557842441, + "grad_norm": 0.03591417148709297, + "learning_rate": 8.84833215046132e-05, + "loss": 0.021634458005428313, + "step": 81170 + }, + { + "epoch": 11.52306600425834, + "grad_norm": 0.01823955960571766, + "learning_rate": 8.848190205819731e-05, + "loss": 0.014959985017776489, + "step": 81180 + }, + { + "epoch": 11.524485450674238, + "grad_norm": 0.3199457824230194, + "learning_rate": 8.848048261178141e-05, + "loss": 0.020771077275276183, + "step": 81190 + }, + { + "epoch": 11.525904897090134, + "grad_norm": 3.928478956222534, + "learning_rate": 8.847906316536552e-05, + "loss": 0.04103337228298187, + "step": 81200 + }, + { + "epoch": 11.527324343506033, + "grad_norm": 0.09946906566619873, + "learning_rate": 8.84776437189496e-05, + "loss": 0.03343590497970581, + "step": 81210 + }, + { + "epoch": 11.528743789921931, + "grad_norm": 
0.22660058736801147, + "learning_rate": 8.847622427253372e-05, + "loss": 0.021284933388233184, + "step": 81220 + }, + { + "epoch": 11.530163236337827, + "grad_norm": 0.03823212906718254, + "learning_rate": 8.847480482611781e-05, + "loss": 0.03749181628227234, + "step": 81230 + }, + { + "epoch": 11.531582682753726, + "grad_norm": 2.1197917461395264, + "learning_rate": 8.847338537970192e-05, + "loss": 0.05521925687789917, + "step": 81240 + }, + { + "epoch": 11.533002129169624, + "grad_norm": 12.29832649230957, + "learning_rate": 8.847196593328602e-05, + "loss": 0.036048969626426695, + "step": 81250 + }, + { + "epoch": 11.534421575585522, + "grad_norm": 0.5513697266578674, + "learning_rate": 8.847054648687012e-05, + "loss": 0.013157932460308075, + "step": 81260 + }, + { + "epoch": 11.535841022001419, + "grad_norm": 9.3153715133667, + "learning_rate": 8.846912704045423e-05, + "loss": 0.07485218048095703, + "step": 81270 + }, + { + "epoch": 11.537260468417317, + "grad_norm": 9.140734672546387, + "learning_rate": 8.846770759403833e-05, + "loss": 0.07693561911582947, + "step": 81280 + }, + { + "epoch": 11.538679914833216, + "grad_norm": 0.37796759605407715, + "learning_rate": 8.846628814762244e-05, + "loss": 0.06537792086601257, + "step": 81290 + }, + { + "epoch": 11.540099361249112, + "grad_norm": 7.079742431640625, + "learning_rate": 8.846486870120654e-05, + "loss": 0.025105878710746765, + "step": 81300 + }, + { + "epoch": 11.54151880766501, + "grad_norm": 0.3768748939037323, + "learning_rate": 8.846344925479063e-05, + "loss": 0.054667305946350095, + "step": 81310 + }, + { + "epoch": 11.542938254080909, + "grad_norm": 4.9976019859313965, + "learning_rate": 8.846202980837473e-05, + "loss": 0.06892080307006836, + "step": 81320 + }, + { + "epoch": 11.544357700496807, + "grad_norm": 0.3432539701461792, + "learning_rate": 8.846061036195884e-05, + "loss": 0.01103949099779129, + "step": 81330 + }, + { + "epoch": 11.545777146912704, + "grad_norm": 2.6538197994232178, + 
"learning_rate": 8.845919091554294e-05, + "loss": 0.01662140041589737, + "step": 81340 + }, + { + "epoch": 11.547196593328602, + "grad_norm": 0.8760688304901123, + "learning_rate": 8.845777146912705e-05, + "loss": 0.03641084134578705, + "step": 81350 + }, + { + "epoch": 11.5486160397445, + "grad_norm": 5.520683765411377, + "learning_rate": 8.845635202271115e-05, + "loss": 0.01766434609889984, + "step": 81360 + }, + { + "epoch": 11.550035486160397, + "grad_norm": 1.819739818572998, + "learning_rate": 8.845493257629524e-05, + "loss": 0.010377876460552216, + "step": 81370 + }, + { + "epoch": 11.551454932576295, + "grad_norm": 1.5828450918197632, + "learning_rate": 8.845351312987935e-05, + "loss": 0.02364148199558258, + "step": 81380 + }, + { + "epoch": 11.552874378992193, + "grad_norm": 0.13533614575862885, + "learning_rate": 8.845209368346345e-05, + "loss": 0.027328240871429443, + "step": 81390 + }, + { + "epoch": 11.554293825408092, + "grad_norm": 12.122629165649414, + "learning_rate": 8.845067423704756e-05, + "loss": 0.04072291851043701, + "step": 81400 + }, + { + "epoch": 11.555713271823988, + "grad_norm": 10.578282356262207, + "learning_rate": 8.844925479063165e-05, + "loss": 0.009069137275218964, + "step": 81410 + }, + { + "epoch": 11.557132718239886, + "grad_norm": 5.478339672088623, + "learning_rate": 8.844783534421576e-05, + "loss": 0.02398311048746109, + "step": 81420 + }, + { + "epoch": 11.558552164655785, + "grad_norm": 2.616616725921631, + "learning_rate": 8.844641589779986e-05, + "loss": 0.0067227624356746675, + "step": 81430 + }, + { + "epoch": 11.559971611071681, + "grad_norm": 0.09911337494850159, + "learning_rate": 8.844499645138397e-05, + "loss": 0.02870553731918335, + "step": 81440 + }, + { + "epoch": 11.56139105748758, + "grad_norm": 0.030385294929146767, + "learning_rate": 8.844357700496806e-05, + "loss": 0.011173336207866669, + "step": 81450 + }, + { + "epoch": 11.562810503903478, + "grad_norm": 0.06865376234054565, + "learning_rate": 
8.844215755855217e-05, + "loss": 0.025823640823364257, + "step": 81460 + }, + { + "epoch": 11.564229950319376, + "grad_norm": 3.7182939052581787, + "learning_rate": 8.844073811213627e-05, + "loss": 0.05339401960372925, + "step": 81470 + }, + { + "epoch": 11.565649396735273, + "grad_norm": 1.3230258226394653, + "learning_rate": 8.843931866572037e-05, + "loss": 0.018890395760536194, + "step": 81480 + }, + { + "epoch": 11.567068843151171, + "grad_norm": 3.3779211044311523, + "learning_rate": 8.843789921930448e-05, + "loss": 0.00660952776670456, + "step": 81490 + }, + { + "epoch": 11.56848828956707, + "grad_norm": 0.18795613944530487, + "learning_rate": 8.843647977288858e-05, + "loss": 0.019702652096748353, + "step": 81500 + }, + { + "epoch": 11.56848828956707, + "eval_accuracy": 0.9763464106313983, + "eval_loss": 0.08224768191576004, + "eval_runtime": 33.7438, + "eval_samples_per_second": 466.071, + "eval_steps_per_second": 14.58, + "step": 81500 + }, + { + "epoch": 11.569907735982966, + "grad_norm": 0.34426751732826233, + "learning_rate": 8.843506032647269e-05, + "loss": 0.03004116714000702, + "step": 81510 + }, + { + "epoch": 11.571327182398864, + "grad_norm": 0.4941084086894989, + "learning_rate": 8.843364088005677e-05, + "loss": 0.022238391637802123, + "step": 81520 + }, + { + "epoch": 11.572746628814762, + "grad_norm": 11.108928680419922, + "learning_rate": 8.843222143364088e-05, + "loss": 0.056462281942367555, + "step": 81530 + }, + { + "epoch": 11.57416607523066, + "grad_norm": 0.16970746219158173, + "learning_rate": 8.843080198722498e-05, + "loss": 0.021629197895526885, + "step": 81540 + }, + { + "epoch": 11.575585521646557, + "grad_norm": 0.09552083164453506, + "learning_rate": 8.842952448545068e-05, + "loss": 0.028470611572265624, + "step": 81550 + }, + { + "epoch": 11.577004968062456, + "grad_norm": 0.07120678573846817, + "learning_rate": 8.842810503903478e-05, + "loss": 0.02539893686771393, + "step": 81560 + }, + { + "epoch": 11.578424414478354, + 
"grad_norm": 2.5239343643188477, + "learning_rate": 8.842668559261889e-05, + "loss": 0.020925018191337585, + "step": 81570 + }, + { + "epoch": 11.57984386089425, + "grad_norm": 10.473538398742676, + "learning_rate": 8.842526614620298e-05, + "loss": 0.06097403764724731, + "step": 81580 + }, + { + "epoch": 11.581263307310149, + "grad_norm": 7.29902982711792, + "learning_rate": 8.842384669978708e-05, + "loss": 0.03347648978233338, + "step": 81590 + }, + { + "epoch": 11.582682753726047, + "grad_norm": 0.9426771402359009, + "learning_rate": 8.842242725337119e-05, + "loss": 0.02707792818546295, + "step": 81600 + }, + { + "epoch": 11.584102200141945, + "grad_norm": 1.5464801788330078, + "learning_rate": 8.842100780695529e-05, + "loss": 0.029200682044029237, + "step": 81610 + }, + { + "epoch": 11.585521646557842, + "grad_norm": 1.6313835382461548, + "learning_rate": 8.84195883605394e-05, + "loss": 0.036450433731079104, + "step": 81620 + }, + { + "epoch": 11.58694109297374, + "grad_norm": 4.770724296569824, + "learning_rate": 8.84181689141235e-05, + "loss": 0.048863834142684935, + "step": 81630 + }, + { + "epoch": 11.588360539389639, + "grad_norm": 0.085438571870327, + "learning_rate": 8.84167494677076e-05, + "loss": 0.042553862929344176, + "step": 81640 + }, + { + "epoch": 11.589779985805535, + "grad_norm": 0.8502871990203857, + "learning_rate": 8.84153300212917e-05, + "loss": 0.03484118282794953, + "step": 81650 + }, + { + "epoch": 11.591199432221433, + "grad_norm": 0.8441861271858215, + "learning_rate": 8.84139105748758e-05, + "loss": 0.030901208519935608, + "step": 81660 + }, + { + "epoch": 11.592618878637332, + "grad_norm": 6.2987871170043945, + "learning_rate": 8.84124911284599e-05, + "loss": 0.03970724940299988, + "step": 81670 + }, + { + "epoch": 11.59403832505323, + "grad_norm": 4.7662129402160645, + "learning_rate": 8.841107168204401e-05, + "loss": 0.01229364275932312, + "step": 81680 + }, + { + "epoch": 11.595457771469126, + "grad_norm": 1.5325942039489746, + 
"learning_rate": 8.840965223562811e-05, + "loss": 0.03952722549438477, + "step": 81690 + }, + { + "epoch": 11.596877217885025, + "grad_norm": 2.52839994430542, + "learning_rate": 8.840823278921221e-05, + "loss": 0.02153339087963104, + "step": 81700 + }, + { + "epoch": 11.598296664300923, + "grad_norm": 0.5760644674301147, + "learning_rate": 8.840681334279632e-05, + "loss": 0.0679667830467224, + "step": 81710 + }, + { + "epoch": 11.59971611071682, + "grad_norm": 5.679330825805664, + "learning_rate": 8.840539389638042e-05, + "loss": 0.042963171005249025, + "step": 81720 + }, + { + "epoch": 11.601135557132718, + "grad_norm": 0.11442669481039047, + "learning_rate": 8.840397444996453e-05, + "loss": 0.012864866852760315, + "step": 81730 + }, + { + "epoch": 11.602555003548616, + "grad_norm": 9.763188362121582, + "learning_rate": 8.840255500354861e-05, + "loss": 0.03386241793632507, + "step": 81740 + }, + { + "epoch": 11.603974449964515, + "grad_norm": 1.1611683368682861, + "learning_rate": 8.840113555713272e-05, + "loss": 0.029190993309020995, + "step": 81750 + }, + { + "epoch": 11.605393896380411, + "grad_norm": 0.17620205879211426, + "learning_rate": 8.839971611071682e-05, + "loss": 0.03684349656105042, + "step": 81760 + }, + { + "epoch": 11.60681334279631, + "grad_norm": 1.8045251369476318, + "learning_rate": 8.839829666430093e-05, + "loss": 0.050387269258499144, + "step": 81770 + }, + { + "epoch": 11.608232789212208, + "grad_norm": 0.3409873843193054, + "learning_rate": 8.839687721788503e-05, + "loss": 0.022375202178955077, + "step": 81780 + }, + { + "epoch": 11.609652235628104, + "grad_norm": 1.041752815246582, + "learning_rate": 8.839545777146914e-05, + "loss": 0.021577396988868715, + "step": 81790 + }, + { + "epoch": 11.611071682044003, + "grad_norm": 0.13355006277561188, + "learning_rate": 8.839403832505324e-05, + "loss": 0.015710872411727906, + "step": 81800 + }, + { + "epoch": 11.6124911284599, + "grad_norm": 0.040545929223299026, + "learning_rate": 
8.839261887863733e-05, + "loss": 0.062452536821365354, + "step": 81810 + }, + { + "epoch": 11.6139105748758, + "grad_norm": 0.624416708946228, + "learning_rate": 8.839119943222144e-05, + "loss": 0.005656691268086433, + "step": 81820 + }, + { + "epoch": 11.615330021291696, + "grad_norm": 4.042696952819824, + "learning_rate": 8.838977998580554e-05, + "loss": 0.011597123742103577, + "step": 81830 + }, + { + "epoch": 11.616749467707594, + "grad_norm": 0.06087561324238777, + "learning_rate": 8.838836053938965e-05, + "loss": 0.031940600275993346, + "step": 81840 + }, + { + "epoch": 11.618168914123492, + "grad_norm": 10.522989273071289, + "learning_rate": 8.838694109297374e-05, + "loss": 0.019474413990974427, + "step": 81850 + }, + { + "epoch": 11.619588360539389, + "grad_norm": 0.5727241039276123, + "learning_rate": 8.838552164655785e-05, + "loss": 0.0022082440555095673, + "step": 81860 + }, + { + "epoch": 11.621007806955287, + "grad_norm": 0.49038177728652954, + "learning_rate": 8.838410220014194e-05, + "loss": 0.012241747230291367, + "step": 81870 + }, + { + "epoch": 11.622427253371185, + "grad_norm": 1.801270842552185, + "learning_rate": 8.838268275372606e-05, + "loss": 0.011108881235122681, + "step": 81880 + }, + { + "epoch": 11.623846699787084, + "grad_norm": 1.095993161201477, + "learning_rate": 8.838126330731015e-05, + "loss": 0.007870152592658997, + "step": 81890 + }, + { + "epoch": 11.62526614620298, + "grad_norm": 0.8447113633155823, + "learning_rate": 8.837984386089425e-05, + "loss": 0.01694517433643341, + "step": 81900 + }, + { + "epoch": 11.626685592618879, + "grad_norm": 0.6038371324539185, + "learning_rate": 8.837842441447836e-05, + "loss": 0.0033034585416316987, + "step": 81910 + }, + { + "epoch": 11.628105039034777, + "grad_norm": 0.1463266760110855, + "learning_rate": 8.837700496806246e-05, + "loss": 0.03691132664680481, + "step": 81920 + }, + { + "epoch": 11.629524485450673, + "grad_norm": 0.01949205808341503, + "learning_rate": 8.837558552164657e-05, 
+ "loss": 0.030641201138496398, + "step": 81930 + }, + { + "epoch": 11.630943931866572, + "grad_norm": 0.01865651085972786, + "learning_rate": 8.837416607523067e-05, + "loss": 0.06486660838127137, + "step": 81940 + }, + { + "epoch": 11.63236337828247, + "grad_norm": 0.44764140248298645, + "learning_rate": 8.837274662881476e-05, + "loss": 0.03592503070831299, + "step": 81950 + }, + { + "epoch": 11.633782824698368, + "grad_norm": 6.017378807067871, + "learning_rate": 8.837132718239886e-05, + "loss": 0.025605541467666627, + "step": 81960 + }, + { + "epoch": 11.635202271114265, + "grad_norm": 4.395421981811523, + "learning_rate": 8.836990773598297e-05, + "loss": 0.03425087332725525, + "step": 81970 + }, + { + "epoch": 11.636621717530163, + "grad_norm": 3.474912405014038, + "learning_rate": 8.836848828956707e-05, + "loss": 0.007524078339338302, + "step": 81980 + }, + { + "epoch": 11.638041163946061, + "grad_norm": 7.506344318389893, + "learning_rate": 8.836706884315118e-05, + "loss": 0.02990333139896393, + "step": 81990 + }, + { + "epoch": 11.639460610361958, + "grad_norm": 1.1138578653335571, + "learning_rate": 8.836564939673528e-05, + "loss": 0.02096046507358551, + "step": 82000 + }, + { + "epoch": 11.639460610361958, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.04104387387633324, + "eval_runtime": 34.0548, + "eval_samples_per_second": 461.814, + "eval_steps_per_second": 14.447, + "step": 82000 + }, + { + "epoch": 11.640880056777856, + "grad_norm": 0.22225643694400787, + "learning_rate": 8.836422995031938e-05, + "loss": 0.02522968649864197, + "step": 82010 + }, + { + "epoch": 11.642299503193755, + "grad_norm": 5.2800140380859375, + "learning_rate": 8.836281050390349e-05, + "loss": 0.055390971899032596, + "step": 82020 + }, + { + "epoch": 11.643718949609653, + "grad_norm": 0.09609229862689972, + "learning_rate": 8.836139105748758e-05, + "loss": 0.012254738807678222, + "step": 82030 + }, + { + "epoch": 11.64513839602555, + "grad_norm": 2.6045665740966797, + 
"learning_rate": 8.83599716110717e-05, + "loss": 0.059720170497894284, + "step": 82040 + }, + { + "epoch": 11.646557842441448, + "grad_norm": 0.6698890924453735, + "learning_rate": 8.835855216465578e-05, + "loss": 0.022242045402526854, + "step": 82050 + }, + { + "epoch": 11.647977288857346, + "grad_norm": 0.3261481523513794, + "learning_rate": 8.835713271823989e-05, + "loss": 0.01315658837556839, + "step": 82060 + }, + { + "epoch": 11.649396735273243, + "grad_norm": 15.95113468170166, + "learning_rate": 8.835571327182399e-05, + "loss": 0.04952957332134247, + "step": 82070 + }, + { + "epoch": 11.650816181689141, + "grad_norm": 0.34951213002204895, + "learning_rate": 8.83542938254081e-05, + "loss": 0.028381425142288207, + "step": 82080 + }, + { + "epoch": 11.65223562810504, + "grad_norm": 0.33600348234176636, + "learning_rate": 8.83528743789922e-05, + "loss": 0.03296903371810913, + "step": 82090 + }, + { + "epoch": 11.653655074520938, + "grad_norm": 0.44056016206741333, + "learning_rate": 8.835145493257629e-05, + "loss": 0.00819028839468956, + "step": 82100 + }, + { + "epoch": 11.655074520936834, + "grad_norm": 0.9378697276115417, + "learning_rate": 8.83500354861604e-05, + "loss": 0.015890008211135863, + "step": 82110 + }, + { + "epoch": 11.656493967352732, + "grad_norm": 7.173929214477539, + "learning_rate": 8.83486160397445e-05, + "loss": 0.016848720610141754, + "step": 82120 + }, + { + "epoch": 11.65791341376863, + "grad_norm": 0.40014129877090454, + "learning_rate": 8.834719659332861e-05, + "loss": 0.03617895543575287, + "step": 82130 + }, + { + "epoch": 11.659332860184527, + "grad_norm": 2.238469362258911, + "learning_rate": 8.834577714691271e-05, + "loss": 0.028064021468162538, + "step": 82140 + }, + { + "epoch": 11.660752306600425, + "grad_norm": 3.2392852306365967, + "learning_rate": 8.834435770049682e-05, + "loss": 0.026870760321617126, + "step": 82150 + }, + { + "epoch": 11.662171753016324, + "grad_norm": 0.2407623678445816, + "learning_rate": 
8.83429382540809e-05, + "loss": 0.028642752766609193, + "step": 82160 + }, + { + "epoch": 11.663591199432222, + "grad_norm": 0.5530653595924377, + "learning_rate": 8.834151880766501e-05, + "loss": 0.0160242035984993, + "step": 82170 + }, + { + "epoch": 11.665010645848119, + "grad_norm": 5.290857315063477, + "learning_rate": 8.834009936124911e-05, + "loss": 0.05292970538139343, + "step": 82180 + }, + { + "epoch": 11.666430092264017, + "grad_norm": 0.8904526233673096, + "learning_rate": 8.833867991483322e-05, + "loss": 0.03078848421573639, + "step": 82190 + }, + { + "epoch": 11.667849538679915, + "grad_norm": 0.06986336410045624, + "learning_rate": 8.833726046841732e-05, + "loss": 0.017547787725925447, + "step": 82200 + }, + { + "epoch": 11.669268985095812, + "grad_norm": 9.345512390136719, + "learning_rate": 8.833584102200142e-05, + "loss": 0.0370622456073761, + "step": 82210 + }, + { + "epoch": 11.67068843151171, + "grad_norm": 2.8718032836914062, + "learning_rate": 8.833442157558553e-05, + "loss": 0.016177193820476533, + "step": 82220 + }, + { + "epoch": 11.672107877927608, + "grad_norm": 0.4981312155723572, + "learning_rate": 8.833300212916963e-05, + "loss": 0.020555400848388673, + "step": 82230 + }, + { + "epoch": 11.673527324343507, + "grad_norm": 5.187900543212891, + "learning_rate": 8.833158268275374e-05, + "loss": 0.03203676640987396, + "step": 82240 + }, + { + "epoch": 11.674946770759403, + "grad_norm": 0.5993130803108215, + "learning_rate": 8.833016323633783e-05, + "loss": 0.03552861511707306, + "step": 82250 + }, + { + "epoch": 11.676366217175302, + "grad_norm": 1.570359468460083, + "learning_rate": 8.832874378992193e-05, + "loss": 0.0360751211643219, + "step": 82260 + }, + { + "epoch": 11.6777856635912, + "grad_norm": 0.01849873550236225, + "learning_rate": 8.832732434350603e-05, + "loss": 0.07889996767044068, + "step": 82270 + }, + { + "epoch": 11.679205110007096, + "grad_norm": 4.044131755828857, + "learning_rate": 8.832590489709014e-05, + "loss": 
0.03975077271461487, + "step": 82280 + }, + { + "epoch": 11.680624556422995, + "grad_norm": 0.21400457620620728, + "learning_rate": 8.832448545067424e-05, + "loss": 0.028557685017585755, + "step": 82290 + }, + { + "epoch": 11.682044002838893, + "grad_norm": 2.2320234775543213, + "learning_rate": 8.832306600425835e-05, + "loss": 0.032077842950820924, + "step": 82300 + }, + { + "epoch": 11.683463449254791, + "grad_norm": 6.41154670715332, + "learning_rate": 8.832164655784245e-05, + "loss": 0.04047545492649078, + "step": 82310 + }, + { + "epoch": 11.684882895670688, + "grad_norm": 7.783628463745117, + "learning_rate": 8.832022711142654e-05, + "loss": 0.05753174424171448, + "step": 82320 + }, + { + "epoch": 11.686302342086586, + "grad_norm": 0.2691332697868347, + "learning_rate": 8.831880766501065e-05, + "loss": 0.033080264925956726, + "step": 82330 + }, + { + "epoch": 11.687721788502484, + "grad_norm": 0.08865190297365189, + "learning_rate": 8.831738821859475e-05, + "loss": 0.0328124463558197, + "step": 82340 + }, + { + "epoch": 11.689141234918381, + "grad_norm": 0.06198609247803688, + "learning_rate": 8.831596877217886e-05, + "loss": 0.032992076873779294, + "step": 82350 + }, + { + "epoch": 11.69056068133428, + "grad_norm": 1.0192947387695312, + "learning_rate": 8.831454932576295e-05, + "loss": 0.02295834571123123, + "step": 82360 + }, + { + "epoch": 11.691980127750178, + "grad_norm": 4.939817428588867, + "learning_rate": 8.831312987934706e-05, + "loss": 0.04259155392646789, + "step": 82370 + }, + { + "epoch": 11.693399574166076, + "grad_norm": 0.011144721880555153, + "learning_rate": 8.831171043293115e-05, + "loss": 0.011710671335458755, + "step": 82380 + }, + { + "epoch": 11.694819020581972, + "grad_norm": 17.873878479003906, + "learning_rate": 8.831029098651527e-05, + "loss": 0.03337354063987732, + "step": 82390 + }, + { + "epoch": 11.69623846699787, + "grad_norm": 0.6569585800170898, + "learning_rate": 8.830887154009938e-05, + "loss": 0.02401721030473709, + 
"step": 82400 + }, + { + "epoch": 11.697657913413769, + "grad_norm": 0.019921043887734413, + "learning_rate": 8.830745209368346e-05, + "loss": 0.046464449167251586, + "step": 82410 + }, + { + "epoch": 11.699077359829666, + "grad_norm": 1.3580775260925293, + "learning_rate": 8.830603264726757e-05, + "loss": 0.022325956821441652, + "step": 82420 + }, + { + "epoch": 11.700496806245564, + "grad_norm": 10.863458633422852, + "learning_rate": 8.830461320085167e-05, + "loss": 0.02395484447479248, + "step": 82430 + }, + { + "epoch": 11.701916252661462, + "grad_norm": 5.869367599487305, + "learning_rate": 8.830319375443578e-05, + "loss": 0.043967399001121524, + "step": 82440 + }, + { + "epoch": 11.70333569907736, + "grad_norm": 0.5498532056808472, + "learning_rate": 8.830177430801988e-05, + "loss": 0.02715992331504822, + "step": 82450 + }, + { + "epoch": 11.704755145493257, + "grad_norm": 0.06902461498975754, + "learning_rate": 8.830035486160397e-05, + "loss": 0.004207936301827431, + "step": 82460 + }, + { + "epoch": 11.706174591909155, + "grad_norm": 3.9329452514648438, + "learning_rate": 8.829893541518807e-05, + "loss": 0.031944799423217776, + "step": 82470 + }, + { + "epoch": 11.707594038325054, + "grad_norm": 4.1940107345581055, + "learning_rate": 8.829751596877218e-05, + "loss": 0.022048255801200865, + "step": 82480 + }, + { + "epoch": 11.70901348474095, + "grad_norm": 0.15101581811904907, + "learning_rate": 8.829609652235629e-05, + "loss": 0.021107760071754456, + "step": 82490 + }, + { + "epoch": 11.710432931156848, + "grad_norm": 0.04076530411839485, + "learning_rate": 8.829467707594039e-05, + "loss": 0.009322655946016311, + "step": 82500 + }, + { + "epoch": 11.710432931156848, + "eval_accuracy": 0.9867107522095759, + "eval_loss": 0.0509491004049778, + "eval_runtime": 33.6657, + "eval_samples_per_second": 467.153, + "eval_steps_per_second": 14.614, + "step": 82500 + }, + { + "epoch": 11.711852377572747, + "grad_norm": 12.230842590332031, + "learning_rate": 
8.82932576295245e-05, + "loss": 0.03886268138885498, + "step": 82510 + }, + { + "epoch": 11.713271823988645, + "grad_norm": 9.446490287780762, + "learning_rate": 8.829183818310859e-05, + "loss": 0.01960558593273163, + "step": 82520 + }, + { + "epoch": 11.714691270404542, + "grad_norm": 0.2041768878698349, + "learning_rate": 8.82904187366927e-05, + "loss": 0.028508198261260987, + "step": 82530 + }, + { + "epoch": 11.71611071682044, + "grad_norm": 1.630429983139038, + "learning_rate": 8.82889992902768e-05, + "loss": 0.03760896623134613, + "step": 82540 + }, + { + "epoch": 11.717530163236338, + "grad_norm": 12.508625030517578, + "learning_rate": 8.82875798438609e-05, + "loss": 0.02034131735563278, + "step": 82550 + }, + { + "epoch": 11.718949609652235, + "grad_norm": 3.5060606002807617, + "learning_rate": 8.8286160397445e-05, + "loss": 0.03896633386611938, + "step": 82560 + }, + { + "epoch": 11.720369056068133, + "grad_norm": 0.034930501133203506, + "learning_rate": 8.82847409510291e-05, + "loss": 0.04598281383514404, + "step": 82570 + }, + { + "epoch": 11.721788502484031, + "grad_norm": 3.448381185531616, + "learning_rate": 8.828332150461321e-05, + "loss": 0.036952057480812074, + "step": 82580 + }, + { + "epoch": 11.72320794889993, + "grad_norm": 0.0042781527154147625, + "learning_rate": 8.828190205819731e-05, + "loss": 0.05195838212966919, + "step": 82590 + }, + { + "epoch": 11.724627395315826, + "grad_norm": 3.1850550174713135, + "learning_rate": 8.828048261178142e-05, + "loss": 0.036840036511421204, + "step": 82600 + }, + { + "epoch": 11.726046841731725, + "grad_norm": 6.152586460113525, + "learning_rate": 8.827906316536552e-05, + "loss": 0.018132734298706054, + "step": 82610 + }, + { + "epoch": 11.727466288147623, + "grad_norm": 0.34917500615119934, + "learning_rate": 8.827764371894961e-05, + "loss": 0.035174581408500674, + "step": 82620 + }, + { + "epoch": 11.72888573456352, + "grad_norm": 4.656780242919922, + "learning_rate": 8.827622427253371e-05, + "loss": 
0.04415706992149353, + "step": 82630 + }, + { + "epoch": 11.730305180979418, + "grad_norm": 0.2968284785747528, + "learning_rate": 8.827480482611782e-05, + "loss": 0.045205461978912356, + "step": 82640 + }, + { + "epoch": 11.731724627395316, + "grad_norm": 2.32602596282959, + "learning_rate": 8.827338537970192e-05, + "loss": 0.008173373341560364, + "step": 82650 + }, + { + "epoch": 11.733144073811214, + "grad_norm": 0.49746090173721313, + "learning_rate": 8.827196593328603e-05, + "loss": 0.043424248695373535, + "step": 82660 + }, + { + "epoch": 11.73456352022711, + "grad_norm": 7.075344085693359, + "learning_rate": 8.827054648687011e-05, + "loss": 0.018824505805969238, + "step": 82670 + }, + { + "epoch": 11.735982966643009, + "grad_norm": 0.27628007531166077, + "learning_rate": 8.826912704045422e-05, + "loss": 0.035436037182807925, + "step": 82680 + }, + { + "epoch": 11.737402413058907, + "grad_norm": 3.1124513149261475, + "learning_rate": 8.826770759403834e-05, + "loss": 0.0338789314031601, + "step": 82690 + }, + { + "epoch": 11.738821859474804, + "grad_norm": 0.8742479085922241, + "learning_rate": 8.826628814762243e-05, + "loss": 0.022074520587921143, + "step": 82700 + }, + { + "epoch": 11.740241305890702, + "grad_norm": 4.0626373291015625, + "learning_rate": 8.826486870120654e-05, + "loss": 0.03586540520191193, + "step": 82710 + }, + { + "epoch": 11.7416607523066, + "grad_norm": 0.11810215562582016, + "learning_rate": 8.826344925479063e-05, + "loss": 0.030027234554290773, + "step": 82720 + }, + { + "epoch": 11.743080198722499, + "grad_norm": 0.0518762581050396, + "learning_rate": 8.826202980837474e-05, + "loss": 0.023009565472602845, + "step": 82730 + }, + { + "epoch": 11.744499645138395, + "grad_norm": 0.10839356482028961, + "learning_rate": 8.826061036195884e-05, + "loss": 0.05535359382629394, + "step": 82740 + }, + { + "epoch": 11.745919091554294, + "grad_norm": 0.44333168864250183, + "learning_rate": 8.825919091554295e-05, + "loss": 0.015498198568820953, + 
"step": 82750 + }, + { + "epoch": 11.747338537970192, + "grad_norm": 2.660344123840332, + "learning_rate": 8.825777146912704e-05, + "loss": 0.01779383420944214, + "step": 82760 + }, + { + "epoch": 11.748757984386089, + "grad_norm": 0.39260396361351013, + "learning_rate": 8.825635202271114e-05, + "loss": 0.013475912809371948, + "step": 82770 + }, + { + "epoch": 11.750177430801987, + "grad_norm": 9.97494125366211, + "learning_rate": 8.825493257629525e-05, + "loss": 0.028389009833335876, + "step": 82780 + }, + { + "epoch": 11.751596877217885, + "grad_norm": 0.7091060280799866, + "learning_rate": 8.825351312987935e-05, + "loss": 0.014123716950416565, + "step": 82790 + }, + { + "epoch": 11.753016323633783, + "grad_norm": 0.9680839776992798, + "learning_rate": 8.825209368346346e-05, + "loss": 0.05506055355072022, + "step": 82800 + }, + { + "epoch": 11.75443577004968, + "grad_norm": 0.2549281716346741, + "learning_rate": 8.825067423704756e-05, + "loss": 0.03636166155338287, + "step": 82810 + }, + { + "epoch": 11.755855216465578, + "grad_norm": 3.956082344055176, + "learning_rate": 8.824925479063166e-05, + "loss": 0.012689772248268127, + "step": 82820 + }, + { + "epoch": 11.757274662881477, + "grad_norm": 0.12777554988861084, + "learning_rate": 8.824783534421575e-05, + "loss": 0.014213849604129792, + "step": 82830 + }, + { + "epoch": 11.758694109297373, + "grad_norm": 10.15988826751709, + "learning_rate": 8.824641589779986e-05, + "loss": 0.027217572927474974, + "step": 82840 + }, + { + "epoch": 11.760113555713271, + "grad_norm": 4.8726582527160645, + "learning_rate": 8.824499645138396e-05, + "loss": 0.005710848420858383, + "step": 82850 + }, + { + "epoch": 11.76153300212917, + "grad_norm": 0.3747629225254059, + "learning_rate": 8.824357700496807e-05, + "loss": 0.031127306818962096, + "step": 82860 + }, + { + "epoch": 11.762952448545068, + "grad_norm": 12.532156944274902, + "learning_rate": 8.824215755855217e-05, + "loss": 0.03176976442337036, + "step": 82870 + }, + { + 
"epoch": 11.764371894960965, + "grad_norm": 1.8649965524673462, + "learning_rate": 8.824073811213627e-05, + "loss": 0.030580675601959227, + "step": 82880 + }, + { + "epoch": 11.765791341376863, + "grad_norm": 0.3033379316329956, + "learning_rate": 8.823931866572038e-05, + "loss": 0.013270074129104614, + "step": 82890 + }, + { + "epoch": 11.767210787792761, + "grad_norm": 0.012772593647241592, + "learning_rate": 8.823789921930448e-05, + "loss": 0.015141868591308593, + "step": 82900 + }, + { + "epoch": 11.768630234208658, + "grad_norm": 2.6985650062561035, + "learning_rate": 8.823647977288859e-05, + "loss": 0.005396616086363793, + "step": 82910 + }, + { + "epoch": 11.770049680624556, + "grad_norm": 1.3272356986999512, + "learning_rate": 8.823506032647268e-05, + "loss": 0.010387630760669708, + "step": 82920 + }, + { + "epoch": 11.771469127040454, + "grad_norm": 5.779232501983643, + "learning_rate": 8.823364088005678e-05, + "loss": 0.030734294652938844, + "step": 82930 + }, + { + "epoch": 11.772888573456353, + "grad_norm": 1.0797303915023804, + "learning_rate": 8.823222143364088e-05, + "loss": 0.03054520785808563, + "step": 82940 + }, + { + "epoch": 11.77430801987225, + "grad_norm": 3.3726065158843994, + "learning_rate": 8.823080198722499e-05, + "loss": 0.0510441780090332, + "step": 82950 + }, + { + "epoch": 11.775727466288147, + "grad_norm": 4.178657054901123, + "learning_rate": 8.822938254080909e-05, + "loss": 0.018844333291053773, + "step": 82960 + }, + { + "epoch": 11.777146912704046, + "grad_norm": 3.163480281829834, + "learning_rate": 8.82279630943932e-05, + "loss": 0.030176666378974915, + "step": 82970 + }, + { + "epoch": 11.778566359119942, + "grad_norm": 0.13545754551887512, + "learning_rate": 8.82265436479773e-05, + "loss": 0.06503672003746033, + "step": 82980 + }, + { + "epoch": 11.77998580553584, + "grad_norm": 0.09676958620548248, + "learning_rate": 8.822512420156139e-05, + "loss": 0.04054889976978302, + "step": 82990 + }, + { + "epoch": 
11.781405251951739, + "grad_norm": 0.08766447007656097, + "learning_rate": 8.82237047551455e-05, + "loss": 0.05703636407852173, + "step": 83000 + }, + { + "epoch": 11.781405251951739, + "eval_accuracy": 0.9832771666560692, + "eval_loss": 0.055854376405477524, + "eval_runtime": 33.2418, + "eval_samples_per_second": 473.11, + "eval_steps_per_second": 14.801, + "step": 83000 + }, + { + "epoch": 11.782824698367637, + "grad_norm": 4.778905868530273, + "learning_rate": 8.82222853087296e-05, + "loss": 0.016084206104278565, + "step": 83010 + }, + { + "epoch": 11.784244144783534, + "grad_norm": 1.264276146888733, + "learning_rate": 8.822086586231371e-05, + "loss": 0.024041858315467835, + "step": 83020 + }, + { + "epoch": 11.785663591199432, + "grad_norm": 8.326826095581055, + "learning_rate": 8.82194464158978e-05, + "loss": 0.019390736520290375, + "step": 83030 + }, + { + "epoch": 11.78708303761533, + "grad_norm": 0.09791865199804306, + "learning_rate": 8.82180269694819e-05, + "loss": 0.04285701811313629, + "step": 83040 + }, + { + "epoch": 11.788502484031227, + "grad_norm": 0.4148581027984619, + "learning_rate": 8.8216607523066e-05, + "loss": 0.01719527244567871, + "step": 83050 + }, + { + "epoch": 11.789921930447125, + "grad_norm": 2.2213292121887207, + "learning_rate": 8.821518807665011e-05, + "loss": 0.03687406778335571, + "step": 83060 + }, + { + "epoch": 11.791341376863024, + "grad_norm": 0.1824258267879486, + "learning_rate": 8.821376863023421e-05, + "loss": 0.027744564414024352, + "step": 83070 + }, + { + "epoch": 11.792760823278922, + "grad_norm": 0.2214859426021576, + "learning_rate": 8.821234918381831e-05, + "loss": 0.02254961133003235, + "step": 83080 + }, + { + "epoch": 11.794180269694818, + "grad_norm": 0.35346728563308716, + "learning_rate": 8.821092973740242e-05, + "loss": 0.011183008551597595, + "step": 83090 + }, + { + "epoch": 11.795599716110717, + "grad_norm": 5.680757522583008, + "learning_rate": 8.820951029098652e-05, + "loss": 0.03402817249298096, + 
"step": 83100 + }, + { + "epoch": 11.797019162526615, + "grad_norm": 0.602576494216919, + "learning_rate": 8.820809084457063e-05, + "loss": 0.03264551162719727, + "step": 83110 + }, + { + "epoch": 11.798438608942512, + "grad_norm": 2.3544020652770996, + "learning_rate": 8.820667139815473e-05, + "loss": 0.025781518220901488, + "step": 83120 + }, + { + "epoch": 11.79985805535841, + "grad_norm": 0.12306859344244003, + "learning_rate": 8.820525195173882e-05, + "loss": 0.028462356328964232, + "step": 83130 + }, + { + "epoch": 11.801277501774308, + "grad_norm": 0.7537848949432373, + "learning_rate": 8.820383250532292e-05, + "loss": 0.03587826788425445, + "step": 83140 + }, + { + "epoch": 11.802696948190206, + "grad_norm": 0.05533721670508385, + "learning_rate": 8.820241305890703e-05, + "loss": 0.028028786182403564, + "step": 83150 + }, + { + "epoch": 11.804116394606103, + "grad_norm": 0.06418702751398087, + "learning_rate": 8.820099361249113e-05, + "loss": 0.01934442073106766, + "step": 83160 + }, + { + "epoch": 11.805535841022001, + "grad_norm": 2.590029239654541, + "learning_rate": 8.819957416607524e-05, + "loss": 0.06280165910720825, + "step": 83170 + }, + { + "epoch": 11.8069552874379, + "grad_norm": 2.505331039428711, + "learning_rate": 8.819815471965934e-05, + "loss": 0.021633738279342653, + "step": 83180 + }, + { + "epoch": 11.808374733853796, + "grad_norm": 0.18088681995868683, + "learning_rate": 8.819673527324343e-05, + "loss": 0.0337184876203537, + "step": 83190 + }, + { + "epoch": 11.809794180269694, + "grad_norm": 0.8823683857917786, + "learning_rate": 8.819531582682755e-05, + "loss": 0.041201424598693845, + "step": 83200 + }, + { + "epoch": 11.811213626685593, + "grad_norm": 2.2337379455566406, + "learning_rate": 8.819389638041164e-05, + "loss": 0.05025644898414612, + "step": 83210 + }, + { + "epoch": 11.812633073101491, + "grad_norm": 1.0469427108764648, + "learning_rate": 8.819247693399575e-05, + "loss": 0.03725016713142395, + "step": 83220 + }, + { + 
"epoch": 11.814052519517388, + "grad_norm": 0.10811296850442886, + "learning_rate": 8.819105748757985e-05, + "loss": 0.020893281698226927, + "step": 83230 + }, + { + "epoch": 11.815471965933286, + "grad_norm": 1.9570878744125366, + "learning_rate": 8.818963804116395e-05, + "loss": 0.04508139491081238, + "step": 83240 + }, + { + "epoch": 11.816891412349184, + "grad_norm": 10.603166580200195, + "learning_rate": 8.818821859474805e-05, + "loss": 0.02110636681318283, + "step": 83250 + }, + { + "epoch": 11.81831085876508, + "grad_norm": 0.4695393443107605, + "learning_rate": 8.818679914833216e-05, + "loss": 0.0809195578098297, + "step": 83260 + }, + { + "epoch": 11.819730305180979, + "grad_norm": 1.1666871309280396, + "learning_rate": 8.818537970191625e-05, + "loss": 0.10807353258132935, + "step": 83270 + }, + { + "epoch": 11.821149751596877, + "grad_norm": 1.5496456623077393, + "learning_rate": 8.818396025550037e-05, + "loss": 0.006642992049455643, + "step": 83280 + }, + { + "epoch": 11.822569198012776, + "grad_norm": 1.6191848516464233, + "learning_rate": 8.818254080908446e-05, + "loss": 0.016630643606185914, + "step": 83290 + }, + { + "epoch": 11.823988644428672, + "grad_norm": 0.05443901568651199, + "learning_rate": 8.818112136266856e-05, + "loss": 0.026843801140785217, + "step": 83300 + }, + { + "epoch": 11.82540809084457, + "grad_norm": 0.26701775193214417, + "learning_rate": 8.817970191625267e-05, + "loss": 0.009992837160825729, + "step": 83310 + }, + { + "epoch": 11.826827537260469, + "grad_norm": 4.363466739654541, + "learning_rate": 8.817828246983677e-05, + "loss": 0.029452064633369447, + "step": 83320 + }, + { + "epoch": 11.828246983676365, + "grad_norm": 0.06600237637758255, + "learning_rate": 8.817686302342088e-05, + "loss": 0.045746609568595886, + "step": 83330 + }, + { + "epoch": 11.829666430092264, + "grad_norm": 4.926809787750244, + "learning_rate": 8.817544357700496e-05, + "loss": 0.016673028469085693, + "step": 83340 + }, + { + "epoch": 
11.831085876508162, + "grad_norm": 1.262709140777588, + "learning_rate": 8.817402413058907e-05, + "loss": 0.026791003346443177, + "step": 83350 + }, + { + "epoch": 11.83250532292406, + "grad_norm": 0.11825452744960785, + "learning_rate": 8.817260468417317e-05, + "loss": 0.038587138056755066, + "step": 83360 + }, + { + "epoch": 11.833924769339957, + "grad_norm": 0.49939799308776855, + "learning_rate": 8.817118523775728e-05, + "loss": 0.03778964877128601, + "step": 83370 + }, + { + "epoch": 11.835344215755855, + "grad_norm": 1.799819827079773, + "learning_rate": 8.816976579134138e-05, + "loss": 0.034172806143760684, + "step": 83380 + }, + { + "epoch": 11.836763662171753, + "grad_norm": 5.9821953773498535, + "learning_rate": 8.816834634492548e-05, + "loss": 0.03201870620250702, + "step": 83390 + }, + { + "epoch": 11.83818310858765, + "grad_norm": 8.615924835205078, + "learning_rate": 8.816692689850959e-05, + "loss": 0.049471884965896606, + "step": 83400 + }, + { + "epoch": 11.839602555003548, + "grad_norm": 0.6330484747886658, + "learning_rate": 8.816550745209369e-05, + "loss": 0.03654695153236389, + "step": 83410 + }, + { + "epoch": 11.841022001419446, + "grad_norm": 0.2800433337688446, + "learning_rate": 8.81640880056778e-05, + "loss": 0.012365755438804627, + "step": 83420 + }, + { + "epoch": 11.842441447835345, + "grad_norm": 4.388822078704834, + "learning_rate": 8.81626685592619e-05, + "loss": 0.03003677725791931, + "step": 83430 + }, + { + "epoch": 11.843860894251241, + "grad_norm": 9.12732219696045, + "learning_rate": 8.816124911284599e-05, + "loss": 0.04735245406627655, + "step": 83440 + }, + { + "epoch": 11.84528034066714, + "grad_norm": 3.422520875930786, + "learning_rate": 8.815982966643009e-05, + "loss": 0.08324976563453675, + "step": 83450 + }, + { + "epoch": 11.846699787083038, + "grad_norm": 6.075384140014648, + "learning_rate": 8.81584102200142e-05, + "loss": 0.024019157886505126, + "step": 83460 + }, + { + "epoch": 11.848119233498934, + "grad_norm": 
0.9444994330406189, + "learning_rate": 8.81569907735983e-05, + "loss": 0.030841320753097534, + "step": 83470 + }, + { + "epoch": 11.849538679914833, + "grad_norm": 7.623394012451172, + "learning_rate": 8.815557132718241e-05, + "loss": 0.05833870768547058, + "step": 83480 + }, + { + "epoch": 11.850958126330731, + "grad_norm": 0.26176756620407104, + "learning_rate": 8.81541518807665e-05, + "loss": 0.004750019684433937, + "step": 83490 + }, + { + "epoch": 11.85237757274663, + "grad_norm": 0.06734207272529602, + "learning_rate": 8.81527324343506e-05, + "loss": 0.06027681231498718, + "step": 83500 + }, + { + "epoch": 11.85237757274663, + "eval_accuracy": 0.9829592420677815, + "eval_loss": 0.060624875128269196, + "eval_runtime": 33.763, + "eval_samples_per_second": 465.806, + "eval_steps_per_second": 14.572, + "step": 83500 + }, + { + "epoch": 11.853797019162526, + "grad_norm": 2.9946765899658203, + "learning_rate": 8.815131298793471e-05, + "loss": 0.03661317229270935, + "step": 83510 + }, + { + "epoch": 11.855216465578424, + "grad_norm": 7.433979511260986, + "learning_rate": 8.814989354151881e-05, + "loss": 0.04397961497306824, + "step": 83520 + }, + { + "epoch": 11.856635911994323, + "grad_norm": 2.642326831817627, + "learning_rate": 8.814847409510292e-05, + "loss": 0.027840161323547365, + "step": 83530 + }, + { + "epoch": 11.858055358410219, + "grad_norm": 2.612328052520752, + "learning_rate": 8.8147054648687e-05, + "loss": 0.08996272087097168, + "step": 83540 + }, + { + "epoch": 11.859474804826117, + "grad_norm": 10.099543571472168, + "learning_rate": 8.814563520227112e-05, + "loss": 0.04724225699901581, + "step": 83550 + }, + { + "epoch": 11.860894251242016, + "grad_norm": 2.065953493118286, + "learning_rate": 8.814421575585521e-05, + "loss": 0.023844602704048156, + "step": 83560 + }, + { + "epoch": 11.862313697657914, + "grad_norm": 3.0245158672332764, + "learning_rate": 8.814279630943932e-05, + "loss": 0.07478670477867126, + "step": 83570 + }, + { + "epoch": 
11.86373314407381, + "grad_norm": 8.01756477355957, + "learning_rate": 8.814137686302342e-05, + "loss": 0.04426112174987793, + "step": 83580 + }, + { + "epoch": 11.865152590489709, + "grad_norm": 0.4189710319042206, + "learning_rate": 8.813995741660753e-05, + "loss": 0.04640637934207916, + "step": 83590 + }, + { + "epoch": 11.866572036905607, + "grad_norm": 9.745952606201172, + "learning_rate": 8.813853797019163e-05, + "loss": 0.049314913153648374, + "step": 83600 + }, + { + "epoch": 11.867991483321505, + "grad_norm": 1.1365087032318115, + "learning_rate": 8.813711852377573e-05, + "loss": 0.018093612790107728, + "step": 83610 + }, + { + "epoch": 11.869410929737402, + "grad_norm": 0.5661416053771973, + "learning_rate": 8.813569907735984e-05, + "loss": 0.052584463357925416, + "step": 83620 + }, + { + "epoch": 11.8708303761533, + "grad_norm": 7.085361957550049, + "learning_rate": 8.813427963094394e-05, + "loss": 0.03304852247238159, + "step": 83630 + }, + { + "epoch": 11.872249822569199, + "grad_norm": 1.1107380390167236, + "learning_rate": 8.813286018452805e-05, + "loss": 0.011267714947462083, + "step": 83640 + }, + { + "epoch": 11.873669268985095, + "grad_norm": 0.16078133881092072, + "learning_rate": 8.813144073811213e-05, + "loss": 0.012863248586654663, + "step": 83650 + }, + { + "epoch": 11.875088715400993, + "grad_norm": 10.058072090148926, + "learning_rate": 8.813002129169624e-05, + "loss": 0.03249671459197998, + "step": 83660 + }, + { + "epoch": 11.876508161816892, + "grad_norm": 5.011179447174072, + "learning_rate": 8.812860184528034e-05, + "loss": 0.038761311769485475, + "step": 83670 + }, + { + "epoch": 11.87792760823279, + "grad_norm": 2.0408272743225098, + "learning_rate": 8.812718239886445e-05, + "loss": 0.022072888910770416, + "step": 83680 + }, + { + "epoch": 11.879347054648687, + "grad_norm": 8.341216087341309, + "learning_rate": 8.812576295244855e-05, + "loss": 0.0365541934967041, + "step": 83690 + }, + { + "epoch": 11.880766501064585, + "grad_norm": 
4.903764247894287, + "learning_rate": 8.812434350603264e-05, + "loss": 0.02184043824672699, + "step": 83700 + }, + { + "epoch": 11.882185947480483, + "grad_norm": 1.901656985282898, + "learning_rate": 8.812292405961676e-05, + "loss": 0.04549020528793335, + "step": 83710 + }, + { + "epoch": 11.88360539389638, + "grad_norm": 0.10097061842679977, + "learning_rate": 8.812150461320085e-05, + "loss": 0.027865698933601378, + "step": 83720 + }, + { + "epoch": 11.885024840312278, + "grad_norm": 1.929040551185608, + "learning_rate": 8.812008516678496e-05, + "loss": 0.005012607201933861, + "step": 83730 + }, + { + "epoch": 11.886444286728176, + "grad_norm": 3.4623374938964844, + "learning_rate": 8.811866572036906e-05, + "loss": 0.010321633517742157, + "step": 83740 + }, + { + "epoch": 11.887863733144075, + "grad_norm": 1.9667080640792847, + "learning_rate": 8.811724627395316e-05, + "loss": 0.02214210033416748, + "step": 83750 + }, + { + "epoch": 11.889283179559971, + "grad_norm": 0.031162558123469353, + "learning_rate": 8.811582682753726e-05, + "loss": 0.03217504918575287, + "step": 83760 + }, + { + "epoch": 11.89070262597587, + "grad_norm": 0.05589601397514343, + "learning_rate": 8.811440738112137e-05, + "loss": 0.0314392626285553, + "step": 83770 + }, + { + "epoch": 11.892122072391768, + "grad_norm": 8.637645721435547, + "learning_rate": 8.811298793470546e-05, + "loss": 0.05045939683914184, + "step": 83780 + }, + { + "epoch": 11.893541518807664, + "grad_norm": 0.4940531253814697, + "learning_rate": 8.811156848828958e-05, + "loss": 0.026433098316192626, + "step": 83790 + }, + { + "epoch": 11.894960965223563, + "grad_norm": 12.007257461547852, + "learning_rate": 8.811014904187367e-05, + "loss": 0.055042076110839847, + "step": 83800 + }, + { + "epoch": 11.896380411639461, + "grad_norm": 0.023529594764113426, + "learning_rate": 8.810872959545777e-05, + "loss": 0.09657434821128845, + "step": 83810 + }, + { + "epoch": 11.89779985805536, + "grad_norm": 2.9010982513427734, + 
"learning_rate": 8.810731014904188e-05, + "loss": 0.012377361208200455, + "step": 83820 + }, + { + "epoch": 11.899219304471256, + "grad_norm": 3.5865633487701416, + "learning_rate": 8.810589070262598e-05, + "loss": 0.01887798309326172, + "step": 83830 + }, + { + "epoch": 11.900638750887154, + "grad_norm": 0.9775835871696472, + "learning_rate": 8.810447125621009e-05, + "loss": 0.01637519896030426, + "step": 83840 + }, + { + "epoch": 11.902058197303052, + "grad_norm": 0.6207100749015808, + "learning_rate": 8.810305180979417e-05, + "loss": 0.029577887058258055, + "step": 83850 + }, + { + "epoch": 11.903477643718949, + "grad_norm": 1.9363144636154175, + "learning_rate": 8.810163236337828e-05, + "loss": 0.024132755398750306, + "step": 83860 + }, + { + "epoch": 11.904897090134847, + "grad_norm": 0.3028423488140106, + "learning_rate": 8.810021291696238e-05, + "loss": 0.026431560516357422, + "step": 83870 + }, + { + "epoch": 11.906316536550746, + "grad_norm": 6.9190192222595215, + "learning_rate": 8.809879347054649e-05, + "loss": 0.04169048666954041, + "step": 83880 + }, + { + "epoch": 11.907735982966644, + "grad_norm": 1.2599278688430786, + "learning_rate": 8.80973740241306e-05, + "loss": 0.01872767060995102, + "step": 83890 + }, + { + "epoch": 11.90915542938254, + "grad_norm": 0.038407523185014725, + "learning_rate": 8.809595457771469e-05, + "loss": 0.005365389212965965, + "step": 83900 + }, + { + "epoch": 11.910574875798439, + "grad_norm": 0.1457999050617218, + "learning_rate": 8.80945351312988e-05, + "loss": 0.009712480008602142, + "step": 83910 + }, + { + "epoch": 11.911994322214337, + "grad_norm": 0.043060123920440674, + "learning_rate": 8.80931156848829e-05, + "loss": 0.07107031345367432, + "step": 83920 + }, + { + "epoch": 11.913413768630233, + "grad_norm": 0.5718839764595032, + "learning_rate": 8.8091696238467e-05, + "loss": 0.03540462255477905, + "step": 83930 + }, + { + "epoch": 11.914833215046132, + "grad_norm": 0.2524852752685547, + "learning_rate": 
8.80902767920511e-05, + "loss": 0.04274202883243561, + "step": 83940 + }, + { + "epoch": 11.91625266146203, + "grad_norm": 1.0457261800765991, + "learning_rate": 8.808885734563521e-05, + "loss": 0.0634374737739563, + "step": 83950 + }, + { + "epoch": 11.917672107877928, + "grad_norm": 14.034346580505371, + "learning_rate": 8.80874378992193e-05, + "loss": 0.06747335195541382, + "step": 83960 + }, + { + "epoch": 11.919091554293825, + "grad_norm": 1.1360832452774048, + "learning_rate": 8.808601845280341e-05, + "loss": 0.026286211609840394, + "step": 83970 + }, + { + "epoch": 11.920511000709723, + "grad_norm": 7.193599224090576, + "learning_rate": 8.808459900638752e-05, + "loss": 0.05016224980354309, + "step": 83980 + }, + { + "epoch": 11.921930447125622, + "grad_norm": 3.6520514488220215, + "learning_rate": 8.808317955997162e-05, + "loss": 0.017744763195514678, + "step": 83990 + }, + { + "epoch": 11.923349893541518, + "grad_norm": 1.2742975950241089, + "learning_rate": 8.808176011355573e-05, + "loss": 0.01798464059829712, + "step": 84000 + }, + { + "epoch": 11.923349893541518, + "eval_accuracy": 0.9836586761620144, + "eval_loss": 0.05493206903338432, + "eval_runtime": 34.3683, + "eval_samples_per_second": 457.601, + "eval_steps_per_second": 14.316, + "step": 84000 + }, + { + "epoch": 11.924769339957416, + "grad_norm": 4.129441261291504, + "learning_rate": 8.808034066713981e-05, + "loss": 0.014805413782596588, + "step": 84010 + }, + { + "epoch": 11.926188786373315, + "grad_norm": 0.5156400203704834, + "learning_rate": 8.807892122072392e-05, + "loss": 0.014667493104934693, + "step": 84020 + }, + { + "epoch": 11.927608232789213, + "grad_norm": 0.2281806617975235, + "learning_rate": 8.807750177430802e-05, + "loss": 0.037797823548316956, + "step": 84030 + }, + { + "epoch": 11.92902767920511, + "grad_norm": 9.229474067687988, + "learning_rate": 8.807608232789213e-05, + "loss": 0.031675410270690915, + "step": 84040 + }, + { + "epoch": 11.930447125621008, + "grad_norm": 
0.6090600490570068, + "learning_rate": 8.807466288147623e-05, + "loss": 0.028332659602165224, + "step": 84050 + }, + { + "epoch": 11.931866572036906, + "grad_norm": 2.7820520401000977, + "learning_rate": 8.807324343506033e-05, + "loss": 0.042403769493103025, + "step": 84060 + }, + { + "epoch": 11.933286018452803, + "grad_norm": 8.19253158569336, + "learning_rate": 8.807182398864444e-05, + "loss": 0.06769433617591858, + "step": 84070 + }, + { + "epoch": 11.934705464868701, + "grad_norm": 0.1695857048034668, + "learning_rate": 8.807040454222853e-05, + "loss": 0.03822060525417328, + "step": 84080 + }, + { + "epoch": 11.9361249112846, + "grad_norm": 0.04587777331471443, + "learning_rate": 8.806898509581265e-05, + "loss": 0.03620380461215973, + "step": 84090 + }, + { + "epoch": 11.937544357700498, + "grad_norm": 0.04895970597863197, + "learning_rate": 8.806756564939674e-05, + "loss": 0.0069283261895179745, + "step": 84100 + }, + { + "epoch": 11.938963804116394, + "grad_norm": 0.5421682000160217, + "learning_rate": 8.806614620298084e-05, + "loss": 0.014993944764137268, + "step": 84110 + }, + { + "epoch": 11.940383250532292, + "grad_norm": 9.860722541809082, + "learning_rate": 8.806472675656494e-05, + "loss": 0.035263413190841676, + "step": 84120 + }, + { + "epoch": 11.94180269694819, + "grad_norm": 0.005013750400394201, + "learning_rate": 8.806330731014905e-05, + "loss": 0.018354399502277373, + "step": 84130 + }, + { + "epoch": 11.943222143364087, + "grad_norm": 3.5134024620056152, + "learning_rate": 8.806188786373315e-05, + "loss": 0.05673021674156189, + "step": 84140 + }, + { + "epoch": 11.944641589779986, + "grad_norm": 0.3052522838115692, + "learning_rate": 8.806046841731726e-05, + "loss": 0.016181229054927825, + "step": 84150 + }, + { + "epoch": 11.946061036195884, + "grad_norm": 1.608987808227539, + "learning_rate": 8.805904897090135e-05, + "loss": 0.009563577175140382, + "step": 84160 + }, + { + "epoch": 11.947480482611782, + "grad_norm": 1.9012960195541382, + 
"learning_rate": 8.805762952448545e-05, + "loss": 0.037493014335632326, + "step": 84170 + }, + { + "epoch": 11.948899929027679, + "grad_norm": 1.537778377532959, + "learning_rate": 8.805621007806956e-05, + "loss": 0.03181539177894592, + "step": 84180 + }, + { + "epoch": 11.950319375443577, + "grad_norm": 4.423129081726074, + "learning_rate": 8.805479063165366e-05, + "loss": 0.02197156846523285, + "step": 84190 + }, + { + "epoch": 11.951738821859475, + "grad_norm": 1.713564395904541, + "learning_rate": 8.805337118523777e-05, + "loss": 0.021484464406967163, + "step": 84200 + }, + { + "epoch": 11.953158268275372, + "grad_norm": 8.237667083740234, + "learning_rate": 8.805195173882185e-05, + "loss": 0.044994819164276126, + "step": 84210 + }, + { + "epoch": 11.95457771469127, + "grad_norm": 0.12371502816677094, + "learning_rate": 8.805053229240597e-05, + "loss": 0.0389464259147644, + "step": 84220 + }, + { + "epoch": 11.955997161107168, + "grad_norm": 2.0792911052703857, + "learning_rate": 8.804911284599006e-05, + "loss": 0.02157101035118103, + "step": 84230 + }, + { + "epoch": 11.957416607523067, + "grad_norm": 6.914712905883789, + "learning_rate": 8.804769339957417e-05, + "loss": 0.027994582056999208, + "step": 84240 + }, + { + "epoch": 11.958836053938963, + "grad_norm": 10.593100547790527, + "learning_rate": 8.804627395315827e-05, + "loss": 0.017365607619285583, + "step": 84250 + }, + { + "epoch": 11.960255500354862, + "grad_norm": 0.14070631563663483, + "learning_rate": 8.804485450674238e-05, + "loss": 0.011767487227916717, + "step": 84260 + }, + { + "epoch": 11.96167494677076, + "grad_norm": 0.10577062517404556, + "learning_rate": 8.804343506032648e-05, + "loss": 0.06398826837539673, + "step": 84270 + }, + { + "epoch": 11.963094393186656, + "grad_norm": 0.2251124233007431, + "learning_rate": 8.804201561391058e-05, + "loss": 0.02591215670108795, + "step": 84280 + }, + { + "epoch": 11.964513839602555, + "grad_norm": 0.45450490713119507, + "learning_rate": 
8.804059616749469e-05, + "loss": 0.024134044349193574, + "step": 84290 + }, + { + "epoch": 11.965933286018453, + "grad_norm": 10.233354568481445, + "learning_rate": 8.803917672107879e-05, + "loss": 0.035779574513435365, + "step": 84300 + }, + { + "epoch": 11.967352732434351, + "grad_norm": 10.191351890563965, + "learning_rate": 8.80377572746629e-05, + "loss": 0.02158864587545395, + "step": 84310 + }, + { + "epoch": 11.968772178850248, + "grad_norm": 0.2624339163303375, + "learning_rate": 8.803633782824698e-05, + "loss": 0.03314901888370514, + "step": 84320 + }, + { + "epoch": 11.970191625266146, + "grad_norm": 0.15902841091156006, + "learning_rate": 8.803491838183109e-05, + "loss": 0.013961750268936157, + "step": 84330 + }, + { + "epoch": 11.971611071682045, + "grad_norm": 7.513896942138672, + "learning_rate": 8.803349893541519e-05, + "loss": 0.054462003707885745, + "step": 84340 + }, + { + "epoch": 11.973030518097941, + "grad_norm": 0.7898911237716675, + "learning_rate": 8.80320794889993e-05, + "loss": 0.01811348497867584, + "step": 84350 + }, + { + "epoch": 11.97444996451384, + "grad_norm": 1.5465627908706665, + "learning_rate": 8.80306600425834e-05, + "loss": 0.011357621103525162, + "step": 84360 + }, + { + "epoch": 11.975869410929738, + "grad_norm": 6.9257683753967285, + "learning_rate": 8.80292405961675e-05, + "loss": 0.019734883308410646, + "step": 84370 + }, + { + "epoch": 11.977288857345636, + "grad_norm": 5.433879375457764, + "learning_rate": 8.80278211497516e-05, + "loss": 0.030297473073005676, + "step": 84380 + }, + { + "epoch": 11.978708303761533, + "grad_norm": 0.7244393825531006, + "learning_rate": 8.80264017033357e-05, + "loss": 0.038099372386932374, + "step": 84390 + }, + { + "epoch": 11.98012775017743, + "grad_norm": 1.208174467086792, + "learning_rate": 8.802498225691981e-05, + "loss": 0.011677633225917815, + "step": 84400 + }, + { + "epoch": 11.98154719659333, + "grad_norm": 1.5116193294525146, + "learning_rate": 8.802356281050391e-05, + "loss": 
0.033107328414916995, + "step": 84410 + }, + { + "epoch": 11.982966643009226, + "grad_norm": 0.7113156914710999, + "learning_rate": 8.802214336408801e-05, + "loss": 0.04576504826545715, + "step": 84420 + }, + { + "epoch": 11.984386089425124, + "grad_norm": 0.0780448466539383, + "learning_rate": 8.80207239176721e-05, + "loss": 0.01310802698135376, + "step": 84430 + }, + { + "epoch": 11.985805535841022, + "grad_norm": 9.950695991516113, + "learning_rate": 8.801930447125622e-05, + "loss": 0.04769536256790161, + "step": 84440 + }, + { + "epoch": 11.98722498225692, + "grad_norm": 1.4136978387832642, + "learning_rate": 8.801788502484031e-05, + "loss": 0.017603138089179994, + "step": 84450 + }, + { + "epoch": 11.988644428672817, + "grad_norm": 1.4347903728485107, + "learning_rate": 8.801646557842442e-05, + "loss": 0.054668909311294554, + "step": 84460 + }, + { + "epoch": 11.990063875088715, + "grad_norm": 0.16239799559116364, + "learning_rate": 8.801504613200852e-05, + "loss": 0.009764498472213745, + "step": 84470 + }, + { + "epoch": 11.991483321504614, + "grad_norm": 0.02905164659023285, + "learning_rate": 8.801362668559262e-05, + "loss": 0.025337016582489012, + "step": 84480 + }, + { + "epoch": 11.99290276792051, + "grad_norm": 0.4277700185775757, + "learning_rate": 8.801220723917673e-05, + "loss": 0.023854957520961763, + "step": 84490 + }, + { + "epoch": 11.994322214336409, + "grad_norm": 2.598402738571167, + "learning_rate": 8.801078779276083e-05, + "loss": 0.01903749108314514, + "step": 84500 + }, + { + "epoch": 11.994322214336409, + "eval_accuracy": 0.9682075411712342, + "eval_loss": 0.12699328362941742, + "eval_runtime": 32.4308, + "eval_samples_per_second": 484.941, + "eval_steps_per_second": 15.171, + "step": 84500 + }, + { + "epoch": 11.995741660752307, + "grad_norm": 0.46173426508903503, + "learning_rate": 8.800936834634494e-05, + "loss": 0.052141273021698, + "step": 84510 + }, + { + "epoch": 11.997161107168205, + "grad_norm": 0.17388634383678436, + 
"learning_rate": 8.800794889992902e-05, + "loss": 0.050818198919296266, + "step": 84520 + }, + { + "epoch": 11.998580553584102, + "grad_norm": 4.341207504272461, + "learning_rate": 8.800652945351313e-05, + "loss": 0.03561738431453705, + "step": 84530 + }, + { + "epoch": 12.0, + "grad_norm": 0.30166885256767273, + "learning_rate": 8.800511000709723e-05, + "loss": 0.02941356897354126, + "step": 84540 + }, + { + "epoch": 12.001419446415898, + "grad_norm": 3.268622875213623, + "learning_rate": 8.800369056068134e-05, + "loss": 0.017552968859672547, + "step": 84550 + }, + { + "epoch": 12.002838892831795, + "grad_norm": 2.5731606483459473, + "learning_rate": 8.800227111426544e-05, + "loss": 0.009592925757169723, + "step": 84560 + }, + { + "epoch": 12.004258339247693, + "grad_norm": 0.22703532874584198, + "learning_rate": 8.800085166784954e-05, + "loss": 0.016140460968017578, + "step": 84570 + }, + { + "epoch": 12.005677785663591, + "grad_norm": 1.0973620414733887, + "learning_rate": 8.799943222143365e-05, + "loss": 0.02522848844528198, + "step": 84580 + }, + { + "epoch": 12.00709723207949, + "grad_norm": 0.019226137548685074, + "learning_rate": 8.799801277501774e-05, + "loss": 0.01752144694328308, + "step": 84590 + }, + { + "epoch": 12.008516678495386, + "grad_norm": 5.661785125732422, + "learning_rate": 8.799659332860186e-05, + "loss": 0.10360106229782104, + "step": 84600 + }, + { + "epoch": 12.009936124911285, + "grad_norm": 5.24439811706543, + "learning_rate": 8.799517388218595e-05, + "loss": 0.038429513573646545, + "step": 84610 + }, + { + "epoch": 12.011355571327183, + "grad_norm": 15.420430183410645, + "learning_rate": 8.799375443577006e-05, + "loss": 0.04234688878059387, + "step": 84620 + }, + { + "epoch": 12.01277501774308, + "grad_norm": 0.12376894801855087, + "learning_rate": 8.799233498935415e-05, + "loss": 0.05130982995033264, + "step": 84630 + }, + { + "epoch": 12.014194464158978, + "grad_norm": 0.19059942662715912, + "learning_rate": 8.799091554293826e-05, + 
"loss": 0.004893422871828079, + "step": 84640 + }, + { + "epoch": 12.015613910574876, + "grad_norm": 7.90791130065918, + "learning_rate": 8.798949609652236e-05, + "loss": 0.020802582800388335, + "step": 84650 + }, + { + "epoch": 12.017033356990774, + "grad_norm": 0.09958259761333466, + "learning_rate": 8.798807665010647e-05, + "loss": 0.008918841928243637, + "step": 84660 + }, + { + "epoch": 12.01845280340667, + "grad_norm": 5.014199733734131, + "learning_rate": 8.798665720369056e-05, + "loss": 0.04075751900672912, + "step": 84670 + }, + { + "epoch": 12.01987224982257, + "grad_norm": 1.203556776046753, + "learning_rate": 8.798523775727466e-05, + "loss": 0.02592116892337799, + "step": 84680 + }, + { + "epoch": 12.021291696238467, + "grad_norm": 5.959599018096924, + "learning_rate": 8.798381831085877e-05, + "loss": 0.017769895493984222, + "step": 84690 + }, + { + "epoch": 12.022711142654364, + "grad_norm": 7.269739151000977, + "learning_rate": 8.798239886444287e-05, + "loss": 0.04146575927734375, + "step": 84700 + }, + { + "epoch": 12.024130589070262, + "grad_norm": 3.2094407081604004, + "learning_rate": 8.798097941802698e-05, + "loss": 0.013438931107521057, + "step": 84710 + }, + { + "epoch": 12.02555003548616, + "grad_norm": 4.8658623695373535, + "learning_rate": 8.797955997161108e-05, + "loss": 0.02216584384441376, + "step": 84720 + }, + { + "epoch": 12.026969481902059, + "grad_norm": 2.480189561843872, + "learning_rate": 8.797814052519518e-05, + "loss": 0.03759796917438507, + "step": 84730 + }, + { + "epoch": 12.028388928317955, + "grad_norm": 9.961448669433594, + "learning_rate": 8.797672107877927e-05, + "loss": 0.022990162670612335, + "step": 84740 + }, + { + "epoch": 12.029808374733854, + "grad_norm": 0.17480824887752533, + "learning_rate": 8.797530163236338e-05, + "loss": 0.011842547357082367, + "step": 84750 + }, + { + "epoch": 12.031227821149752, + "grad_norm": 0.02928129956126213, + "learning_rate": 8.797388218594748e-05, + "loss": 0.042740797996520995, + 
"step": 84760 + }, + { + "epoch": 12.032647267565649, + "grad_norm": 1.0462205410003662, + "learning_rate": 8.797246273953159e-05, + "loss": 0.009797403216362, + "step": 84770 + }, + { + "epoch": 12.034066713981547, + "grad_norm": 1.6143826246261597, + "learning_rate": 8.797104329311569e-05, + "loss": 0.031755182147026065, + "step": 84780 + }, + { + "epoch": 12.035486160397445, + "grad_norm": 10.038415908813477, + "learning_rate": 8.796962384669979e-05, + "loss": 0.0391110509634018, + "step": 84790 + }, + { + "epoch": 12.036905606813344, + "grad_norm": 0.18434542417526245, + "learning_rate": 8.79682044002839e-05, + "loss": 0.0238394096493721, + "step": 84800 + }, + { + "epoch": 12.03832505322924, + "grad_norm": 0.39857715368270874, + "learning_rate": 8.7966784953868e-05, + "loss": 0.07768634557724, + "step": 84810 + }, + { + "epoch": 12.039744499645138, + "grad_norm": 0.1263289898633957, + "learning_rate": 8.796550745209368e-05, + "loss": 0.029108119010925294, + "step": 84820 + }, + { + "epoch": 12.041163946061037, + "grad_norm": 0.02896190620958805, + "learning_rate": 8.796408800567779e-05, + "loss": 0.04185638725757599, + "step": 84830 + }, + { + "epoch": 12.042583392476933, + "grad_norm": 1.1194353103637695, + "learning_rate": 8.79626685592619e-05, + "loss": 0.013438387215137482, + "step": 84840 + }, + { + "epoch": 12.044002838892832, + "grad_norm": 0.6616882085800171, + "learning_rate": 8.796124911284599e-05, + "loss": 0.02819703221321106, + "step": 84850 + }, + { + "epoch": 12.04542228530873, + "grad_norm": 0.004331925883889198, + "learning_rate": 8.79598296664301e-05, + "loss": 0.03522735834121704, + "step": 84860 + }, + { + "epoch": 12.046841731724628, + "grad_norm": 8.60290813446045, + "learning_rate": 8.79584102200142e-05, + "loss": 0.020204275846481323, + "step": 84870 + }, + { + "epoch": 12.048261178140525, + "grad_norm": 0.041052740067243576, + "learning_rate": 8.79569907735983e-05, + "loss": 0.008595021069049835, + "step": 84880 + }, + { + "epoch": 
12.049680624556423, + "grad_norm": 4.408438205718994, + "learning_rate": 8.79555713271824e-05, + "loss": 0.01435663402080536, + "step": 84890 + }, + { + "epoch": 12.051100070972321, + "grad_norm": 18.28489112854004, + "learning_rate": 8.79541518807665e-05, + "loss": 0.036411243677139285, + "step": 84900 + }, + { + "epoch": 12.052519517388218, + "grad_norm": 0.154909148812294, + "learning_rate": 8.79527324343506e-05, + "loss": 0.03882510662078857, + "step": 84910 + }, + { + "epoch": 12.053938963804116, + "grad_norm": 0.7121272683143616, + "learning_rate": 8.795131298793471e-05, + "loss": 0.0175392284989357, + "step": 84920 + }, + { + "epoch": 12.055358410220014, + "grad_norm": 0.04540358856320381, + "learning_rate": 8.794989354151882e-05, + "loss": 0.004501216858625412, + "step": 84930 + }, + { + "epoch": 12.056777856635913, + "grad_norm": 0.15085090696811676, + "learning_rate": 8.794847409510292e-05, + "loss": 0.017955848574638368, + "step": 84940 + }, + { + "epoch": 12.05819730305181, + "grad_norm": 0.16968463361263275, + "learning_rate": 8.794705464868703e-05, + "loss": 0.0195046991109848, + "step": 84950 + }, + { + "epoch": 12.059616749467708, + "grad_norm": 0.21109625697135925, + "learning_rate": 8.794563520227111e-05, + "loss": 0.035113084316253665, + "step": 84960 + }, + { + "epoch": 12.061036195883606, + "grad_norm": 0.023791976273059845, + "learning_rate": 8.794421575585522e-05, + "loss": 0.017290794849395753, + "step": 84970 + }, + { + "epoch": 12.062455642299502, + "grad_norm": 3.2964980602264404, + "learning_rate": 8.794279630943932e-05, + "loss": 0.044614797830581664, + "step": 84980 + }, + { + "epoch": 12.0638750887154, + "grad_norm": 1.710684061050415, + "learning_rate": 8.794137686302343e-05, + "loss": 0.0351711630821228, + "step": 84990 + }, + { + "epoch": 12.065294535131299, + "grad_norm": 5.109529495239258, + "learning_rate": 8.793995741660753e-05, + "loss": 0.02618541419506073, + "step": 85000 + }, + { + "epoch": 12.065294535131299, + 
"eval_accuracy": 0.9881096203980416, + "eval_loss": 0.044174253940582275, + "eval_runtime": 32.5626, + "eval_samples_per_second": 482.977, + "eval_steps_per_second": 15.109, + "step": 85000 + }, + { + "epoch": 12.066713981547197, + "grad_norm": 2.1556882858276367, + "learning_rate": 8.793853797019163e-05, + "loss": 0.01659655123949051, + "step": 85010 + }, + { + "epoch": 12.068133427963094, + "grad_norm": 1.9246563911437988, + "learning_rate": 8.793711852377574e-05, + "loss": 0.02644846439361572, + "step": 85020 + }, + { + "epoch": 12.069552874378992, + "grad_norm": 0.8818451166152954, + "learning_rate": 8.793569907735983e-05, + "loss": 0.009180599451065063, + "step": 85030 + }, + { + "epoch": 12.07097232079489, + "grad_norm": 0.22749079763889313, + "learning_rate": 8.793427963094394e-05, + "loss": 0.07038314342498779, + "step": 85040 + }, + { + "epoch": 12.072391767210787, + "grad_norm": 0.3684888482093811, + "learning_rate": 8.793286018452804e-05, + "loss": 0.019059973955154418, + "step": 85050 + }, + { + "epoch": 12.073811213626685, + "grad_norm": 1.719611406326294, + "learning_rate": 8.793144073811214e-05, + "loss": 0.004040063545107842, + "step": 85060 + }, + { + "epoch": 12.075230660042584, + "grad_norm": 2.7327284812927246, + "learning_rate": 8.793002129169624e-05, + "loss": 0.025533831119537352, + "step": 85070 + }, + { + "epoch": 12.076650106458482, + "grad_norm": 6.146058082580566, + "learning_rate": 8.792860184528035e-05, + "loss": 0.041575449705123904, + "step": 85080 + }, + { + "epoch": 12.078069552874378, + "grad_norm": 0.6352998614311218, + "learning_rate": 8.792718239886445e-05, + "loss": 0.016179861128330232, + "step": 85090 + }, + { + "epoch": 12.079488999290277, + "grad_norm": 0.15260739624500275, + "learning_rate": 8.792576295244856e-05, + "loss": 0.008620496094226836, + "step": 85100 + }, + { + "epoch": 12.080908445706175, + "grad_norm": 12.496967315673828, + "learning_rate": 8.792434350603265e-05, + "loss": 0.02953541874885559, + "step": 85110 
+ }, + { + "epoch": 12.082327892122072, + "grad_norm": 0.03197462856769562, + "learning_rate": 8.792292405961675e-05, + "loss": 0.03554803729057312, + "step": 85120 + }, + { + "epoch": 12.08374733853797, + "grad_norm": 0.2803524136543274, + "learning_rate": 8.792150461320086e-05, + "loss": 0.02972387373447418, + "step": 85130 + }, + { + "epoch": 12.085166784953868, + "grad_norm": 0.4536881446838379, + "learning_rate": 8.792008516678496e-05, + "loss": 0.018435779213905334, + "step": 85140 + }, + { + "epoch": 12.086586231369767, + "grad_norm": 3.5018510818481445, + "learning_rate": 8.791866572036907e-05, + "loss": 0.011114455759525299, + "step": 85150 + }, + { + "epoch": 12.088005677785663, + "grad_norm": 1.3714983463287354, + "learning_rate": 8.791724627395315e-05, + "loss": 0.024516399204730987, + "step": 85160 + }, + { + "epoch": 12.089425124201561, + "grad_norm": 5.15742826461792, + "learning_rate": 8.791582682753726e-05, + "loss": 0.006654751300811767, + "step": 85170 + }, + { + "epoch": 12.09084457061746, + "grad_norm": 0.24019791185855865, + "learning_rate": 8.791440738112136e-05, + "loss": 0.025834381580352783, + "step": 85180 + }, + { + "epoch": 12.092264017033356, + "grad_norm": 0.5296810269355774, + "learning_rate": 8.791298793470547e-05, + "loss": 0.04131495654582977, + "step": 85190 + }, + { + "epoch": 12.093683463449254, + "grad_norm": 0.09318207204341888, + "learning_rate": 8.791156848828957e-05, + "loss": 0.010322081297636032, + "step": 85200 + }, + { + "epoch": 12.095102909865153, + "grad_norm": 0.1374143660068512, + "learning_rate": 8.791014904187367e-05, + "loss": 0.08588246107101441, + "step": 85210 + }, + { + "epoch": 12.096522356281051, + "grad_norm": 0.15659472346305847, + "learning_rate": 8.790872959545778e-05, + "loss": 0.014092278480529786, + "step": 85220 + }, + { + "epoch": 12.097941802696948, + "grad_norm": 3.863607883453369, + "learning_rate": 8.790731014904188e-05, + "loss": 0.004379024729132652, + "step": 85230 + }, + { + "epoch": 
12.099361249112846, + "grad_norm": 0.365823358297348, + "learning_rate": 8.790589070262599e-05, + "loss": 0.024506431818008424, + "step": 85240 + }, + { + "epoch": 12.100780695528744, + "grad_norm": 0.06521604210138321, + "learning_rate": 8.790447125621008e-05, + "loss": 0.048830336332321166, + "step": 85250 + }, + { + "epoch": 12.10220014194464, + "grad_norm": 4.4670233726501465, + "learning_rate": 8.790305180979418e-05, + "loss": 0.05310940742492676, + "step": 85260 + }, + { + "epoch": 12.103619588360539, + "grad_norm": 0.0818549171090126, + "learning_rate": 8.790163236337828e-05, + "loss": 0.0211592435836792, + "step": 85270 + }, + { + "epoch": 12.105039034776437, + "grad_norm": 14.488906860351562, + "learning_rate": 8.790021291696239e-05, + "loss": 0.06654778718948365, + "step": 85280 + }, + { + "epoch": 12.106458481192336, + "grad_norm": 0.9826055765151978, + "learning_rate": 8.789879347054649e-05, + "loss": 0.022806084156036376, + "step": 85290 + }, + { + "epoch": 12.107877927608232, + "grad_norm": 6.558762550354004, + "learning_rate": 8.78973740241306e-05, + "loss": 0.07291821837425232, + "step": 85300 + }, + { + "epoch": 12.10929737402413, + "grad_norm": 0.09733277559280396, + "learning_rate": 8.78959545777147e-05, + "loss": 0.02255593240261078, + "step": 85310 + }, + { + "epoch": 12.110716820440029, + "grad_norm": 0.38312506675720215, + "learning_rate": 8.789453513129879e-05, + "loss": 0.028494805097579956, + "step": 85320 + }, + { + "epoch": 12.112136266855925, + "grad_norm": 0.46621382236480713, + "learning_rate": 8.78931156848829e-05, + "loss": 0.016409771144390108, + "step": 85330 + }, + { + "epoch": 12.113555713271824, + "grad_norm": 7.448116779327393, + "learning_rate": 8.7891696238467e-05, + "loss": 0.02532399296760559, + "step": 85340 + }, + { + "epoch": 12.114975159687722, + "grad_norm": 0.5830575227737427, + "learning_rate": 8.789027679205111e-05, + "loss": 0.016569083929061888, + "step": 85350 + }, + { + "epoch": 12.11639460610362, + 
"grad_norm": 1.3412967920303345, + "learning_rate": 8.788885734563521e-05, + "loss": 0.04238985180854797, + "step": 85360 + }, + { + "epoch": 12.117814052519517, + "grad_norm": 0.38850438594818115, + "learning_rate": 8.788743789921931e-05, + "loss": 0.01213361620903015, + "step": 85370 + }, + { + "epoch": 12.119233498935415, + "grad_norm": 13.0916166305542, + "learning_rate": 8.78860184528034e-05, + "loss": 0.026457768678665162, + "step": 85380 + }, + { + "epoch": 12.120652945351313, + "grad_norm": 10.388897895812988, + "learning_rate": 8.788459900638752e-05, + "loss": 0.06054552793502808, + "step": 85390 + }, + { + "epoch": 12.12207239176721, + "grad_norm": 0.6791418790817261, + "learning_rate": 8.788317955997161e-05, + "loss": 0.010788274556398391, + "step": 85400 + }, + { + "epoch": 12.123491838183108, + "grad_norm": 6.098667621612549, + "learning_rate": 8.788176011355572e-05, + "loss": 0.016460536420345305, + "step": 85410 + }, + { + "epoch": 12.124911284599007, + "grad_norm": 0.7227396965026855, + "learning_rate": 8.788034066713982e-05, + "loss": 0.005558209121227264, + "step": 85420 + }, + { + "epoch": 12.126330731014905, + "grad_norm": 0.8304058909416199, + "learning_rate": 8.787892122072392e-05, + "loss": 0.040136903524398804, + "step": 85430 + }, + { + "epoch": 12.127750177430801, + "grad_norm": 10.306302070617676, + "learning_rate": 8.787750177430803e-05, + "loss": 0.04540317952632904, + "step": 85440 + }, + { + "epoch": 12.1291696238467, + "grad_norm": 0.0453554131090641, + "learning_rate": 8.787608232789213e-05, + "loss": 0.030978840589523316, + "step": 85450 + }, + { + "epoch": 12.130589070262598, + "grad_norm": 1.693662405014038, + "learning_rate": 8.787466288147624e-05, + "loss": 0.07573475837707519, + "step": 85460 + }, + { + "epoch": 12.132008516678495, + "grad_norm": 0.27647534012794495, + "learning_rate": 8.787324343506032e-05, + "loss": 0.04110590517520905, + "step": 85470 + }, + { + "epoch": 12.133427963094393, + "grad_norm": 
0.5244948267936707, + "learning_rate": 8.787182398864443e-05, + "loss": 0.04320420622825623, + "step": 85480 + }, + { + "epoch": 12.134847409510291, + "grad_norm": 0.07078727334737778, + "learning_rate": 8.787040454222853e-05, + "loss": 0.026226553320884704, + "step": 85490 + }, + { + "epoch": 12.13626685592619, + "grad_norm": 0.5406977534294128, + "learning_rate": 8.786898509581264e-05, + "loss": 0.0808382511138916, + "step": 85500 + }, + { + "epoch": 12.13626685592619, + "eval_accuracy": 0.9654733897119603, + "eval_loss": 0.12036281824111938, + "eval_runtime": 31.5777, + "eval_samples_per_second": 498.042, + "eval_steps_per_second": 15.581, + "step": 85500 + }, + { + "epoch": 12.137686302342086, + "grad_norm": 0.1550203561782837, + "learning_rate": 8.786756564939674e-05, + "loss": 0.06799649596214294, + "step": 85510 + }, + { + "epoch": 12.139105748757984, + "grad_norm": 4.776898384094238, + "learning_rate": 8.786614620298084e-05, + "loss": 0.03676438629627228, + "step": 85520 + }, + { + "epoch": 12.140525195173883, + "grad_norm": 7.222288131713867, + "learning_rate": 8.786472675656495e-05, + "loss": 0.03599470853805542, + "step": 85530 + }, + { + "epoch": 12.14194464158978, + "grad_norm": 1.93971586227417, + "learning_rate": 8.786330731014904e-05, + "loss": 0.01595723330974579, + "step": 85540 + }, + { + "epoch": 12.143364088005677, + "grad_norm": 14.762378692626953, + "learning_rate": 8.786188786373315e-05, + "loss": 0.03826345801353455, + "step": 85550 + }, + { + "epoch": 12.144783534421576, + "grad_norm": 0.04824106767773628, + "learning_rate": 8.786046841731725e-05, + "loss": 0.03802756071090698, + "step": 85560 + }, + { + "epoch": 12.146202980837474, + "grad_norm": 7.385923385620117, + "learning_rate": 8.785904897090135e-05, + "loss": 0.016000357270240784, + "step": 85570 + }, + { + "epoch": 12.14762242725337, + "grad_norm": 0.8453985452651978, + "learning_rate": 8.785762952448545e-05, + "loss": 0.02127433121204376, + "step": 85580 + }, + { + "epoch": 
12.149041873669269, + "grad_norm": 0.05099086835980415, + "learning_rate": 8.785621007806956e-05, + "loss": 0.06052837371826172, + "step": 85590 + }, + { + "epoch": 12.150461320085167, + "grad_norm": 0.31436964869499207, + "learning_rate": 8.785479063165366e-05, + "loss": 0.01670397222042084, + "step": 85600 + }, + { + "epoch": 12.151880766501064, + "grad_norm": 3.1501166820526123, + "learning_rate": 8.785337118523777e-05, + "loss": 0.013452109694480897, + "step": 85610 + }, + { + "epoch": 12.153300212916962, + "grad_norm": 1.261924147605896, + "learning_rate": 8.785195173882186e-05, + "loss": 0.019531291723251343, + "step": 85620 + }, + { + "epoch": 12.15471965933286, + "grad_norm": 0.0056761568412184715, + "learning_rate": 8.785053229240596e-05, + "loss": 0.0211712047457695, + "step": 85630 + }, + { + "epoch": 12.156139105748759, + "grad_norm": 0.9235444664955139, + "learning_rate": 8.784911284599007e-05, + "loss": 0.03171195089817047, + "step": 85640 + }, + { + "epoch": 12.157558552164655, + "grad_norm": 0.06382320076227188, + "learning_rate": 8.784769339957417e-05, + "loss": 0.03569975793361664, + "step": 85650 + }, + { + "epoch": 12.158977998580554, + "grad_norm": 2.014090061187744, + "learning_rate": 8.784627395315828e-05, + "loss": 0.045996904373168945, + "step": 85660 + }, + { + "epoch": 12.160397444996452, + "grad_norm": 0.9534714818000793, + "learning_rate": 8.784485450674238e-05, + "loss": 0.10697624683380128, + "step": 85670 + }, + { + "epoch": 12.161816891412348, + "grad_norm": 0.5350373387336731, + "learning_rate": 8.784343506032647e-05, + "loss": 0.010471509397029876, + "step": 85680 + }, + { + "epoch": 12.163236337828247, + "grad_norm": 1.1189122200012207, + "learning_rate": 8.784201561391057e-05, + "loss": 0.020827175676822664, + "step": 85690 + }, + { + "epoch": 12.164655784244145, + "grad_norm": 6.082579612731934, + "learning_rate": 8.784059616749468e-05, + "loss": 0.01840526908636093, + "step": 85700 + }, + { + "epoch": 12.166075230660043, + 
"grad_norm": 0.36387625336647034, + "learning_rate": 8.783917672107878e-05, + "loss": 0.04171132147312164, + "step": 85710 + }, + { + "epoch": 12.16749467707594, + "grad_norm": 0.13835683465003967, + "learning_rate": 8.783775727466289e-05, + "loss": 0.0185529425740242, + "step": 85720 + }, + { + "epoch": 12.168914123491838, + "grad_norm": 0.04023800790309906, + "learning_rate": 8.783633782824699e-05, + "loss": 0.02126876711845398, + "step": 85730 + }, + { + "epoch": 12.170333569907736, + "grad_norm": 7.292525291442871, + "learning_rate": 8.783491838183109e-05, + "loss": 0.041138219833374026, + "step": 85740 + }, + { + "epoch": 12.171753016323633, + "grad_norm": 0.21447692811489105, + "learning_rate": 8.78334989354152e-05, + "loss": 0.046198248863220215, + "step": 85750 + }, + { + "epoch": 12.173172462739531, + "grad_norm": 3.7342817783355713, + "learning_rate": 8.78320794889993e-05, + "loss": 0.01802656352519989, + "step": 85760 + }, + { + "epoch": 12.17459190915543, + "grad_norm": 2.4353530406951904, + "learning_rate": 8.78306600425834e-05, + "loss": 0.01277841180562973, + "step": 85770 + }, + { + "epoch": 12.176011355571328, + "grad_norm": 1.732850193977356, + "learning_rate": 8.782924059616749e-05, + "loss": 0.008014071732759476, + "step": 85780 + }, + { + "epoch": 12.177430801987224, + "grad_norm": 0.05266590043902397, + "learning_rate": 8.78278211497516e-05, + "loss": 0.04063029289245605, + "step": 85790 + }, + { + "epoch": 12.178850248403123, + "grad_norm": 0.3241954445838928, + "learning_rate": 8.78264017033357e-05, + "loss": 0.012653402984142303, + "step": 85800 + }, + { + "epoch": 12.180269694819021, + "grad_norm": 0.054089661687612534, + "learning_rate": 8.782498225691981e-05, + "loss": 0.005601692199707031, + "step": 85810 + }, + { + "epoch": 12.181689141234918, + "grad_norm": 0.6758180856704712, + "learning_rate": 8.78235628105039e-05, + "loss": 0.008006727695465088, + "step": 85820 + }, + { + "epoch": 12.183108587650816, + "grad_norm": 
0.05027813836932182, + "learning_rate": 8.7822143364088e-05, + "loss": 0.014886750280857087, + "step": 85830 + }, + { + "epoch": 12.184528034066714, + "grad_norm": 1.267127275466919, + "learning_rate": 8.782072391767211e-05, + "loss": 0.012593789398670197, + "step": 85840 + }, + { + "epoch": 12.185947480482612, + "grad_norm": 3.4801025390625, + "learning_rate": 8.781930447125621e-05, + "loss": 0.028322494029998778, + "step": 85850 + }, + { + "epoch": 12.187366926898509, + "grad_norm": 0.7395821809768677, + "learning_rate": 8.781788502484032e-05, + "loss": 0.021465349197387695, + "step": 85860 + }, + { + "epoch": 12.188786373314407, + "grad_norm": 3.554295539855957, + "learning_rate": 8.781646557842442e-05, + "loss": 0.0428952544927597, + "step": 85870 + }, + { + "epoch": 12.190205819730306, + "grad_norm": 3.1385884284973145, + "learning_rate": 8.781504613200852e-05, + "loss": 0.031047120690345764, + "step": 85880 + }, + { + "epoch": 12.191625266146202, + "grad_norm": 3.8755886554718018, + "learning_rate": 8.781362668559261e-05, + "loss": 0.012460941076278686, + "step": 85890 + }, + { + "epoch": 12.1930447125621, + "grad_norm": 7.198748588562012, + "learning_rate": 8.781220723917673e-05, + "loss": 0.04200557470321655, + "step": 85900 + }, + { + "epoch": 12.194464158977999, + "grad_norm": 10.587272644042969, + "learning_rate": 8.781078779276082e-05, + "loss": 0.03208665251731872, + "step": 85910 + }, + { + "epoch": 12.195883605393897, + "grad_norm": 0.7728947401046753, + "learning_rate": 8.780936834634493e-05, + "loss": 0.013271036744117736, + "step": 85920 + }, + { + "epoch": 12.197303051809794, + "grad_norm": 2.66263747215271, + "learning_rate": 8.780794889992903e-05, + "loss": 0.031674724817276, + "step": 85930 + }, + { + "epoch": 12.198722498225692, + "grad_norm": 1.868409514427185, + "learning_rate": 8.780652945351313e-05, + "loss": 0.008502017706632614, + "step": 85940 + }, + { + "epoch": 12.20014194464159, + "grad_norm": 0.21075168251991272, + "learning_rate": 
8.780511000709724e-05, + "loss": 0.10288900136947632, + "step": 85950 + }, + { + "epoch": 12.201561391057487, + "grad_norm": 7.489137649536133, + "learning_rate": 8.780369056068134e-05, + "loss": 0.02367044985294342, + "step": 85960 + }, + { + "epoch": 12.202980837473385, + "grad_norm": 6.909646511077881, + "learning_rate": 8.780227111426545e-05, + "loss": 0.016005274653434754, + "step": 85970 + }, + { + "epoch": 12.204400283889283, + "grad_norm": 0.8582497239112854, + "learning_rate": 8.780085166784953e-05, + "loss": 0.030254873633384704, + "step": 85980 + }, + { + "epoch": 12.205819730305182, + "grad_norm": 0.6521355509757996, + "learning_rate": 8.779943222143364e-05, + "loss": 0.027053722739219667, + "step": 85990 + }, + { + "epoch": 12.207239176721078, + "grad_norm": 1.3930158615112305, + "learning_rate": 8.779801277501774e-05, + "loss": 0.08021281957626343, + "step": 86000 + }, + { + "epoch": 12.207239176721078, + "eval_accuracy": 0.9818782984676034, + "eval_loss": 0.06434565782546997, + "eval_runtime": 31.8889, + "eval_samples_per_second": 493.181, + "eval_steps_per_second": 15.429, + "step": 86000 + }, + { + "epoch": 12.208658623136976, + "grad_norm": 1.3139967918395996, + "learning_rate": 8.779659332860185e-05, + "loss": 0.017941921949386597, + "step": 86010 + }, + { + "epoch": 12.210078069552875, + "grad_norm": 1.020004391670227, + "learning_rate": 8.779517388218595e-05, + "loss": 0.03532530665397644, + "step": 86020 + }, + { + "epoch": 12.211497515968771, + "grad_norm": 0.030459938570857048, + "learning_rate": 8.779375443577006e-05, + "loss": 0.020873503386974336, + "step": 86030 + }, + { + "epoch": 12.21291696238467, + "grad_norm": 0.11012019962072372, + "learning_rate": 8.779233498935416e-05, + "loss": 0.03752686977386475, + "step": 86040 + }, + { + "epoch": 12.214336408800568, + "grad_norm": 10.345964431762695, + "learning_rate": 8.779091554293825e-05, + "loss": 0.025034779310226442, + "step": 86050 + }, + { + "epoch": 12.215755855216466, + 
"grad_norm": 4.960020065307617, + "learning_rate": 8.778949609652236e-05, + "loss": 0.06039838194847107, + "step": 86060 + }, + { + "epoch": 12.217175301632363, + "grad_norm": 0.007862528786063194, + "learning_rate": 8.778807665010646e-05, + "loss": 0.02569767236709595, + "step": 86070 + }, + { + "epoch": 12.218594748048261, + "grad_norm": 2.3586621284484863, + "learning_rate": 8.778665720369057e-05, + "loss": 0.007292249798774719, + "step": 86080 + }, + { + "epoch": 12.22001419446416, + "grad_norm": 0.06399425864219666, + "learning_rate": 8.778523775727466e-05, + "loss": 0.012808781862258912, + "step": 86090 + }, + { + "epoch": 12.221433640880056, + "grad_norm": 6.091718673706055, + "learning_rate": 8.778381831085877e-05, + "loss": 0.05899875164031983, + "step": 86100 + }, + { + "epoch": 12.222853087295954, + "grad_norm": 0.7794937491416931, + "learning_rate": 8.778239886444287e-05, + "loss": 0.008084338158369064, + "step": 86110 + }, + { + "epoch": 12.224272533711853, + "grad_norm": 0.4178735613822937, + "learning_rate": 8.778097941802698e-05, + "loss": 0.017458078265190125, + "step": 86120 + }, + { + "epoch": 12.22569198012775, + "grad_norm": 0.5997690558433533, + "learning_rate": 8.777955997161109e-05, + "loss": 0.039548417925834654, + "step": 86130 + }, + { + "epoch": 12.227111426543647, + "grad_norm": 0.3146066963672638, + "learning_rate": 8.777814052519517e-05, + "loss": 0.022693848609924315, + "step": 86140 + }, + { + "epoch": 12.228530872959546, + "grad_norm": 0.6580975651741028, + "learning_rate": 8.777672107877928e-05, + "loss": 0.02807498276233673, + "step": 86150 + }, + { + "epoch": 12.229950319375444, + "grad_norm": 1.1448265314102173, + "learning_rate": 8.777530163236338e-05, + "loss": 0.0626761257648468, + "step": 86160 + }, + { + "epoch": 12.231369765791342, + "grad_norm": 3.368934154510498, + "learning_rate": 8.777388218594749e-05, + "loss": 0.0760569453239441, + "step": 86170 + }, + { + "epoch": 12.232789212207239, + "grad_norm": 
2.0013580322265625, + "learning_rate": 8.777246273953159e-05, + "loss": 0.023546977341175078, + "step": 86180 + }, + { + "epoch": 12.234208658623137, + "grad_norm": 0.20961083471775055, + "learning_rate": 8.777104329311569e-05, + "loss": 0.05598819851875305, + "step": 86190 + }, + { + "epoch": 12.235628105039035, + "grad_norm": 0.22564516961574554, + "learning_rate": 8.776962384669978e-05, + "loss": 0.02459408938884735, + "step": 86200 + }, + { + "epoch": 12.237047551454932, + "grad_norm": 10.181661605834961, + "learning_rate": 8.77682044002839e-05, + "loss": 0.03393624722957611, + "step": 86210 + }, + { + "epoch": 12.23846699787083, + "grad_norm": 8.41063117980957, + "learning_rate": 8.7766784953868e-05, + "loss": 0.04015167355537415, + "step": 86220 + }, + { + "epoch": 12.239886444286729, + "grad_norm": 6.153536796569824, + "learning_rate": 8.77653655074521e-05, + "loss": 0.035242652893066405, + "step": 86230 + }, + { + "epoch": 12.241305890702627, + "grad_norm": 0.06803173571825027, + "learning_rate": 8.77639460610362e-05, + "loss": 0.035042256116867065, + "step": 86240 + }, + { + "epoch": 12.242725337118523, + "grad_norm": 0.18211659789085388, + "learning_rate": 8.77625266146203e-05, + "loss": 0.0313386470079422, + "step": 86250 + }, + { + "epoch": 12.244144783534422, + "grad_norm": 0.041296329349279404, + "learning_rate": 8.776110716820441e-05, + "loss": 0.022816309332847597, + "step": 86260 + }, + { + "epoch": 12.24556422995032, + "grad_norm": 1.6136507987976074, + "learning_rate": 8.77596877217885e-05, + "loss": 0.02124781161546707, + "step": 86270 + }, + { + "epoch": 12.246983676366217, + "grad_norm": 2.71836519241333, + "learning_rate": 8.775826827537262e-05, + "loss": 0.0672789990901947, + "step": 86280 + }, + { + "epoch": 12.248403122782115, + "grad_norm": 0.29414162039756775, + "learning_rate": 8.77568488289567e-05, + "loss": 0.014540690183639526, + "step": 86290 + }, + { + "epoch": 12.249822569198013, + "grad_norm": 1.0567787885665894, + 
"learning_rate": 8.775542938254081e-05, + "loss": 0.02073398381471634, + "step": 86300 + }, + { + "epoch": 12.251242015613911, + "grad_norm": 0.6744303107261658, + "learning_rate": 8.775400993612492e-05, + "loss": 0.010882169753313065, + "step": 86310 + }, + { + "epoch": 12.252661462029808, + "grad_norm": 0.1246824786067009, + "learning_rate": 8.775259048970902e-05, + "loss": 0.006403592228889465, + "step": 86320 + }, + { + "epoch": 12.254080908445706, + "grad_norm": 1.0266505479812622, + "learning_rate": 8.775117104329313e-05, + "loss": 0.008550825715065002, + "step": 86330 + }, + { + "epoch": 12.255500354861605, + "grad_norm": 0.10675075650215149, + "learning_rate": 8.774975159687721e-05, + "loss": 0.0436547189950943, + "step": 86340 + }, + { + "epoch": 12.256919801277501, + "grad_norm": 3.3878304958343506, + "learning_rate": 8.774833215046132e-05, + "loss": 0.04974772930145264, + "step": 86350 + }, + { + "epoch": 12.2583392476934, + "grad_norm": 0.26958853006362915, + "learning_rate": 8.774691270404542e-05, + "loss": 0.02548518478870392, + "step": 86360 + }, + { + "epoch": 12.259758694109298, + "grad_norm": 4.33382511138916, + "learning_rate": 8.774549325762953e-05, + "loss": 0.036443135142326354, + "step": 86370 + }, + { + "epoch": 12.261178140525196, + "grad_norm": 2.351497173309326, + "learning_rate": 8.774407381121363e-05, + "loss": 0.040211799740791324, + "step": 86380 + }, + { + "epoch": 12.262597586941093, + "grad_norm": 4.3762407302856445, + "learning_rate": 8.774265436479774e-05, + "loss": 0.02976323664188385, + "step": 86390 + }, + { + "epoch": 12.264017033356991, + "grad_norm": 0.07373414933681488, + "learning_rate": 8.774123491838184e-05, + "loss": 0.0378105491399765, + "step": 86400 + }, + { + "epoch": 12.26543647977289, + "grad_norm": 0.4650828242301941, + "learning_rate": 8.773981547196594e-05, + "loss": 0.030419424176216125, + "step": 86410 + }, + { + "epoch": 12.266855926188786, + "grad_norm": 1.0882513523101807, + "learning_rate": 
8.773839602555005e-05, + "loss": 0.04058949947357178, + "step": 86420 + }, + { + "epoch": 12.268275372604684, + "grad_norm": 0.41338181495666504, + "learning_rate": 8.773697657913414e-05, + "loss": 0.01279766708612442, + "step": 86430 + }, + { + "epoch": 12.269694819020582, + "grad_norm": 1.0868407487869263, + "learning_rate": 8.773555713271825e-05, + "loss": 0.03736717700958252, + "step": 86440 + }, + { + "epoch": 12.27111426543648, + "grad_norm": 0.4957295060157776, + "learning_rate": 8.773413768630234e-05, + "loss": 0.022770945727825165, + "step": 86450 + }, + { + "epoch": 12.272533711852377, + "grad_norm": 7.433931827545166, + "learning_rate": 8.773271823988645e-05, + "loss": 0.04367608726024628, + "step": 86460 + }, + { + "epoch": 12.273953158268275, + "grad_norm": 3.417067527770996, + "learning_rate": 8.773129879347055e-05, + "loss": 0.036573588848114014, + "step": 86470 + }, + { + "epoch": 12.275372604684174, + "grad_norm": 0.04351586103439331, + "learning_rate": 8.772987934705466e-05, + "loss": 0.03823851346969605, + "step": 86480 + }, + { + "epoch": 12.27679205110007, + "grad_norm": 5.9566874504089355, + "learning_rate": 8.772845990063876e-05, + "loss": 0.035113397240638736, + "step": 86490 + }, + { + "epoch": 12.278211497515969, + "grad_norm": 0.16312649846076965, + "learning_rate": 8.772704045422285e-05, + "loss": 0.02089274823665619, + "step": 86500 + }, + { + "epoch": 12.278211497515969, + "eval_accuracy": 0.9846760348445349, + "eval_loss": 0.053695064038038254, + "eval_runtime": 33.0639, + "eval_samples_per_second": 475.655, + "eval_steps_per_second": 14.88, + "step": 86500 + }, + { + "epoch": 12.279630943931867, + "grad_norm": 4.228842735290527, + "learning_rate": 8.772562100780696e-05, + "loss": 0.025933349132537843, + "step": 86510 + }, + { + "epoch": 12.281050390347765, + "grad_norm": 2.5222129821777344, + "learning_rate": 8.772420156139106e-05, + "loss": 0.04900032877922058, + "step": 86520 + }, + { + "epoch": 12.282469836763662, + "grad_norm": 
0.12866684794425964, + "learning_rate": 8.772278211497517e-05, + "loss": 0.012445084750652313, + "step": 86530 + }, + { + "epoch": 12.28388928317956, + "grad_norm": 0.30662041902542114, + "learning_rate": 8.772136266855927e-05, + "loss": 0.013581423461437226, + "step": 86540 + }, + { + "epoch": 12.285308729595458, + "grad_norm": 2.5577304363250732, + "learning_rate": 8.771994322214337e-05, + "loss": 0.012446528673171997, + "step": 86550 + }, + { + "epoch": 12.286728176011355, + "grad_norm": 9.133108139038086, + "learning_rate": 8.771866572036907e-05, + "loss": 0.034515559673309326, + "step": 86560 + }, + { + "epoch": 12.288147622427253, + "grad_norm": 2.261014461517334, + "learning_rate": 8.771724627395316e-05, + "loss": 0.04418281614780426, + "step": 86570 + }, + { + "epoch": 12.289567068843152, + "grad_norm": 1.050868034362793, + "learning_rate": 8.771582682753726e-05, + "loss": 0.049887990951538085, + "step": 86580 + }, + { + "epoch": 12.29098651525905, + "grad_norm": 0.37167036533355713, + "learning_rate": 8.771440738112137e-05, + "loss": 0.07505257725715637, + "step": 86590 + }, + { + "epoch": 12.292405961674946, + "grad_norm": 6.5589399337768555, + "learning_rate": 8.771298793470547e-05, + "loss": 0.009983015060424805, + "step": 86600 + }, + { + "epoch": 12.293825408090845, + "grad_norm": 2.896353244781494, + "learning_rate": 8.771156848828958e-05, + "loss": 0.08249455690383911, + "step": 86610 + }, + { + "epoch": 12.295244854506743, + "grad_norm": 0.8286024928092957, + "learning_rate": 8.771014904187366e-05, + "loss": 0.026764780282974243, + "step": 86620 + }, + { + "epoch": 12.29666430092264, + "grad_norm": 3.3944568634033203, + "learning_rate": 8.770872959545777e-05, + "loss": 0.005344720929861069, + "step": 86630 + }, + { + "epoch": 12.298083747338538, + "grad_norm": 0.10840263217687607, + "learning_rate": 8.770731014904187e-05, + "loss": 0.024625831842422487, + "step": 86640 + }, + { + "epoch": 12.299503193754436, + "grad_norm": 7.262483596801758, + 
"learning_rate": 8.770589070262598e-05, + "loss": 0.014307631552219391, + "step": 86650 + }, + { + "epoch": 12.300922640170334, + "grad_norm": 6.646247386932373, + "learning_rate": 8.770447125621008e-05, + "loss": 0.05552939772605896, + "step": 86660 + }, + { + "epoch": 12.302342086586231, + "grad_norm": 0.5893328189849854, + "learning_rate": 8.770305180979419e-05, + "loss": 0.03609735369682312, + "step": 86670 + }, + { + "epoch": 12.30376153300213, + "grad_norm": 8.235060691833496, + "learning_rate": 8.770163236337829e-05, + "loss": 0.03485163152217865, + "step": 86680 + }, + { + "epoch": 12.305180979418028, + "grad_norm": 0.4184919595718384, + "learning_rate": 8.770021291696239e-05, + "loss": 0.0376891016960144, + "step": 86690 + }, + { + "epoch": 12.306600425833924, + "grad_norm": 0.1280488520860672, + "learning_rate": 8.76987934705465e-05, + "loss": 0.01505531519651413, + "step": 86700 + }, + { + "epoch": 12.308019872249822, + "grad_norm": 7.553830146789551, + "learning_rate": 8.76973740241306e-05, + "loss": 0.06388399600982667, + "step": 86710 + }, + { + "epoch": 12.30943931866572, + "grad_norm": 13.967238426208496, + "learning_rate": 8.76959545777147e-05, + "loss": 0.06212584972381592, + "step": 86720 + }, + { + "epoch": 12.310858765081619, + "grad_norm": 0.37098854780197144, + "learning_rate": 8.769453513129879e-05, + "loss": 0.02619161903858185, + "step": 86730 + }, + { + "epoch": 12.312278211497516, + "grad_norm": 0.16169452667236328, + "learning_rate": 8.76931156848829e-05, + "loss": 0.05232579112052917, + "step": 86740 + }, + { + "epoch": 12.313697657913414, + "grad_norm": 5.871384620666504, + "learning_rate": 8.7691696238467e-05, + "loss": 0.030590057373046875, + "step": 86750 + }, + { + "epoch": 12.315117104329312, + "grad_norm": 0.049284838140010834, + "learning_rate": 8.769027679205111e-05, + "loss": 0.008392262458801269, + "step": 86760 + }, + { + "epoch": 12.316536550745209, + "grad_norm": 0.2415841817855835, + "learning_rate": 
8.76888573456352e-05, + "loss": 0.0048000641167163845, + "step": 86770 + }, + { + "epoch": 12.317955997161107, + "grad_norm": 0.17869527637958527, + "learning_rate": 8.76874378992193e-05, + "loss": 0.06827518343925476, + "step": 86780 + }, + { + "epoch": 12.319375443577005, + "grad_norm": 6.73385763168335, + "learning_rate": 8.768601845280341e-05, + "loss": 0.05431227087974548, + "step": 86790 + }, + { + "epoch": 12.320794889992904, + "grad_norm": 0.7754188179969788, + "learning_rate": 8.768459900638751e-05, + "loss": 0.016968098282814027, + "step": 86800 + }, + { + "epoch": 12.3222143364088, + "grad_norm": 0.057217102497816086, + "learning_rate": 8.768317955997162e-05, + "loss": 0.011527723073959351, + "step": 86810 + }, + { + "epoch": 12.323633782824698, + "grad_norm": 0.12364845722913742, + "learning_rate": 8.768176011355572e-05, + "loss": 0.021833422780036926, + "step": 86820 + }, + { + "epoch": 12.325053229240597, + "grad_norm": 7.2985334396362305, + "learning_rate": 8.768034066713982e-05, + "loss": 0.02750059962272644, + "step": 86830 + }, + { + "epoch": 12.326472675656493, + "grad_norm": 2.3215739727020264, + "learning_rate": 8.767892122072391e-05, + "loss": 0.01575092077255249, + "step": 86840 + }, + { + "epoch": 12.327892122072392, + "grad_norm": 0.5124582052230835, + "learning_rate": 8.767750177430802e-05, + "loss": 0.011324916779994965, + "step": 86850 + }, + { + "epoch": 12.32931156848829, + "grad_norm": 0.26997870206832886, + "learning_rate": 8.767608232789212e-05, + "loss": 0.02369029074907303, + "step": 86860 + }, + { + "epoch": 12.330731014904188, + "grad_norm": 0.14223651587963104, + "learning_rate": 8.767466288147623e-05, + "loss": 0.017847174406051637, + "step": 86870 + }, + { + "epoch": 12.332150461320085, + "grad_norm": 7.334930419921875, + "learning_rate": 8.767324343506033e-05, + "loss": 0.023191067576408386, + "step": 86880 + }, + { + "epoch": 12.333569907735983, + "grad_norm": 0.027162916958332062, + "learning_rate": 8.767182398864443e-05, 
+ "loss": 0.018505416810512543, + "step": 86890 + }, + { + "epoch": 12.334989354151881, + "grad_norm": 0.060514308512210846, + "learning_rate": 8.767040454222854e-05, + "loss": 0.011638328433036804, + "step": 86900 + }, + { + "epoch": 12.336408800567778, + "grad_norm": 0.48239290714263916, + "learning_rate": 8.766898509581264e-05, + "loss": 0.015294665098190307, + "step": 86910 + }, + { + "epoch": 12.337828246983676, + "grad_norm": 0.8900887370109558, + "learning_rate": 8.766756564939675e-05, + "loss": 0.04495021998882294, + "step": 86920 + }, + { + "epoch": 12.339247693399575, + "grad_norm": 0.049312490969896317, + "learning_rate": 8.766614620298083e-05, + "loss": 0.026754915714263916, + "step": 86930 + }, + { + "epoch": 12.340667139815473, + "grad_norm": 0.03092627413570881, + "learning_rate": 8.766472675656494e-05, + "loss": 0.007481810450553894, + "step": 86940 + }, + { + "epoch": 12.34208658623137, + "grad_norm": 0.10244758427143097, + "learning_rate": 8.766330731014904e-05, + "loss": 0.003902510926127434, + "step": 86950 + }, + { + "epoch": 12.343506032647268, + "grad_norm": 4.041190147399902, + "learning_rate": 8.766188786373315e-05, + "loss": 0.02546464204788208, + "step": 86960 + }, + { + "epoch": 12.344925479063166, + "grad_norm": 0.008100933395326138, + "learning_rate": 8.766046841731726e-05, + "loss": 0.0286540150642395, + "step": 86970 + }, + { + "epoch": 12.346344925479062, + "grad_norm": 0.07827972620725632, + "learning_rate": 8.765904897090134e-05, + "loss": 0.014019955694675446, + "step": 86980 + }, + { + "epoch": 12.34776437189496, + "grad_norm": 0.503950297832489, + "learning_rate": 8.765762952448546e-05, + "loss": 0.02017439007759094, + "step": 86990 + }, + { + "epoch": 12.349183818310859, + "grad_norm": 8.430594444274902, + "learning_rate": 8.765621007806955e-05, + "loss": 0.02455626130104065, + "step": 87000 + }, + { + "epoch": 12.349183818310859, + "eval_accuracy": 0.9834679214090418, + "eval_loss": 0.055014777928590775, + "eval_runtime": 
33.1479, + "eval_samples_per_second": 474.449, + "eval_steps_per_second": 14.843, + "step": 87000 + }, + { + "epoch": 12.350603264726757, + "grad_norm": 3.624072790145874, + "learning_rate": 8.765479063165366e-05, + "loss": 0.02058798223733902, + "step": 87010 + }, + { + "epoch": 12.352022711142654, + "grad_norm": 5.688584327697754, + "learning_rate": 8.765337118523776e-05, + "loss": 0.02451040893793106, + "step": 87020 + }, + { + "epoch": 12.353442157558552, + "grad_norm": 0.16633503139019012, + "learning_rate": 8.765195173882187e-05, + "loss": 0.012659680843353272, + "step": 87030 + }, + { + "epoch": 12.35486160397445, + "grad_norm": 10.611780166625977, + "learning_rate": 8.765053229240596e-05, + "loss": 0.05080366134643555, + "step": 87040 + }, + { + "epoch": 12.356281050390347, + "grad_norm": 3.00984525680542, + "learning_rate": 8.764911284599007e-05, + "loss": 0.03359057009220123, + "step": 87050 + }, + { + "epoch": 12.357700496806245, + "grad_norm": 4.661736011505127, + "learning_rate": 8.764769339957416e-05, + "loss": 0.011646966636180877, + "step": 87060 + }, + { + "epoch": 12.359119943222144, + "grad_norm": 1.608762264251709, + "learning_rate": 8.764627395315828e-05, + "loss": 0.015019053220748901, + "step": 87070 + }, + { + "epoch": 12.360539389638042, + "grad_norm": 0.6708681583404541, + "learning_rate": 8.764485450674239e-05, + "loss": 0.024743953347206117, + "step": 87080 + }, + { + "epoch": 12.361958836053939, + "grad_norm": 0.05358989164233208, + "learning_rate": 8.764343506032647e-05, + "loss": 0.015583789348602295, + "step": 87090 + }, + { + "epoch": 12.363378282469837, + "grad_norm": 1.0911883115768433, + "learning_rate": 8.764201561391058e-05, + "loss": 0.021644075214862824, + "step": 87100 + }, + { + "epoch": 12.364797728885735, + "grad_norm": 0.013731296174228191, + "learning_rate": 8.764059616749468e-05, + "loss": 0.010372109711170197, + "step": 87110 + }, + { + "epoch": 12.366217175301632, + "grad_norm": 4.24946403503418, + "learning_rate": 
8.763917672107879e-05, + "loss": 0.024570387601852418, + "step": 87120 + }, + { + "epoch": 12.36763662171753, + "grad_norm": 0.3531741797924042, + "learning_rate": 8.763775727466289e-05, + "loss": 0.030188676714897156, + "step": 87130 + }, + { + "epoch": 12.369056068133428, + "grad_norm": 0.037166986614465714, + "learning_rate": 8.763633782824698e-05, + "loss": 0.015771086513996124, + "step": 87140 + }, + { + "epoch": 12.370475514549327, + "grad_norm": 0.09290240705013275, + "learning_rate": 8.763491838183108e-05, + "loss": 0.013740764558315277, + "step": 87150 + }, + { + "epoch": 12.371894960965223, + "grad_norm": 6.755671501159668, + "learning_rate": 8.763349893541519e-05, + "loss": 0.045585596561431886, + "step": 87160 + }, + { + "epoch": 12.373314407381121, + "grad_norm": 1.835556149482727, + "learning_rate": 8.76320794889993e-05, + "loss": 0.006282643973827362, + "step": 87170 + }, + { + "epoch": 12.37473385379702, + "grad_norm": 0.2747805714607239, + "learning_rate": 8.76306600425834e-05, + "loss": 0.031342515349388124, + "step": 87180 + }, + { + "epoch": 12.376153300212916, + "grad_norm": 0.30110907554626465, + "learning_rate": 8.76292405961675e-05, + "loss": 0.02236345112323761, + "step": 87190 + }, + { + "epoch": 12.377572746628815, + "grad_norm": 0.16461510956287384, + "learning_rate": 8.76278211497516e-05, + "loss": 0.006141844391822815, + "step": 87200 + }, + { + "epoch": 12.378992193044713, + "grad_norm": 3.9134035110473633, + "learning_rate": 8.76264017033357e-05, + "loss": 0.011970852315425873, + "step": 87210 + }, + { + "epoch": 12.380411639460611, + "grad_norm": 12.888188362121582, + "learning_rate": 8.76249822569198e-05, + "loss": 0.026162534952163696, + "step": 87220 + }, + { + "epoch": 12.381831085876508, + "grad_norm": 1.6967792510986328, + "learning_rate": 8.762356281050391e-05, + "loss": 0.017707546055316926, + "step": 87230 + }, + { + "epoch": 12.383250532292406, + "grad_norm": 0.01907888986170292, + "learning_rate": 8.7622143364088e-05, + 
"loss": 0.016062094271183013, + "step": 87240 + }, + { + "epoch": 12.384669978708304, + "grad_norm": 1.7975367307662964, + "learning_rate": 8.762072391767211e-05, + "loss": 0.057080841064453124, + "step": 87250 + }, + { + "epoch": 12.3860894251242, + "grad_norm": 0.19056759774684906, + "learning_rate": 8.761930447125622e-05, + "loss": 0.009392456710338592, + "step": 87260 + }, + { + "epoch": 12.3875088715401, + "grad_norm": 0.057802751660346985, + "learning_rate": 8.761788502484032e-05, + "loss": 0.02272159457206726, + "step": 87270 + }, + { + "epoch": 12.388928317955997, + "grad_norm": 0.046848129481077194, + "learning_rate": 8.761646557842443e-05, + "loss": 0.01983257681131363, + "step": 87280 + }, + { + "epoch": 12.390347764371896, + "grad_norm": 6.277688503265381, + "learning_rate": 8.761504613200851e-05, + "loss": 0.02165149450302124, + "step": 87290 + }, + { + "epoch": 12.391767210787792, + "grad_norm": 6.719855308532715, + "learning_rate": 8.761362668559262e-05, + "loss": 0.06963080167770386, + "step": 87300 + }, + { + "epoch": 12.39318665720369, + "grad_norm": 7.310576915740967, + "learning_rate": 8.761220723917672e-05, + "loss": 0.012097867578268051, + "step": 87310 + }, + { + "epoch": 12.394606103619589, + "grad_norm": 0.18734972178936005, + "learning_rate": 8.761078779276083e-05, + "loss": 0.0036718335002660753, + "step": 87320 + }, + { + "epoch": 12.396025550035485, + "grad_norm": 5.503396511077881, + "learning_rate": 8.760936834634493e-05, + "loss": 0.017102155089378356, + "step": 87330 + }, + { + "epoch": 12.397444996451384, + "grad_norm": 1.302686095237732, + "learning_rate": 8.760794889992903e-05, + "loss": 0.03908879160881042, + "step": 87340 + }, + { + "epoch": 12.398864442867282, + "grad_norm": 0.13944663107395172, + "learning_rate": 8.760652945351314e-05, + "loss": 0.04314888119697571, + "step": 87350 + }, + { + "epoch": 12.40028388928318, + "grad_norm": 0.19432903826236725, + "learning_rate": 8.760511000709723e-05, + "loss": 
0.014165303111076355, + "step": 87360 + }, + { + "epoch": 12.401703335699077, + "grad_norm": 1.674633264541626, + "learning_rate": 8.760369056068135e-05, + "loss": 0.0025971658527851106, + "step": 87370 + }, + { + "epoch": 12.403122782114975, + "grad_norm": 4.423507213592529, + "learning_rate": 8.760227111426544e-05, + "loss": 0.01057143062353134, + "step": 87380 + }, + { + "epoch": 12.404542228530874, + "grad_norm": 0.0152947548776865, + "learning_rate": 8.760085166784955e-05, + "loss": 0.032986536622047424, + "step": 87390 + }, + { + "epoch": 12.40596167494677, + "grad_norm": 0.5295947194099426, + "learning_rate": 8.759943222143364e-05, + "loss": 0.023621892929077147, + "step": 87400 + }, + { + "epoch": 12.407381121362668, + "grad_norm": 0.017911149188876152, + "learning_rate": 8.759801277501775e-05, + "loss": 0.029540061950683594, + "step": 87410 + }, + { + "epoch": 12.408800567778567, + "grad_norm": 0.042582545429468155, + "learning_rate": 8.759659332860185e-05, + "loss": 0.03940750360488891, + "step": 87420 + }, + { + "epoch": 12.410220014194465, + "grad_norm": 14.954813957214355, + "learning_rate": 8.759517388218596e-05, + "loss": 0.054214847087860105, + "step": 87430 + }, + { + "epoch": 12.411639460610361, + "grad_norm": 0.5235926508903503, + "learning_rate": 8.759375443577005e-05, + "loss": 0.06929436922073365, + "step": 87440 + }, + { + "epoch": 12.41305890702626, + "grad_norm": 9.736461639404297, + "learning_rate": 8.759233498935415e-05, + "loss": 0.18118247985839844, + "step": 87450 + }, + { + "epoch": 12.414478353442158, + "grad_norm": 0.6641530990600586, + "learning_rate": 8.759091554293826e-05, + "loss": 0.030246061086654664, + "step": 87460 + }, + { + "epoch": 12.415897799858055, + "grad_norm": 0.011021087877452374, + "learning_rate": 8.758949609652236e-05, + "loss": 0.005495109036564827, + "step": 87470 + }, + { + "epoch": 12.417317246273953, + "grad_norm": 4.158930778503418, + "learning_rate": 8.758807665010647e-05, + "loss": 0.025970342755317687, 
+ "step": 87480 + }, + { + "epoch": 12.418736692689851, + "grad_norm": 0.2828434109687805, + "learning_rate": 8.758665720369057e-05, + "loss": 0.013630589842796326, + "step": 87490 + }, + { + "epoch": 12.42015613910575, + "grad_norm": 0.26347261667251587, + "learning_rate": 8.758523775727467e-05, + "loss": 0.01589176505804062, + "step": 87500 + }, + { + "epoch": 12.42015613910575, + "eval_accuracy": 0.9822598079735487, + "eval_loss": 0.06081771478056908, + "eval_runtime": 32.4567, + "eval_samples_per_second": 484.553, + "eval_steps_per_second": 15.159, + "step": 87500 + }, + { + "epoch": 12.421575585521646, + "grad_norm": 0.53838711977005, + "learning_rate": 8.758381831085876e-05, + "loss": 0.013520075380802155, + "step": 87510 + }, + { + "epoch": 12.422995031937544, + "grad_norm": 0.6461429595947266, + "learning_rate": 8.758239886444287e-05, + "loss": 0.028435495495796204, + "step": 87520 + }, + { + "epoch": 12.424414478353443, + "grad_norm": 0.12812745571136475, + "learning_rate": 8.758097941802697e-05, + "loss": 0.010488449782133102, + "step": 87530 + }, + { + "epoch": 12.42583392476934, + "grad_norm": 0.09482559561729431, + "learning_rate": 8.757955997161108e-05, + "loss": 0.029420602321624755, + "step": 87540 + }, + { + "epoch": 12.427253371185238, + "grad_norm": 0.06975217163562775, + "learning_rate": 8.757814052519518e-05, + "loss": 0.02707561254501343, + "step": 87550 + }, + { + "epoch": 12.428672817601136, + "grad_norm": 5.481614112854004, + "learning_rate": 8.757672107877928e-05, + "loss": 0.033937618136405945, + "step": 87560 + }, + { + "epoch": 12.430092264017034, + "grad_norm": 12.287485122680664, + "learning_rate": 8.757530163236339e-05, + "loss": 0.03466455936431885, + "step": 87570 + }, + { + "epoch": 12.43151171043293, + "grad_norm": 0.046677395701408386, + "learning_rate": 8.757388218594749e-05, + "loss": 0.03979707658290863, + "step": 87580 + }, + { + "epoch": 12.432931156848829, + "grad_norm": 11.278783798217773, + "learning_rate": 
8.75724627395316e-05, + "loss": 0.044660136103630066, + "step": 87590 + }, + { + "epoch": 12.434350603264727, + "grad_norm": 0.04154336825013161, + "learning_rate": 8.757104329311568e-05, + "loss": 0.011902222782373429, + "step": 87600 + }, + { + "epoch": 12.435770049680624, + "grad_norm": 4.308584213256836, + "learning_rate": 8.756962384669979e-05, + "loss": 0.04704400897026062, + "step": 87610 + }, + { + "epoch": 12.437189496096522, + "grad_norm": 0.3787962794303894, + "learning_rate": 8.756820440028389e-05, + "loss": 0.05933155417442322, + "step": 87620 + }, + { + "epoch": 12.43860894251242, + "grad_norm": 2.361604690551758, + "learning_rate": 8.7566784953868e-05, + "loss": 0.048081979155540466, + "step": 87630 + }, + { + "epoch": 12.440028388928319, + "grad_norm": 0.29323363304138184, + "learning_rate": 8.75653655074521e-05, + "loss": 0.013682647049427033, + "step": 87640 + }, + { + "epoch": 12.441447835344215, + "grad_norm": 0.1297416388988495, + "learning_rate": 8.75639460610362e-05, + "loss": 0.02944377064704895, + "step": 87650 + }, + { + "epoch": 12.442867281760114, + "grad_norm": 2.6698644161224365, + "learning_rate": 8.75625266146203e-05, + "loss": 0.03293330073356628, + "step": 87660 + }, + { + "epoch": 12.444286728176012, + "grad_norm": 6.906935691833496, + "learning_rate": 8.75611071682044e-05, + "loss": 0.019272826611995697, + "step": 87670 + }, + { + "epoch": 12.445706174591908, + "grad_norm": 0.045950256288051605, + "learning_rate": 8.755968772178851e-05, + "loss": 0.03073270320892334, + "step": 87680 + }, + { + "epoch": 12.447125621007807, + "grad_norm": 0.040875449776649475, + "learning_rate": 8.755826827537261e-05, + "loss": 0.11078486442565919, + "step": 87690 + }, + { + "epoch": 12.448545067423705, + "grad_norm": 10.369391441345215, + "learning_rate": 8.755684882895671e-05, + "loss": 0.040163788199424746, + "step": 87700 + }, + { + "epoch": 12.449964513839603, + "grad_norm": 14.18911075592041, + "learning_rate": 8.75554293825408e-05, + "loss": 
0.04805826246738434, + "step": 87710 + }, + { + "epoch": 12.4513839602555, + "grad_norm": 0.6210464835166931, + "learning_rate": 8.755400993612492e-05, + "loss": 0.03395252823829651, + "step": 87720 + }, + { + "epoch": 12.452803406671398, + "grad_norm": 0.23483672738075256, + "learning_rate": 8.755259048970901e-05, + "loss": 0.02177533507347107, + "step": 87730 + }, + { + "epoch": 12.454222853087296, + "grad_norm": 0.06254476308822632, + "learning_rate": 8.755117104329312e-05, + "loss": 0.03554804921150208, + "step": 87740 + }, + { + "epoch": 12.455642299503193, + "grad_norm": 0.46991172432899475, + "learning_rate": 8.754975159687722e-05, + "loss": 0.004773439094424248, + "step": 87750 + }, + { + "epoch": 12.457061745919091, + "grad_norm": 0.015528268180787563, + "learning_rate": 8.754833215046132e-05, + "loss": 0.015317653119564057, + "step": 87760 + }, + { + "epoch": 12.45848119233499, + "grad_norm": 0.04909134656190872, + "learning_rate": 8.754691270404543e-05, + "loss": 0.016335402429103852, + "step": 87770 + }, + { + "epoch": 12.459900638750888, + "grad_norm": 2.1774823665618896, + "learning_rate": 8.754549325762953e-05, + "loss": 0.02170611321926117, + "step": 87780 + }, + { + "epoch": 12.461320085166784, + "grad_norm": 0.414140522480011, + "learning_rate": 8.754407381121364e-05, + "loss": 0.04764865934848785, + "step": 87790 + }, + { + "epoch": 12.462739531582683, + "grad_norm": 1.7557913064956665, + "learning_rate": 8.754265436479774e-05, + "loss": 0.05648176074028015, + "step": 87800 + }, + { + "epoch": 12.464158977998581, + "grad_norm": 0.21937665343284607, + "learning_rate": 8.754123491838183e-05, + "loss": 0.0062197927385568615, + "step": 87810 + }, + { + "epoch": 12.465578424414478, + "grad_norm": 6.514895915985107, + "learning_rate": 8.753981547196593e-05, + "loss": 0.037446460127830504, + "step": 87820 + }, + { + "epoch": 12.466997870830376, + "grad_norm": 0.046004533767700195, + "learning_rate": 8.753839602555004e-05, + "loss": 0.020039723813533784, 
+ "step": 87830 + }, + { + "epoch": 12.468417317246274, + "grad_norm": 0.03143101558089256, + "learning_rate": 8.753697657913414e-05, + "loss": 0.004655058309435844, + "step": 87840 + }, + { + "epoch": 12.469836763662173, + "grad_norm": 1.383615255355835, + "learning_rate": 8.753555713271825e-05, + "loss": 0.0048833321779966354, + "step": 87850 + }, + { + "epoch": 12.471256210078069, + "grad_norm": 0.08284850418567657, + "learning_rate": 8.753413768630235e-05, + "loss": 0.03349995911121369, + "step": 87860 + }, + { + "epoch": 12.472675656493967, + "grad_norm": 10.989090919494629, + "learning_rate": 8.753271823988644e-05, + "loss": 0.02032398581504822, + "step": 87870 + }, + { + "epoch": 12.474095102909866, + "grad_norm": 0.06767125427722931, + "learning_rate": 8.753129879347056e-05, + "loss": 0.04788086712360382, + "step": 87880 + }, + { + "epoch": 12.475514549325762, + "grad_norm": 0.33816125988960266, + "learning_rate": 8.752987934705465e-05, + "loss": 0.07616091370582581, + "step": 87890 + }, + { + "epoch": 12.47693399574166, + "grad_norm": 14.20706558227539, + "learning_rate": 8.752845990063876e-05, + "loss": 0.06578752994537354, + "step": 87900 + }, + { + "epoch": 12.478353442157559, + "grad_norm": 2.63919997215271, + "learning_rate": 8.752704045422285e-05, + "loss": 0.02024722993373871, + "step": 87910 + }, + { + "epoch": 12.479772888573457, + "grad_norm": 9.86892318725586, + "learning_rate": 8.752562100780696e-05, + "loss": 0.044533932209014894, + "step": 87920 + }, + { + "epoch": 12.481192334989354, + "grad_norm": 0.9026236534118652, + "learning_rate": 8.752420156139106e-05, + "loss": 0.0754780113697052, + "step": 87930 + }, + { + "epoch": 12.482611781405252, + "grad_norm": 18.482194900512695, + "learning_rate": 8.752278211497517e-05, + "loss": 0.06532397866249084, + "step": 87940 + }, + { + "epoch": 12.48403122782115, + "grad_norm": 0.9888849854469299, + "learning_rate": 8.752136266855926e-05, + "loss": 0.05627738833427429, + "step": 87950 + }, + { + 
"epoch": 12.485450674237047, + "grad_norm": 4.003131866455078, + "learning_rate": 8.751994322214336e-05, + "loss": 0.05450453758239746, + "step": 87960 + }, + { + "epoch": 12.486870120652945, + "grad_norm": 8.732240676879883, + "learning_rate": 8.751852377572747e-05, + "loss": 0.039267003536224365, + "step": 87970 + }, + { + "epoch": 12.488289567068843, + "grad_norm": 1.3493448495864868, + "learning_rate": 8.751710432931157e-05, + "loss": 0.0159692645072937, + "step": 87980 + }, + { + "epoch": 12.489709013484742, + "grad_norm": 0.08096921443939209, + "learning_rate": 8.751568488289568e-05, + "loss": 0.02429681420326233, + "step": 87990 + }, + { + "epoch": 12.491128459900638, + "grad_norm": 5.899989604949951, + "learning_rate": 8.751426543647978e-05, + "loss": 0.04755156338214874, + "step": 88000 + }, + { + "epoch": 12.491128459900638, + "eval_accuracy": 0.9847396197621924, + "eval_loss": 0.05461974814534187, + "eval_runtime": 33.2764, + "eval_samples_per_second": 472.617, + "eval_steps_per_second": 14.785, + "step": 88000 + }, + { + "epoch": 12.492547906316537, + "grad_norm": 0.3637841045856476, + "learning_rate": 8.751284599006388e-05, + "loss": 0.010181383788585662, + "step": 88010 + }, + { + "epoch": 12.493967352732435, + "grad_norm": 0.056884367018938065, + "learning_rate": 8.751142654364797e-05, + "loss": 0.02700384259223938, + "step": 88020 + }, + { + "epoch": 12.495386799148331, + "grad_norm": 0.06662727892398834, + "learning_rate": 8.751000709723208e-05, + "loss": 0.024283269047737123, + "step": 88030 + }, + { + "epoch": 12.49680624556423, + "grad_norm": 2.124976873397827, + "learning_rate": 8.750858765081618e-05, + "loss": 0.03328163623809814, + "step": 88040 + }, + { + "epoch": 12.498225691980128, + "grad_norm": 0.18708670139312744, + "learning_rate": 8.750716820440029e-05, + "loss": 0.035369104146957396, + "step": 88050 + }, + { + "epoch": 12.499645138396026, + "grad_norm": 13.181412696838379, + "learning_rate": 8.750574875798439e-05, + "loss": 
0.02761123776435852, + "step": 88060 + }, + { + "epoch": 12.501064584811923, + "grad_norm": 0.07607295364141464, + "learning_rate": 8.750432931156849e-05, + "loss": 0.008057416975498199, + "step": 88070 + }, + { + "epoch": 12.502484031227821, + "grad_norm": 0.4912075996398926, + "learning_rate": 8.75029098651526e-05, + "loss": 0.026665246486663817, + "step": 88080 + }, + { + "epoch": 12.50390347764372, + "grad_norm": 6.729429721832275, + "learning_rate": 8.75014904187367e-05, + "loss": 0.02897602915763855, + "step": 88090 + }, + { + "epoch": 12.505322924059616, + "grad_norm": 0.07232243567705154, + "learning_rate": 8.75000709723208e-05, + "loss": 0.026740044355392456, + "step": 88100 + }, + { + "epoch": 12.506742370475514, + "grad_norm": 3.6393470764160156, + "learning_rate": 8.74986515259049e-05, + "loss": 0.009813410043716431, + "step": 88110 + }, + { + "epoch": 12.508161816891413, + "grad_norm": 1.4777772426605225, + "learning_rate": 8.7497232079489e-05, + "loss": 0.024714049696922303, + "step": 88120 + }, + { + "epoch": 12.509581263307311, + "grad_norm": 1.4388811588287354, + "learning_rate": 8.74958126330731e-05, + "loss": 0.06101258993148804, + "step": 88130 + }, + { + "epoch": 12.511000709723207, + "grad_norm": 7.557995319366455, + "learning_rate": 8.749439318665721e-05, + "loss": 0.10327945947647095, + "step": 88140 + }, + { + "epoch": 12.512420156139106, + "grad_norm": 9.990800857543945, + "learning_rate": 8.749297374024131e-05, + "loss": 0.05634713768959045, + "step": 88150 + }, + { + "epoch": 12.513839602555004, + "grad_norm": 4.879364967346191, + "learning_rate": 8.749155429382542e-05, + "loss": 0.06440256237983703, + "step": 88160 + }, + { + "epoch": 12.5152590489709, + "grad_norm": 16.748706817626953, + "learning_rate": 8.749013484740952e-05, + "loss": 0.05296328663825989, + "step": 88170 + }, + { + "epoch": 12.516678495386799, + "grad_norm": 2.7976927757263184, + "learning_rate": 8.748871540099361e-05, + "loss": 0.024095140397548676, + "step": 88180 
+ }, + { + "epoch": 12.518097941802697, + "grad_norm": 1.1793253421783447, + "learning_rate": 8.748729595457772e-05, + "loss": 0.02114529013633728, + "step": 88190 + }, + { + "epoch": 12.519517388218595, + "grad_norm": 8.5409517288208, + "learning_rate": 8.748587650816182e-05, + "loss": 0.021783101558685302, + "step": 88200 + }, + { + "epoch": 12.520936834634492, + "grad_norm": 6.478908061981201, + "learning_rate": 8.748445706174593e-05, + "loss": 0.019826227426528932, + "step": 88210 + }, + { + "epoch": 12.52235628105039, + "grad_norm": 1.4375824928283691, + "learning_rate": 8.748303761533002e-05, + "loss": 0.05534259080886841, + "step": 88220 + }, + { + "epoch": 12.523775727466289, + "grad_norm": 0.06427717208862305, + "learning_rate": 8.748161816891413e-05, + "loss": 0.051985299587249754, + "step": 88230 + }, + { + "epoch": 12.525195173882185, + "grad_norm": 0.13583670556545258, + "learning_rate": 8.748019872249822e-05, + "loss": 0.022438764572143555, + "step": 88240 + }, + { + "epoch": 12.526614620298083, + "grad_norm": 5.735683917999268, + "learning_rate": 8.747877927608233e-05, + "loss": 0.04711911678314209, + "step": 88250 + }, + { + "epoch": 12.528034066713982, + "grad_norm": 0.3198873996734619, + "learning_rate": 8.747735982966643e-05, + "loss": 0.027721744775772095, + "step": 88260 + }, + { + "epoch": 12.52945351312988, + "grad_norm": 7.128531455993652, + "learning_rate": 8.747594038325053e-05, + "loss": 0.012630848586559296, + "step": 88270 + }, + { + "epoch": 12.530872959545777, + "grad_norm": 0.5762258768081665, + "learning_rate": 8.747452093683464e-05, + "loss": 0.013543166220188141, + "step": 88280 + }, + { + "epoch": 12.532292405961675, + "grad_norm": 1.9634393453598022, + "learning_rate": 8.747310149041874e-05, + "loss": 0.016037152707576753, + "step": 88290 + }, + { + "epoch": 12.533711852377573, + "grad_norm": 6.396450042724609, + "learning_rate": 8.747168204400285e-05, + "loss": 0.043987932801246646, + "step": 88300 + }, + { + "epoch": 
12.53513129879347, + "grad_norm": 0.2535754442214966, + "learning_rate": 8.747026259758695e-05, + "loss": 0.08850111961364746, + "step": 88310 + }, + { + "epoch": 12.536550745209368, + "grad_norm": 0.5153784155845642, + "learning_rate": 8.746884315117104e-05, + "loss": 0.006086389720439911, + "step": 88320 + }, + { + "epoch": 12.537970191625266, + "grad_norm": 0.9000504612922668, + "learning_rate": 8.746742370475514e-05, + "loss": 0.05115777254104614, + "step": 88330 + }, + { + "epoch": 12.539389638041165, + "grad_norm": 0.12878790497779846, + "learning_rate": 8.746600425833925e-05, + "loss": 0.008339033275842667, + "step": 88340 + }, + { + "epoch": 12.540809084457061, + "grad_norm": 0.431140661239624, + "learning_rate": 8.746458481192335e-05, + "loss": 0.023614758253097536, + "step": 88350 + }, + { + "epoch": 12.54222853087296, + "grad_norm": 0.34187155961990356, + "learning_rate": 8.746316536550746e-05, + "loss": 0.03767357170581818, + "step": 88360 + }, + { + "epoch": 12.543647977288858, + "grad_norm": 0.11099901050329208, + "learning_rate": 8.746174591909156e-05, + "loss": 0.03305617868900299, + "step": 88370 + }, + { + "epoch": 12.545067423704754, + "grad_norm": 0.17631615698337555, + "learning_rate": 8.746032647267565e-05, + "loss": 0.016865435242652892, + "step": 88380 + }, + { + "epoch": 12.546486870120653, + "grad_norm": 0.03785773366689682, + "learning_rate": 8.745890702625977e-05, + "loss": 0.0281915158033371, + "step": 88390 + }, + { + "epoch": 12.547906316536551, + "grad_norm": 0.6064432263374329, + "learning_rate": 8.745748757984386e-05, + "loss": 0.023923471570014954, + "step": 88400 + }, + { + "epoch": 12.54932576295245, + "grad_norm": 3.660982370376587, + "learning_rate": 8.745606813342797e-05, + "loss": 0.011482635140419006, + "step": 88410 + }, + { + "epoch": 12.550745209368346, + "grad_norm": 3.0567476749420166, + "learning_rate": 8.745464868701206e-05, + "loss": 0.015174010396003723, + "step": 88420 + }, + { + "epoch": 12.552164655784244, + 
"grad_norm": 0.22067005932331085, + "learning_rate": 8.745322924059617e-05, + "loss": 0.014688675105571748, + "step": 88430 + }, + { + "epoch": 12.553584102200142, + "grad_norm": 6.446946620941162, + "learning_rate": 8.745180979418027e-05, + "loss": 0.020932187139987946, + "step": 88440 + }, + { + "epoch": 12.555003548616039, + "grad_norm": 1.4346462488174438, + "learning_rate": 8.745039034776438e-05, + "loss": 0.064886873960495, + "step": 88450 + }, + { + "epoch": 12.556422995031937, + "grad_norm": 0.07562988251447678, + "learning_rate": 8.744897090134849e-05, + "loss": 0.012400922179222108, + "step": 88460 + }, + { + "epoch": 12.557842441447836, + "grad_norm": 1.1553202867507935, + "learning_rate": 8.744755145493259e-05, + "loss": 0.02751150131225586, + "step": 88470 + }, + { + "epoch": 12.559261887863734, + "grad_norm": 0.09762617945671082, + "learning_rate": 8.744613200851668e-05, + "loss": 0.04214627742767334, + "step": 88480 + }, + { + "epoch": 12.56068133427963, + "grad_norm": 7.1212592124938965, + "learning_rate": 8.744471256210078e-05, + "loss": 0.027154302597045897, + "step": 88490 + }, + { + "epoch": 12.562100780695529, + "grad_norm": 2.778210163116455, + "learning_rate": 8.744329311568489e-05, + "loss": 0.04185508191585541, + "step": 88500 + }, + { + "epoch": 12.562100780695529, + "eval_accuracy": 0.982069053220576, + "eval_loss": 0.06504856050014496, + "eval_runtime": 32.7255, + "eval_samples_per_second": 480.573, + "eval_steps_per_second": 15.034, + "step": 88500 + }, + { + "epoch": 12.563520227111427, + "grad_norm": 1.278740406036377, + "learning_rate": 8.744187366926899e-05, + "loss": 0.019236615300178526, + "step": 88510 + }, + { + "epoch": 12.564939673527324, + "grad_norm": 0.029653554782271385, + "learning_rate": 8.74404542228531e-05, + "loss": 0.02058974504470825, + "step": 88520 + }, + { + "epoch": 12.566359119943222, + "grad_norm": 2.3865671157836914, + "learning_rate": 8.743903477643718e-05, + "loss": 0.02472461760044098, + "step": 88530 + }, 
+ { + "epoch": 12.56777856635912, + "grad_norm": 0.04514153301715851, + "learning_rate": 8.74376153300213e-05, + "loss": 0.016757330298423766, + "step": 88540 + }, + { + "epoch": 12.569198012775018, + "grad_norm": 2.034675359725952, + "learning_rate": 8.74361958836054e-05, + "loss": 0.017388372123241423, + "step": 88550 + }, + { + "epoch": 12.570617459190915, + "grad_norm": 13.655932426452637, + "learning_rate": 8.74347764371895e-05, + "loss": 0.020960594713687896, + "step": 88560 + }, + { + "epoch": 12.572036905606813, + "grad_norm": 0.2972569763660431, + "learning_rate": 8.743335699077361e-05, + "loss": 0.019746491312980653, + "step": 88570 + }, + { + "epoch": 12.573456352022712, + "grad_norm": 0.038133490830659866, + "learning_rate": 8.74319375443577e-05, + "loss": 0.017399133741855623, + "step": 88580 + }, + { + "epoch": 12.574875798438608, + "grad_norm": 8.638161659240723, + "learning_rate": 8.743051809794181e-05, + "loss": 0.041193830966949466, + "step": 88590 + }, + { + "epoch": 12.576295244854506, + "grad_norm": 5.905753135681152, + "learning_rate": 8.74290986515259e-05, + "loss": 0.012840107083320618, + "step": 88600 + }, + { + "epoch": 12.577714691270405, + "grad_norm": 0.0456351675093174, + "learning_rate": 8.742767920511002e-05, + "loss": 0.05107580423355103, + "step": 88610 + }, + { + "epoch": 12.579134137686303, + "grad_norm": 2.247901201248169, + "learning_rate": 8.742625975869411e-05, + "loss": 0.008354905247688293, + "step": 88620 + }, + { + "epoch": 12.5805535841022, + "grad_norm": 4.338565826416016, + "learning_rate": 8.742484031227821e-05, + "loss": 0.011515699326992035, + "step": 88630 + }, + { + "epoch": 12.581973030518098, + "grad_norm": 10.359500885009766, + "learning_rate": 8.742342086586232e-05, + "loss": 0.08170977830886841, + "step": 88640 + }, + { + "epoch": 12.583392476933996, + "grad_norm": 0.3888038396835327, + "learning_rate": 8.742200141944642e-05, + "loss": 0.023364748060703277, + "step": 88650 + }, + { + "epoch": 
12.584811923349893, + "grad_norm": 6.801969528198242, + "learning_rate": 8.742058197303053e-05, + "loss": 0.039721333980560304, + "step": 88660 + }, + { + "epoch": 12.586231369765791, + "grad_norm": 1.2758439779281616, + "learning_rate": 8.741916252661463e-05, + "loss": 0.007038282603025437, + "step": 88670 + }, + { + "epoch": 12.58765081618169, + "grad_norm": 0.23825159668922424, + "learning_rate": 8.741774308019873e-05, + "loss": 0.028574222326278688, + "step": 88680 + }, + { + "epoch": 12.589070262597588, + "grad_norm": 0.2223246991634369, + "learning_rate": 8.741632363378282e-05, + "loss": 0.030116242170333863, + "step": 88690 + }, + { + "epoch": 12.590489709013484, + "grad_norm": 6.101010322570801, + "learning_rate": 8.741490418736693e-05, + "loss": 0.022591683268547057, + "step": 88700 + }, + { + "epoch": 12.591909155429382, + "grad_norm": 2.12394642829895, + "learning_rate": 8.741348474095103e-05, + "loss": 0.034531053900718686, + "step": 88710 + }, + { + "epoch": 12.59332860184528, + "grad_norm": 0.01367959938943386, + "learning_rate": 8.741206529453514e-05, + "loss": 0.025950926542282104, + "step": 88720 + }, + { + "epoch": 12.594748048261177, + "grad_norm": 0.10593011975288391, + "learning_rate": 8.741064584811924e-05, + "loss": 0.035642963647842404, + "step": 88730 + }, + { + "epoch": 12.596167494677076, + "grad_norm": 0.04683855548501015, + "learning_rate": 8.740922640170334e-05, + "loss": 0.01912751644849777, + "step": 88740 + }, + { + "epoch": 12.597586941092974, + "grad_norm": 0.07964295893907547, + "learning_rate": 8.740780695528745e-05, + "loss": 0.036763134598732, + "step": 88750 + }, + { + "epoch": 12.599006387508872, + "grad_norm": 14.841459274291992, + "learning_rate": 8.740638750887154e-05, + "loss": 0.036690741777420044, + "step": 88760 + }, + { + "epoch": 12.600425833924769, + "grad_norm": 0.33520805835723877, + "learning_rate": 8.740496806245566e-05, + "loss": 0.053361940383911136, + "step": 88770 + }, + { + "epoch": 12.601845280340667, + 
"grad_norm": 1.1256662607192993, + "learning_rate": 8.740354861603974e-05, + "loss": 0.01567111909389496, + "step": 88780 + }, + { + "epoch": 12.603264726756565, + "grad_norm": 13.993224143981934, + "learning_rate": 8.740212916962385e-05, + "loss": 0.06229985952377319, + "step": 88790 + }, + { + "epoch": 12.604684173172462, + "grad_norm": 0.03460023179650307, + "learning_rate": 8.740070972320795e-05, + "loss": 0.027052420377731323, + "step": 88800 + }, + { + "epoch": 12.60610361958836, + "grad_norm": 2.6936776638031006, + "learning_rate": 8.739929027679206e-05, + "loss": 0.02551792562007904, + "step": 88810 + }, + { + "epoch": 12.607523066004259, + "grad_norm": 2.129927396774292, + "learning_rate": 8.739787083037616e-05, + "loss": 0.022685115039348603, + "step": 88820 + }, + { + "epoch": 12.608942512420157, + "grad_norm": 0.14080578088760376, + "learning_rate": 8.739645138396027e-05, + "loss": 0.023080846667289732, + "step": 88830 + }, + { + "epoch": 12.610361958836053, + "grad_norm": 11.936066627502441, + "learning_rate": 8.739503193754436e-05, + "loss": 0.030691647529602052, + "step": 88840 + }, + { + "epoch": 12.611781405251952, + "grad_norm": 0.7048534750938416, + "learning_rate": 8.739361249112846e-05, + "loss": 0.012999877333641052, + "step": 88850 + }, + { + "epoch": 12.61320085166785, + "grad_norm": 0.25170546770095825, + "learning_rate": 8.739219304471257e-05, + "loss": 0.02892131209373474, + "step": 88860 + }, + { + "epoch": 12.614620298083747, + "grad_norm": 14.818376541137695, + "learning_rate": 8.739077359829667e-05, + "loss": 0.05256804823875427, + "step": 88870 + }, + { + "epoch": 12.616039744499645, + "grad_norm": 3.001458168029785, + "learning_rate": 8.738935415188078e-05, + "loss": 0.01690353751182556, + "step": 88880 + }, + { + "epoch": 12.617459190915543, + "grad_norm": 2.985347032546997, + "learning_rate": 8.738793470546487e-05, + "loss": 0.01725500375032425, + "step": 88890 + }, + { + "epoch": 12.618878637331441, + "grad_norm": 
0.04500554874539375, + "learning_rate": 8.738651525904898e-05, + "loss": 0.038158965110778806, + "step": 88900 + }, + { + "epoch": 12.620298083747338, + "grad_norm": 2.2031068801879883, + "learning_rate": 8.738509581263307e-05, + "loss": 0.025946080684661865, + "step": 88910 + }, + { + "epoch": 12.621717530163236, + "grad_norm": 0.04379592835903168, + "learning_rate": 8.738367636621718e-05, + "loss": 0.0072929747402668, + "step": 88920 + }, + { + "epoch": 12.623136976579135, + "grad_norm": 9.492351531982422, + "learning_rate": 8.738225691980128e-05, + "loss": 0.025685006380081178, + "step": 88930 + }, + { + "epoch": 12.624556422995031, + "grad_norm": 0.008654547855257988, + "learning_rate": 8.738083747338538e-05, + "loss": 0.027894750237464905, + "step": 88940 + }, + { + "epoch": 12.62597586941093, + "grad_norm": 0.26972833275794983, + "learning_rate": 8.737941802696949e-05, + "loss": 0.020068514347076415, + "step": 88950 + }, + { + "epoch": 12.627395315826828, + "grad_norm": 0.13917766511440277, + "learning_rate": 8.737799858055359e-05, + "loss": 0.02202069163322449, + "step": 88960 + }, + { + "epoch": 12.628814762242726, + "grad_norm": 0.10656342655420303, + "learning_rate": 8.73765791341377e-05, + "loss": 0.020247262716293336, + "step": 88970 + }, + { + "epoch": 12.630234208658623, + "grad_norm": 9.946798324584961, + "learning_rate": 8.73751596877218e-05, + "loss": 0.01564723551273346, + "step": 88980 + }, + { + "epoch": 12.63165365507452, + "grad_norm": 0.1037072166800499, + "learning_rate": 8.737374024130589e-05, + "loss": 0.012409783899784088, + "step": 88990 + }, + { + "epoch": 12.63307310149042, + "grad_norm": 1.5739264488220215, + "learning_rate": 8.737232079488999e-05, + "loss": 0.03171505033969879, + "step": 89000 + }, + { + "epoch": 12.63307310149042, + "eval_accuracy": 0.9773637693139188, + "eval_loss": 0.08339247107505798, + "eval_runtime": 31.6288, + "eval_samples_per_second": 497.236, + "eval_steps_per_second": 15.555, + "step": 89000 + }, + { + 
"epoch": 12.634492547906316, + "grad_norm": 0.10447607189416885, + "learning_rate": 8.73709013484741e-05, + "loss": 0.040927499532699585, + "step": 89010 + }, + { + "epoch": 12.635911994322214, + "grad_norm": 0.1741301566362381, + "learning_rate": 8.73694819020582e-05, + "loss": 0.0033416420221328734, + "step": 89020 + }, + { + "epoch": 12.637331440738112, + "grad_norm": 1.1373984813690186, + "learning_rate": 8.736806245564231e-05, + "loss": 0.059111952781677246, + "step": 89030 + }, + { + "epoch": 12.63875088715401, + "grad_norm": 1.553276777267456, + "learning_rate": 8.736664300922641e-05, + "loss": 0.009425174444913864, + "step": 89040 + }, + { + "epoch": 12.640170333569907, + "grad_norm": 2.646817922592163, + "learning_rate": 8.73652235628105e-05, + "loss": 0.010878220200538635, + "step": 89050 + }, + { + "epoch": 12.641589779985805, + "grad_norm": 3.485379219055176, + "learning_rate": 8.736380411639462e-05, + "loss": 0.03806718289852142, + "step": 89060 + }, + { + "epoch": 12.643009226401704, + "grad_norm": 5.934311389923096, + "learning_rate": 8.736238466997871e-05, + "loss": 0.043131566047668456, + "step": 89070 + }, + { + "epoch": 12.6444286728176, + "grad_norm": 0.9953676462173462, + "learning_rate": 8.736096522356282e-05, + "loss": 0.019209764897823334, + "step": 89080 + }, + { + "epoch": 12.645848119233499, + "grad_norm": 5.589478015899658, + "learning_rate": 8.735954577714691e-05, + "loss": 0.033051124215126036, + "step": 89090 + }, + { + "epoch": 12.647267565649397, + "grad_norm": 5.7257256507873535, + "learning_rate": 8.735812633073102e-05, + "loss": 0.05738807916641235, + "step": 89100 + }, + { + "epoch": 12.648687012065295, + "grad_norm": 1.0921305418014526, + "learning_rate": 8.735670688431512e-05, + "loss": 0.020826832950115205, + "step": 89110 + }, + { + "epoch": 12.650106458481192, + "grad_norm": 0.13475766777992249, + "learning_rate": 8.735528743789923e-05, + "loss": 0.07699592709541321, + "step": 89120 + }, + { + "epoch": 12.65152590489709, + 
"grad_norm": 0.24702370166778564, + "learning_rate": 8.735386799148332e-05, + "loss": 0.05097814798355103, + "step": 89130 + }, + { + "epoch": 12.652945351312988, + "grad_norm": 3.9055874347686768, + "learning_rate": 8.735244854506742e-05, + "loss": 0.03892681002616882, + "step": 89140 + }, + { + "epoch": 12.654364797728885, + "grad_norm": 3.2908849716186523, + "learning_rate": 8.735102909865153e-05, + "loss": 0.10024460554122924, + "step": 89150 + }, + { + "epoch": 12.655784244144783, + "grad_norm": 11.251198768615723, + "learning_rate": 8.734960965223563e-05, + "loss": 0.03453767895698547, + "step": 89160 + }, + { + "epoch": 12.657203690560682, + "grad_norm": 8.44933795928955, + "learning_rate": 8.734819020581974e-05, + "loss": 0.028436681628227232, + "step": 89170 + }, + { + "epoch": 12.65862313697658, + "grad_norm": 0.1921631097793579, + "learning_rate": 8.734677075940384e-05, + "loss": 0.03966635465621948, + "step": 89180 + }, + { + "epoch": 12.660042583392476, + "grad_norm": 8.453948020935059, + "learning_rate": 8.734535131298795e-05, + "loss": 0.038900962471961974, + "step": 89190 + }, + { + "epoch": 12.661462029808375, + "grad_norm": 2.7020156383514404, + "learning_rate": 8.734393186657203e-05, + "loss": 0.03722269833087921, + "step": 89200 + }, + { + "epoch": 12.662881476224273, + "grad_norm": 0.048562612384557724, + "learning_rate": 8.734251242015614e-05, + "loss": 0.052408403158187865, + "step": 89210 + }, + { + "epoch": 12.66430092264017, + "grad_norm": 9.211213111877441, + "learning_rate": 8.734109297374024e-05, + "loss": 0.09202111959457397, + "step": 89220 + }, + { + "epoch": 12.665720369056068, + "grad_norm": 4.951529502868652, + "learning_rate": 8.733967352732435e-05, + "loss": 0.019138604402542114, + "step": 89230 + }, + { + "epoch": 12.667139815471966, + "grad_norm": 2.420982599258423, + "learning_rate": 8.733825408090845e-05, + "loss": 0.06687188744544983, + "step": 89240 + }, + { + "epoch": 12.668559261887864, + "grad_norm": 6.244063377380371, 
+ "learning_rate": 8.733683463449255e-05, + "loss": 0.030867373943328856, + "step": 89250 + }, + { + "epoch": 12.669978708303761, + "grad_norm": 0.06475788354873657, + "learning_rate": 8.733541518807666e-05, + "loss": 0.021957488358020784, + "step": 89260 + }, + { + "epoch": 12.67139815471966, + "grad_norm": 12.286905288696289, + "learning_rate": 8.733399574166076e-05, + "loss": 0.018048429489135744, + "step": 89270 + }, + { + "epoch": 12.672817601135558, + "grad_norm": 0.29012051224708557, + "learning_rate": 8.733257629524487e-05, + "loss": 0.0073526807129383085, + "step": 89280 + }, + { + "epoch": 12.674237047551454, + "grad_norm": 0.33186063170433044, + "learning_rate": 8.733115684882896e-05, + "loss": 0.013821916282176971, + "step": 89290 + }, + { + "epoch": 12.675656493967352, + "grad_norm": 1.7569838762283325, + "learning_rate": 8.732973740241306e-05, + "loss": 0.03041381239891052, + "step": 89300 + }, + { + "epoch": 12.67707594038325, + "grad_norm": 0.06672785431146622, + "learning_rate": 8.732831795599716e-05, + "loss": 0.02769661545753479, + "step": 89310 + }, + { + "epoch": 12.678495386799149, + "grad_norm": 4.808945655822754, + "learning_rate": 8.732689850958127e-05, + "loss": 0.06973714232444764, + "step": 89320 + }, + { + "epoch": 12.679914833215046, + "grad_norm": 7.926112174987793, + "learning_rate": 8.732547906316537e-05, + "loss": 0.030643126368522643, + "step": 89330 + }, + { + "epoch": 12.681334279630944, + "grad_norm": 0.5075589418411255, + "learning_rate": 8.732405961674948e-05, + "loss": 0.0034361004829406737, + "step": 89340 + }, + { + "epoch": 12.682753726046842, + "grad_norm": 0.45156118273735046, + "learning_rate": 8.732264017033357e-05, + "loss": 0.02787870466709137, + "step": 89350 + }, + { + "epoch": 12.684173172462739, + "grad_norm": 2.541034698486328, + "learning_rate": 8.732122072391767e-05, + "loss": 0.010050937533378601, + "step": 89360 + }, + { + "epoch": 12.685592618878637, + "grad_norm": 7.8918280601501465, + "learning_rate": 
8.731980127750178e-05, + "loss": 0.020843583345413207, + "step": 89370 + }, + { + "epoch": 12.687012065294535, + "grad_norm": 0.7082682847976685, + "learning_rate": 8.731838183108588e-05, + "loss": 0.03357086181640625, + "step": 89380 + }, + { + "epoch": 12.688431511710434, + "grad_norm": 0.05603795126080513, + "learning_rate": 8.731696238466999e-05, + "loss": 0.052737241983413695, + "step": 89390 + }, + { + "epoch": 12.68985095812633, + "grad_norm": 0.9145916700363159, + "learning_rate": 8.731554293825408e-05, + "loss": 0.025367552042007448, + "step": 89400 + }, + { + "epoch": 12.691270404542228, + "grad_norm": 0.27008840441703796, + "learning_rate": 8.731412349183819e-05, + "loss": 0.026930320262908935, + "step": 89410 + }, + { + "epoch": 12.692689850958127, + "grad_norm": 0.9023039937019348, + "learning_rate": 8.731270404542228e-05, + "loss": 0.0035582900047302244, + "step": 89420 + }, + { + "epoch": 12.694109297374023, + "grad_norm": 2.4974489212036133, + "learning_rate": 8.73112845990064e-05, + "loss": 0.013577282428741455, + "step": 89430 + }, + { + "epoch": 12.695528743789922, + "grad_norm": 0.05072787031531334, + "learning_rate": 8.730986515259049e-05, + "loss": 0.010160622000694276, + "step": 89440 + }, + { + "epoch": 12.69694819020582, + "grad_norm": 0.5751616358757019, + "learning_rate": 8.730844570617459e-05, + "loss": 0.01841437667608261, + "step": 89450 + }, + { + "epoch": 12.698367636621718, + "grad_norm": 0.13704656064510345, + "learning_rate": 8.73070262597587e-05, + "loss": 0.014767895638942718, + "step": 89460 + }, + { + "epoch": 12.699787083037615, + "grad_norm": 1.5391329526901245, + "learning_rate": 8.73056068133428e-05, + "loss": 0.00814460813999176, + "step": 89470 + }, + { + "epoch": 12.701206529453513, + "grad_norm": 11.336458206176758, + "learning_rate": 8.730418736692691e-05, + "loss": 0.026091352105140686, + "step": 89480 + }, + { + "epoch": 12.702625975869411, + "grad_norm": 0.3561452329158783, + "learning_rate": 8.7302767920511e-05, + 
"loss": 0.009797683358192444, + "step": 89490 + }, + { + "epoch": 12.704045422285308, + "grad_norm": 4.071054935455322, + "learning_rate": 8.730134847409512e-05, + "loss": 0.008877287060022354, + "step": 89500 + }, + { + "epoch": 12.704045422285308, + "eval_accuracy": 0.9854390538564253, + "eval_loss": 0.04127642139792442, + "eval_runtime": 34.1649, + "eval_samples_per_second": 460.326, + "eval_steps_per_second": 14.401, + "step": 89500 + }, + { + "epoch": 12.705464868701206, + "grad_norm": 1.4673656225204468, + "learning_rate": 8.72999290276792e-05, + "loss": 0.02310786545276642, + "step": 89510 + }, + { + "epoch": 12.706884315117104, + "grad_norm": 0.3841665983200073, + "learning_rate": 8.729850958126331e-05, + "loss": 0.0045328203588724135, + "step": 89520 + }, + { + "epoch": 12.708303761533003, + "grad_norm": 0.1576647162437439, + "learning_rate": 8.729709013484741e-05, + "loss": 0.022032007575035095, + "step": 89530 + }, + { + "epoch": 12.7097232079489, + "grad_norm": 1.5458643436431885, + "learning_rate": 8.729567068843152e-05, + "loss": 0.04079245924949646, + "step": 89540 + }, + { + "epoch": 12.711142654364798, + "grad_norm": 0.017013955861330032, + "learning_rate": 8.729425124201562e-05, + "loss": 0.035155534744262695, + "step": 89550 + }, + { + "epoch": 12.712562100780696, + "grad_norm": 0.8977913856506348, + "learning_rate": 8.729283179559971e-05, + "loss": 0.00961761102080345, + "step": 89560 + }, + { + "epoch": 12.713981547196592, + "grad_norm": 0.07090606540441513, + "learning_rate": 8.729141234918383e-05, + "loss": 0.03372899293899536, + "step": 89570 + }, + { + "epoch": 12.71540099361249, + "grad_norm": 0.10601519793272018, + "learning_rate": 8.728999290276792e-05, + "loss": 0.022131067514419556, + "step": 89580 + }, + { + "epoch": 12.716820440028389, + "grad_norm": 0.08834227919578552, + "learning_rate": 8.728857345635203e-05, + "loss": 0.004504304751753807, + "step": 89590 + }, + { + "epoch": 12.718239886444287, + "grad_norm": 0.6667763590812683, 
+ "learning_rate": 8.728715400993613e-05, + "loss": 0.03264138400554657, + "step": 89600 + }, + { + "epoch": 12.719659332860184, + "grad_norm": 0.06539282947778702, + "learning_rate": 8.728573456352023e-05, + "loss": 0.027134037017822264, + "step": 89610 + }, + { + "epoch": 12.721078779276082, + "grad_norm": 9.912678718566895, + "learning_rate": 8.728431511710433e-05, + "loss": 0.016111087799072266, + "step": 89620 + }, + { + "epoch": 12.72249822569198, + "grad_norm": 3.6912591457366943, + "learning_rate": 8.728289567068844e-05, + "loss": 0.03288787305355072, + "step": 89630 + }, + { + "epoch": 12.723917672107877, + "grad_norm": 1.6711541414260864, + "learning_rate": 8.728147622427253e-05, + "loss": 0.01833134740591049, + "step": 89640 + }, + { + "epoch": 12.725337118523775, + "grad_norm": 0.7487699389457703, + "learning_rate": 8.728005677785665e-05, + "loss": 0.031482148170471194, + "step": 89650 + }, + { + "epoch": 12.726756564939674, + "grad_norm": 11.18490219116211, + "learning_rate": 8.727863733144074e-05, + "loss": 0.025566044449806213, + "step": 89660 + }, + { + "epoch": 12.728176011355572, + "grad_norm": 0.023123400285840034, + "learning_rate": 8.727721788502484e-05, + "loss": 0.026927375793457033, + "step": 89670 + }, + { + "epoch": 12.729595457771469, + "grad_norm": 2.2637369632720947, + "learning_rate": 8.727579843860895e-05, + "loss": 0.0070426806807518, + "step": 89680 + }, + { + "epoch": 12.731014904187367, + "grad_norm": 1.263985276222229, + "learning_rate": 8.727437899219305e-05, + "loss": 0.015499216318130494, + "step": 89690 + }, + { + "epoch": 12.732434350603265, + "grad_norm": 0.1351531594991684, + "learning_rate": 8.727295954577716e-05, + "loss": 0.0721098005771637, + "step": 89700 + }, + { + "epoch": 12.733853797019162, + "grad_norm": 0.5390442609786987, + "learning_rate": 8.727154009936124e-05, + "loss": 0.007533705979585648, + "step": 89710 + }, + { + "epoch": 12.73527324343506, + "grad_norm": 2.3574366569519043, + "learning_rate": 
8.727012065294535e-05, + "loss": 0.010569178313016892, + "step": 89720 + }, + { + "epoch": 12.736692689850958, + "grad_norm": 0.09657580405473709, + "learning_rate": 8.726870120652945e-05, + "loss": 0.029726698994636536, + "step": 89730 + }, + { + "epoch": 12.738112136266857, + "grad_norm": 9.368902206420898, + "learning_rate": 8.726728176011356e-05, + "loss": 0.026808175444602966, + "step": 89740 + }, + { + "epoch": 12.739531582682753, + "grad_norm": 0.05883624777197838, + "learning_rate": 8.726586231369766e-05, + "loss": 0.009979787468910217, + "step": 89750 + }, + { + "epoch": 12.740951029098651, + "grad_norm": 0.6890395879745483, + "learning_rate": 8.726444286728176e-05, + "loss": 0.057667660713195804, + "step": 89760 + }, + { + "epoch": 12.74237047551455, + "grad_norm": 0.05285236984491348, + "learning_rate": 8.726302342086587e-05, + "loss": 0.05786900520324707, + "step": 89770 + }, + { + "epoch": 12.743789921930446, + "grad_norm": 0.22699399292469025, + "learning_rate": 8.726160397444997e-05, + "loss": 0.027164924144744872, + "step": 89780 + }, + { + "epoch": 12.745209368346345, + "grad_norm": 7.963822841644287, + "learning_rate": 8.726018452803408e-05, + "loss": 0.03848668932914734, + "step": 89790 + }, + { + "epoch": 12.746628814762243, + "grad_norm": 0.9460397958755493, + "learning_rate": 8.725876508161817e-05, + "loss": 0.030627134442329406, + "step": 89800 + }, + { + "epoch": 12.748048261178141, + "grad_norm": 16.650693893432617, + "learning_rate": 8.725734563520227e-05, + "loss": 0.038635873794555665, + "step": 89810 + }, + { + "epoch": 12.749467707594038, + "grad_norm": 0.7919514179229736, + "learning_rate": 8.725592618878637e-05, + "loss": 0.02418453395366669, + "step": 89820 + }, + { + "epoch": 12.750887154009936, + "grad_norm": 0.04367893561720848, + "learning_rate": 8.725450674237048e-05, + "loss": 0.013546989858150482, + "step": 89830 + }, + { + "epoch": 12.752306600425834, + "grad_norm": 6.330883979797363, + "learning_rate": 
8.725308729595458e-05, + "loss": 0.05271314382553101, + "step": 89840 + }, + { + "epoch": 12.75372604684173, + "grad_norm": 8.027472496032715, + "learning_rate": 8.725166784953869e-05, + "loss": 0.031959670782089236, + "step": 89850 + }, + { + "epoch": 12.75514549325763, + "grad_norm": 0.03651771694421768, + "learning_rate": 8.72502484031228e-05, + "loss": 0.03219501376152038, + "step": 89860 + }, + { + "epoch": 12.756564939673527, + "grad_norm": 0.16615909337997437, + "learning_rate": 8.724882895670688e-05, + "loss": 0.019113722443580627, + "step": 89870 + }, + { + "epoch": 12.757984386089426, + "grad_norm": 6.589572906494141, + "learning_rate": 8.724740951029099e-05, + "loss": 0.06304314136505126, + "step": 89880 + }, + { + "epoch": 12.759403832505322, + "grad_norm": 0.18990400433540344, + "learning_rate": 8.724599006387509e-05, + "loss": 0.03654749989509583, + "step": 89890 + }, + { + "epoch": 12.76082327892122, + "grad_norm": 0.21855756640434265, + "learning_rate": 8.72445706174592e-05, + "loss": 0.018128784000873567, + "step": 89900 + }, + { + "epoch": 12.762242725337119, + "grad_norm": 0.07131622731685638, + "learning_rate": 8.72431511710433e-05, + "loss": 0.043461188673973083, + "step": 89910 + }, + { + "epoch": 12.763662171753015, + "grad_norm": 0.7148534655570984, + "learning_rate": 8.72417317246274e-05, + "loss": 0.04350023567676544, + "step": 89920 + }, + { + "epoch": 12.765081618168914, + "grad_norm": 0.6358012557029724, + "learning_rate": 8.72403122782115e-05, + "loss": 0.02536094784736633, + "step": 89930 + }, + { + "epoch": 12.766501064584812, + "grad_norm": 5.968013286590576, + "learning_rate": 8.72388928317956e-05, + "loss": 0.06381597518920898, + "step": 89940 + }, + { + "epoch": 12.76792051100071, + "grad_norm": 5.589460849761963, + "learning_rate": 8.723747338537972e-05, + "loss": 0.014945511519908906, + "step": 89950 + }, + { + "epoch": 12.769339957416607, + "grad_norm": 0.3752530813217163, + "learning_rate": 8.723605393896381e-05, + "loss": 
0.02037852108478546, + "step": 89960 + }, + { + "epoch": 12.770759403832505, + "grad_norm": 8.791796684265137, + "learning_rate": 8.723463449254791e-05, + "loss": 0.03790717720985413, + "step": 89970 + }, + { + "epoch": 12.772178850248403, + "grad_norm": 0.8387459516525269, + "learning_rate": 8.723321504613201e-05, + "loss": 0.03979597389698029, + "step": 89980 + }, + { + "epoch": 12.7735982966643, + "grad_norm": 0.14796994626522064, + "learning_rate": 8.723179559971612e-05, + "loss": 0.02478184700012207, + "step": 89990 + }, + { + "epoch": 12.775017743080198, + "grad_norm": 12.8732271194458, + "learning_rate": 8.723037615330022e-05, + "loss": 0.03733752369880676, + "step": 90000 + }, + { + "epoch": 12.775017743080198, + "eval_accuracy": 0.9837222610796719, + "eval_loss": 0.053042687475681305, + "eval_runtime": 32.4327, + "eval_samples_per_second": 484.912, + "eval_steps_per_second": 15.17, + "step": 90000 + }, + { + "epoch": 12.776437189496097, + "grad_norm": 6.415987968444824, + "learning_rate": 8.722895670688433e-05, + "loss": 0.061280882358551024, + "step": 90010 + }, + { + "epoch": 12.777856635911995, + "grad_norm": 0.22661128640174866, + "learning_rate": 8.722753726046841e-05, + "loss": 0.03263997435569763, + "step": 90020 + }, + { + "epoch": 12.779276082327891, + "grad_norm": 5.485864639282227, + "learning_rate": 8.722611781405252e-05, + "loss": 0.027382856607437132, + "step": 90030 + }, + { + "epoch": 12.78069552874379, + "grad_norm": 2.0795586109161377, + "learning_rate": 8.722469836763663e-05, + "loss": 0.06238746643066406, + "step": 90040 + }, + { + "epoch": 12.782114975159688, + "grad_norm": 4.153426647186279, + "learning_rate": 8.722327892122073e-05, + "loss": 0.07989214658737183, + "step": 90050 + }, + { + "epoch": 12.783534421575585, + "grad_norm": 2.226921558380127, + "learning_rate": 8.722185947480484e-05, + "loss": 0.034956902265548706, + "step": 90060 + }, + { + "epoch": 12.784953867991483, + "grad_norm": 6.154031753540039, + "learning_rate": 
8.722044002838892e-05, + "loss": 0.028828498721122742, + "step": 90070 + }, + { + "epoch": 12.786373314407381, + "grad_norm": 0.2934417724609375, + "learning_rate": 8.721902058197304e-05, + "loss": 0.04316558837890625, + "step": 90080 + }, + { + "epoch": 12.78779276082328, + "grad_norm": 0.5022813677787781, + "learning_rate": 8.721760113555713e-05, + "loss": 0.021439780294895173, + "step": 90090 + }, + { + "epoch": 12.789212207239176, + "grad_norm": 0.060481056571006775, + "learning_rate": 8.721618168914124e-05, + "loss": 0.03436042368412018, + "step": 90100 + }, + { + "epoch": 12.790631653655074, + "grad_norm": 8.783007621765137, + "learning_rate": 8.721476224272534e-05, + "loss": 0.023770597577095032, + "step": 90110 + }, + { + "epoch": 12.792051100070973, + "grad_norm": 3.9978301525115967, + "learning_rate": 8.721334279630944e-05, + "loss": 0.01868281066417694, + "step": 90120 + }, + { + "epoch": 12.79347054648687, + "grad_norm": 0.25623878836631775, + "learning_rate": 8.721192334989355e-05, + "loss": 0.018424060940742493, + "step": 90130 + }, + { + "epoch": 12.794889992902768, + "grad_norm": 5.259542465209961, + "learning_rate": 8.721050390347765e-05, + "loss": 0.0191888228058815, + "step": 90140 + }, + { + "epoch": 12.796309439318666, + "grad_norm": 6.814998149871826, + "learning_rate": 8.720908445706176e-05, + "loss": 0.044765892624855044, + "step": 90150 + }, + { + "epoch": 12.797728885734564, + "grad_norm": 7.195475101470947, + "learning_rate": 8.720766501064586e-05, + "loss": 0.03070288598537445, + "step": 90160 + }, + { + "epoch": 12.79914833215046, + "grad_norm": 0.6905927658081055, + "learning_rate": 8.720624556422995e-05, + "loss": 0.018119293451309203, + "step": 90170 + }, + { + "epoch": 12.800567778566359, + "grad_norm": 0.650661051273346, + "learning_rate": 8.720482611781405e-05, + "loss": 0.027859440445899962, + "step": 90180 + }, + { + "epoch": 12.801987224982257, + "grad_norm": 0.24278753995895386, + "learning_rate": 8.720340667139816e-05, + 
"loss": 0.01636711657047272, + "step": 90190 + }, + { + "epoch": 12.803406671398154, + "grad_norm": 3.6955575942993164, + "learning_rate": 8.720198722498226e-05, + "loss": 0.018730704486370087, + "step": 90200 + }, + { + "epoch": 12.804826117814052, + "grad_norm": 0.05204196646809578, + "learning_rate": 8.720056777856637e-05, + "loss": 0.012679100036621094, + "step": 90210 + }, + { + "epoch": 12.80624556422995, + "grad_norm": 5.993483066558838, + "learning_rate": 8.719914833215047e-05, + "loss": 0.08045894503593445, + "step": 90220 + }, + { + "epoch": 12.807665010645849, + "grad_norm": 0.022297637537121773, + "learning_rate": 8.719772888573456e-05, + "loss": 0.004308861494064331, + "step": 90230 + }, + { + "epoch": 12.809084457061745, + "grad_norm": 1.582815408706665, + "learning_rate": 8.719630943931867e-05, + "loss": 0.01453346610069275, + "step": 90240 + }, + { + "epoch": 12.810503903477644, + "grad_norm": 0.013253006152808666, + "learning_rate": 8.719488999290277e-05, + "loss": 0.007113112509250641, + "step": 90250 + }, + { + "epoch": 12.811923349893542, + "grad_norm": 10.942405700683594, + "learning_rate": 8.719347054648688e-05, + "loss": 0.04160553216934204, + "step": 90260 + }, + { + "epoch": 12.813342796309438, + "grad_norm": 1.6827470064163208, + "learning_rate": 8.719205110007098e-05, + "loss": 0.018131645023822786, + "step": 90270 + }, + { + "epoch": 12.814762242725337, + "grad_norm": 15.465213775634766, + "learning_rate": 8.719063165365508e-05, + "loss": 0.025337904691696167, + "step": 90280 + }, + { + "epoch": 12.816181689141235, + "grad_norm": 0.231109157204628, + "learning_rate": 8.718921220723918e-05, + "loss": 0.02845522463321686, + "step": 90290 + }, + { + "epoch": 12.817601135557133, + "grad_norm": 0.13257861137390137, + "learning_rate": 8.718779276082329e-05, + "loss": 0.016551059484481812, + "step": 90300 + }, + { + "epoch": 12.81902058197303, + "grad_norm": 0.6941499710083008, + "learning_rate": 8.718637331440738e-05, + "loss": 
0.03151951730251312, + "step": 90310 + }, + { + "epoch": 12.820440028388928, + "grad_norm": 10.343978881835938, + "learning_rate": 8.71849538679915e-05, + "loss": 0.02398531138896942, + "step": 90320 + }, + { + "epoch": 12.821859474804826, + "grad_norm": 14.737664222717285, + "learning_rate": 8.718353442157559e-05, + "loss": 0.028624552488327026, + "step": 90330 + }, + { + "epoch": 12.823278921220723, + "grad_norm": 2.1050493717193604, + "learning_rate": 8.718211497515969e-05, + "loss": 0.07168601155281067, + "step": 90340 + }, + { + "epoch": 12.824698367636621, + "grad_norm": 6.589676856994629, + "learning_rate": 8.71806955287438e-05, + "loss": 0.045221933722496034, + "step": 90350 + }, + { + "epoch": 12.82611781405252, + "grad_norm": 1.8379383087158203, + "learning_rate": 8.71792760823279e-05, + "loss": 0.05340749621391296, + "step": 90360 + }, + { + "epoch": 12.827537260468418, + "grad_norm": 0.2016642987728119, + "learning_rate": 8.717785663591201e-05, + "loss": 0.01270725429058075, + "step": 90370 + }, + { + "epoch": 12.828956706884314, + "grad_norm": 3.9885146617889404, + "learning_rate": 8.717643718949609e-05, + "loss": 0.03886641561985016, + "step": 90380 + }, + { + "epoch": 12.830376153300213, + "grad_norm": 14.107388496398926, + "learning_rate": 8.71750177430802e-05, + "loss": 0.05452651977539062, + "step": 90390 + }, + { + "epoch": 12.831795599716111, + "grad_norm": 1.8865777254104614, + "learning_rate": 8.71735982966643e-05, + "loss": 0.047424548864364625, + "step": 90400 + }, + { + "epoch": 12.833215046132008, + "grad_norm": 0.6092913150787354, + "learning_rate": 8.717217885024841e-05, + "loss": 0.06444382071495056, + "step": 90410 + }, + { + "epoch": 12.834634492547906, + "grad_norm": 4.197598934173584, + "learning_rate": 8.717075940383251e-05, + "loss": 0.03637649714946747, + "step": 90420 + }, + { + "epoch": 12.836053938963804, + "grad_norm": 0.08683058619499207, + "learning_rate": 8.71693399574166e-05, + "loss": 0.05653232932090759, + "step": 90430 
+ }, + { + "epoch": 12.837473385379703, + "grad_norm": 0.11898249387741089, + "learning_rate": 8.716792051100072e-05, + "loss": 0.006302830576896667, + "step": 90440 + }, + { + "epoch": 12.838892831795599, + "grad_norm": 6.252366065979004, + "learning_rate": 8.716650106458481e-05, + "loss": 0.0305631160736084, + "step": 90450 + }, + { + "epoch": 12.840312278211497, + "grad_norm": 0.15573939681053162, + "learning_rate": 8.716508161816893e-05, + "loss": 0.01531701534986496, + "step": 90460 + }, + { + "epoch": 12.841731724627396, + "grad_norm": 3.1314682960510254, + "learning_rate": 8.716366217175302e-05, + "loss": 0.013777315616607666, + "step": 90470 + }, + { + "epoch": 12.843151171043292, + "grad_norm": 0.027007605880498886, + "learning_rate": 8.716224272533712e-05, + "loss": 0.038998952507972716, + "step": 90480 + }, + { + "epoch": 12.84457061745919, + "grad_norm": 2.1043355464935303, + "learning_rate": 8.716082327892122e-05, + "loss": 0.025404781103134155, + "step": 90490 + }, + { + "epoch": 12.845990063875089, + "grad_norm": 0.07467706501483917, + "learning_rate": 8.715940383250533e-05, + "loss": 0.006243685632944107, + "step": 90500 + }, + { + "epoch": 12.845990063875089, + "eval_accuracy": 0.9862656577859732, + "eval_loss": 0.047130122780799866, + "eval_runtime": 33.3807, + "eval_samples_per_second": 471.14, + "eval_steps_per_second": 14.739, + "step": 90500 + }, + { + "epoch": 12.847409510290987, + "grad_norm": 1.8158254623413086, + "learning_rate": 8.715798438608943e-05, + "loss": 0.010209519416093826, + "step": 90510 + }, + { + "epoch": 12.848828956706884, + "grad_norm": 2.3448336124420166, + "learning_rate": 8.715656493967354e-05, + "loss": 0.05426924824714661, + "step": 90520 + }, + { + "epoch": 12.850248403122782, + "grad_norm": 11.133930206298828, + "learning_rate": 8.715514549325763e-05, + "loss": 0.07704846858978272, + "step": 90530 + }, + { + "epoch": 12.85166784953868, + "grad_norm": 5.328191757202148, + "learning_rate": 8.715372604684173e-05, + 
"loss": 0.04808947145938873, + "step": 90540 + }, + { + "epoch": 12.853087295954577, + "grad_norm": 0.2553696036338806, + "learning_rate": 8.715230660042584e-05, + "loss": 0.0963461935520172, + "step": 90550 + }, + { + "epoch": 12.854506742370475, + "grad_norm": 6.152266979217529, + "learning_rate": 8.715088715400994e-05, + "loss": 0.04504083395004273, + "step": 90560 + }, + { + "epoch": 12.855926188786373, + "grad_norm": 0.06309587508440018, + "learning_rate": 8.714946770759405e-05, + "loss": 0.005174351111054421, + "step": 90570 + }, + { + "epoch": 12.857345635202272, + "grad_norm": 0.1511959433555603, + "learning_rate": 8.714804826117815e-05, + "loss": 0.014291897416114807, + "step": 90580 + }, + { + "epoch": 12.858765081618168, + "grad_norm": 13.10915756225586, + "learning_rate": 8.714662881476225e-05, + "loss": 0.04409765303134918, + "step": 90590 + }, + { + "epoch": 12.860184528034067, + "grad_norm": 1.8174325227737427, + "learning_rate": 8.714520936834634e-05, + "loss": 0.011032898724079133, + "step": 90600 + }, + { + "epoch": 12.861603974449965, + "grad_norm": 0.5066937208175659, + "learning_rate": 8.714378992193045e-05, + "loss": 0.01093606948852539, + "step": 90610 + }, + { + "epoch": 12.863023420865863, + "grad_norm": 8.572830200195312, + "learning_rate": 8.714237047551455e-05, + "loss": 0.057040858268737796, + "step": 90620 + }, + { + "epoch": 12.86444286728176, + "grad_norm": 0.1343429684638977, + "learning_rate": 8.714109297374025e-05, + "loss": 0.024809937179088592, + "step": 90630 + }, + { + "epoch": 12.865862313697658, + "grad_norm": 1.4768531322479248, + "learning_rate": 8.713967352732435e-05, + "loss": 0.029907482862472533, + "step": 90640 + }, + { + "epoch": 12.867281760113556, + "grad_norm": 8.033143997192383, + "learning_rate": 8.713825408090846e-05, + "loss": 0.030059036612510682, + "step": 90650 + }, + { + "epoch": 12.868701206529453, + "grad_norm": 0.09025631844997406, + "learning_rate": 8.713683463449254e-05, + "loss": 0.01888950914144516, 
+ "step": 90660 + }, + { + "epoch": 12.870120652945351, + "grad_norm": 0.6663010716438293, + "learning_rate": 8.713541518807665e-05, + "loss": 0.018822093307971955, + "step": 90670 + }, + { + "epoch": 12.87154009936125, + "grad_norm": 0.09493912756443024, + "learning_rate": 8.713399574166075e-05, + "loss": 0.042925047874450686, + "step": 90680 + }, + { + "epoch": 12.872959545777148, + "grad_norm": 0.49431559443473816, + "learning_rate": 8.713257629524486e-05, + "loss": 0.041593000292778015, + "step": 90690 + }, + { + "epoch": 12.874378992193044, + "grad_norm": 1.4785473346710205, + "learning_rate": 8.713115684882897e-05, + "loss": 0.015245826542377472, + "step": 90700 + }, + { + "epoch": 12.875798438608943, + "grad_norm": 10.384889602661133, + "learning_rate": 8.712973740241306e-05, + "loss": 0.027532801032066345, + "step": 90710 + }, + { + "epoch": 12.87721788502484, + "grad_norm": 0.7916569113731384, + "learning_rate": 8.712831795599717e-05, + "loss": 0.020438848435878752, + "step": 90720 + }, + { + "epoch": 12.878637331440737, + "grad_norm": 0.20680011808872223, + "learning_rate": 8.712689850958126e-05, + "loss": 0.018569478392601015, + "step": 90730 + }, + { + "epoch": 12.880056777856636, + "grad_norm": 4.336511135101318, + "learning_rate": 8.712547906316538e-05, + "loss": 0.012251495569944381, + "step": 90740 + }, + { + "epoch": 12.881476224272534, + "grad_norm": 11.582043647766113, + "learning_rate": 8.712405961674947e-05, + "loss": 0.02740319073200226, + "step": 90750 + }, + { + "epoch": 12.882895670688432, + "grad_norm": 1.1124207973480225, + "learning_rate": 8.712264017033357e-05, + "loss": 0.027994123101234437, + "step": 90760 + }, + { + "epoch": 12.884315117104329, + "grad_norm": 4.376797676086426, + "learning_rate": 8.712122072391767e-05, + "loss": 0.017722992599010466, + "step": 90770 + }, + { + "epoch": 12.885734563520227, + "grad_norm": 1.3260844945907593, + "learning_rate": 8.711980127750178e-05, + "loss": 0.025655841827392577, + "step": 90780 + }, 
+ { + "epoch": 12.887154009936125, + "grad_norm": 0.5041131973266602, + "learning_rate": 8.711838183108589e-05, + "loss": 0.02144547700881958, + "step": 90790 + }, + { + "epoch": 12.888573456352022, + "grad_norm": 0.06130081042647362, + "learning_rate": 8.711696238466999e-05, + "loss": 0.004414296522736549, + "step": 90800 + }, + { + "epoch": 12.88999290276792, + "grad_norm": 0.38922789692878723, + "learning_rate": 8.711554293825408e-05, + "loss": 0.0188855916261673, + "step": 90810 + }, + { + "epoch": 12.891412349183819, + "grad_norm": 7.74660062789917, + "learning_rate": 8.711412349183818e-05, + "loss": 0.035584890842437746, + "step": 90820 + }, + { + "epoch": 12.892831795599717, + "grad_norm": 0.9597287774085999, + "learning_rate": 8.711270404542229e-05, + "loss": 0.04100579619407654, + "step": 90830 + }, + { + "epoch": 12.894251242015613, + "grad_norm": 8.280255317687988, + "learning_rate": 8.711128459900639e-05, + "loss": 0.022879604995250703, + "step": 90840 + }, + { + "epoch": 12.895670688431512, + "grad_norm": 0.06499747931957245, + "learning_rate": 8.71098651525905e-05, + "loss": 0.0381752610206604, + "step": 90850 + }, + { + "epoch": 12.89709013484741, + "grad_norm": 0.22900985181331635, + "learning_rate": 8.710844570617458e-05, + "loss": 0.01734771877527237, + "step": 90860 + }, + { + "epoch": 12.898509581263307, + "grad_norm": 2.9776017665863037, + "learning_rate": 8.71070262597587e-05, + "loss": 0.050455224514007566, + "step": 90870 + }, + { + "epoch": 12.899929027679205, + "grad_norm": 0.06448038667440414, + "learning_rate": 8.71056068133428e-05, + "loss": 0.03693827986717224, + "step": 90880 + }, + { + "epoch": 12.901348474095103, + "grad_norm": 0.2390090674161911, + "learning_rate": 8.71041873669269e-05, + "loss": 0.014416129887104034, + "step": 90890 + }, + { + "epoch": 12.902767920511002, + "grad_norm": 1.1335889101028442, + "learning_rate": 8.710276792051101e-05, + "loss": 0.08491934537887573, + "step": 90900 + }, + { + "epoch": 
12.904187366926898, + "grad_norm": 1.1026262044906616, + "learning_rate": 8.710134847409511e-05, + "loss": 0.01418365240097046, + "step": 90910 + }, + { + "epoch": 12.905606813342796, + "grad_norm": 1.0436301231384277, + "learning_rate": 8.709992902767921e-05, + "loss": 0.017918017506599427, + "step": 90920 + }, + { + "epoch": 12.907026259758695, + "grad_norm": 5.441536903381348, + "learning_rate": 8.70985095812633e-05, + "loss": 0.012655892968177795, + "step": 90930 + }, + { + "epoch": 12.908445706174591, + "grad_norm": 0.10456965863704681, + "learning_rate": 8.709709013484742e-05, + "loss": 0.015247131884098052, + "step": 90940 + }, + { + "epoch": 12.90986515259049, + "grad_norm": 0.014488224871456623, + "learning_rate": 8.709567068843151e-05, + "loss": 0.01566433012485504, + "step": 90950 + }, + { + "epoch": 12.911284599006388, + "grad_norm": 0.009824546054005623, + "learning_rate": 8.709425124201563e-05, + "loss": 0.003042099252343178, + "step": 90960 + }, + { + "epoch": 12.912704045422286, + "grad_norm": 0.9563096165657043, + "learning_rate": 8.709283179559972e-05, + "loss": 0.03195181488990784, + "step": 90970 + }, + { + "epoch": 12.914123491838183, + "grad_norm": 13.064339637756348, + "learning_rate": 8.709141234918382e-05, + "loss": 0.023437163233757018, + "step": 90980 + }, + { + "epoch": 12.915542938254081, + "grad_norm": 0.026108302175998688, + "learning_rate": 8.708999290276793e-05, + "loss": 0.007652238756418228, + "step": 90990 + }, + { + "epoch": 12.91696238466998, + "grad_norm": 0.28071707487106323, + "learning_rate": 8.708857345635203e-05, + "loss": 0.05530444383621216, + "step": 91000 + }, + { + "epoch": 12.91696238466998, + "eval_accuracy": 0.9828320722324665, + "eval_loss": 0.0642710030078888, + "eval_runtime": 32.7388, + "eval_samples_per_second": 480.377, + "eval_steps_per_second": 15.028, + "step": 91000 + }, + { + "epoch": 12.918381831085876, + "grad_norm": 15.131595611572266, + "learning_rate": 8.708715400993614e-05, + "loss": 
0.06838587522506714, + "step": 91010 + }, + { + "epoch": 12.919801277501774, + "grad_norm": 1.486463189125061, + "learning_rate": 8.708573456352022e-05, + "loss": 0.018643665313720702, + "step": 91020 + }, + { + "epoch": 12.921220723917672, + "grad_norm": 2.5784308910369873, + "learning_rate": 8.708431511710433e-05, + "loss": 0.019171588122844696, + "step": 91030 + }, + { + "epoch": 12.92264017033357, + "grad_norm": 0.051978398114442825, + "learning_rate": 8.708289567068843e-05, + "loss": 0.02243105471134186, + "step": 91040 + }, + { + "epoch": 12.924059616749467, + "grad_norm": 0.61635822057724, + "learning_rate": 8.708147622427254e-05, + "loss": 0.002178187668323517, + "step": 91050 + }, + { + "epoch": 12.925479063165366, + "grad_norm": 12.922770500183105, + "learning_rate": 8.708005677785664e-05, + "loss": 0.09026901125907898, + "step": 91060 + }, + { + "epoch": 12.926898509581264, + "grad_norm": 7.460283279418945, + "learning_rate": 8.707863733144074e-05, + "loss": 0.06489250659942628, + "step": 91070 + }, + { + "epoch": 12.92831795599716, + "grad_norm": 7.113394737243652, + "learning_rate": 8.707721788502485e-05, + "loss": 0.047018444538116454, + "step": 91080 + }, + { + "epoch": 12.929737402413059, + "grad_norm": 0.26584914326667786, + "learning_rate": 8.707579843860895e-05, + "loss": 0.028078389167785645, + "step": 91090 + }, + { + "epoch": 12.931156848828957, + "grad_norm": 0.032915230840444565, + "learning_rate": 8.707437899219306e-05, + "loss": 0.04310223460197449, + "step": 91100 + }, + { + "epoch": 12.932576295244855, + "grad_norm": 0.1749131828546524, + "learning_rate": 8.707295954577715e-05, + "loss": 0.02489803433418274, + "step": 91110 + }, + { + "epoch": 12.933995741660752, + "grad_norm": 0.0382566824555397, + "learning_rate": 8.707154009936125e-05, + "loss": 0.04579323828220368, + "step": 91120 + }, + { + "epoch": 12.93541518807665, + "grad_norm": 0.15481582283973694, + "learning_rate": 8.707012065294535e-05, + "loss": 0.02880704402923584, + 
"step": 91130 + }, + { + "epoch": 12.936834634492548, + "grad_norm": 0.1466173231601715, + "learning_rate": 8.706870120652946e-05, + "loss": 0.0475196361541748, + "step": 91140 + }, + { + "epoch": 12.938254080908445, + "grad_norm": 0.21036475896835327, + "learning_rate": 8.706728176011356e-05, + "loss": 0.020145165920257568, + "step": 91150 + }, + { + "epoch": 12.939673527324343, + "grad_norm": 6.4919657707214355, + "learning_rate": 8.706586231369767e-05, + "loss": 0.03438344597816467, + "step": 91160 + }, + { + "epoch": 12.941092973740242, + "grad_norm": 0.09811893850564957, + "learning_rate": 8.706444286728177e-05, + "loss": 0.003525005280971527, + "step": 91170 + }, + { + "epoch": 12.94251242015614, + "grad_norm": 8.104255676269531, + "learning_rate": 8.706302342086586e-05, + "loss": 0.0142546147108078, + "step": 91180 + }, + { + "epoch": 12.943931866572036, + "grad_norm": 2.091709613800049, + "learning_rate": 8.706160397444997e-05, + "loss": 0.00975591540336609, + "step": 91190 + }, + { + "epoch": 12.945351312987935, + "grad_norm": 0.7164672613143921, + "learning_rate": 8.706018452803407e-05, + "loss": 0.018023268878459932, + "step": 91200 + }, + { + "epoch": 12.946770759403833, + "grad_norm": 0.48031216859817505, + "learning_rate": 8.705876508161818e-05, + "loss": 0.02688279151916504, + "step": 91210 + }, + { + "epoch": 12.94819020581973, + "grad_norm": 0.22432000935077667, + "learning_rate": 8.705734563520227e-05, + "loss": 0.06052901148796082, + "step": 91220 + }, + { + "epoch": 12.949609652235628, + "grad_norm": 0.17801009118556976, + "learning_rate": 8.705592618878638e-05, + "loss": 0.024128471314907075, + "step": 91230 + }, + { + "epoch": 12.951029098651526, + "grad_norm": 0.3342018723487854, + "learning_rate": 8.705450674237047e-05, + "loss": 0.022981099784374237, + "step": 91240 + }, + { + "epoch": 12.952448545067424, + "grad_norm": 0.14481079578399658, + "learning_rate": 8.705308729595459e-05, + "loss": 0.01762939989566803, + "step": 91250 + }, + { + 
"epoch": 12.953867991483321, + "grad_norm": 1.8251750469207764, + "learning_rate": 8.705166784953868e-05, + "loss": 0.06004202365875244, + "step": 91260 + }, + { + "epoch": 12.95528743789922, + "grad_norm": 0.30406421422958374, + "learning_rate": 8.70502484031228e-05, + "loss": 0.017629969120025634, + "step": 91270 + }, + { + "epoch": 12.956706884315118, + "grad_norm": 0.018968552350997925, + "learning_rate": 8.704882895670689e-05, + "loss": 0.03772530853748322, + "step": 91280 + }, + { + "epoch": 12.958126330731014, + "grad_norm": 0.34433281421661377, + "learning_rate": 8.704740951029099e-05, + "loss": 0.010790017247200013, + "step": 91290 + }, + { + "epoch": 12.959545777146912, + "grad_norm": 7.834738731384277, + "learning_rate": 8.70459900638751e-05, + "loss": 0.02302001416683197, + "step": 91300 + }, + { + "epoch": 12.96096522356281, + "grad_norm": 1.474596619606018, + "learning_rate": 8.70445706174592e-05, + "loss": 0.020346474647521973, + "step": 91310 + }, + { + "epoch": 12.962384669978709, + "grad_norm": 0.15759314596652985, + "learning_rate": 8.704315117104331e-05, + "loss": 0.04993346631526947, + "step": 91320 + }, + { + "epoch": 12.963804116394606, + "grad_norm": 0.3664032518863678, + "learning_rate": 8.704173172462739e-05, + "loss": 0.023996689915657045, + "step": 91330 + }, + { + "epoch": 12.965223562810504, + "grad_norm": 2.838346004486084, + "learning_rate": 8.70403122782115e-05, + "loss": 0.031538930535316465, + "step": 91340 + }, + { + "epoch": 12.966643009226402, + "grad_norm": 6.921056270599365, + "learning_rate": 8.70388928317956e-05, + "loss": 0.08351738452911377, + "step": 91350 + }, + { + "epoch": 12.968062455642299, + "grad_norm": 7.643664360046387, + "learning_rate": 8.703747338537971e-05, + "loss": 0.03136724233627319, + "step": 91360 + }, + { + "epoch": 12.969481902058197, + "grad_norm": 0.6630023121833801, + "learning_rate": 8.703605393896381e-05, + "loss": 0.018689344823360442, + "step": 91370 + }, + { + "epoch": 12.970901348474095, + 
"grad_norm": 1.5902106761932373, + "learning_rate": 8.70346344925479e-05, + "loss": 0.06974080801010132, + "step": 91380 + }, + { + "epoch": 12.972320794889994, + "grad_norm": 0.12607437372207642, + "learning_rate": 8.703321504613202e-05, + "loss": 0.033649593591690063, + "step": 91390 + }, + { + "epoch": 12.97374024130589, + "grad_norm": 6.137654781341553, + "learning_rate": 8.703179559971611e-05, + "loss": 0.021195295453071594, + "step": 91400 + }, + { + "epoch": 12.975159687721789, + "grad_norm": 0.14958131313323975, + "learning_rate": 8.703037615330022e-05, + "loss": 0.05002725124359131, + "step": 91410 + }, + { + "epoch": 12.976579134137687, + "grad_norm": 2.645599365234375, + "learning_rate": 8.702895670688432e-05, + "loss": 0.01684072017669678, + "step": 91420 + }, + { + "epoch": 12.977998580553583, + "grad_norm": 3.953803062438965, + "learning_rate": 8.702753726046842e-05, + "loss": 0.058473384380340575, + "step": 91430 + }, + { + "epoch": 12.979418026969482, + "grad_norm": 0.7704017162322998, + "learning_rate": 8.702611781405252e-05, + "loss": 0.03435961604118347, + "step": 91440 + }, + { + "epoch": 12.98083747338538, + "grad_norm": 0.45102816820144653, + "learning_rate": 8.702469836763663e-05, + "loss": 0.02208094298839569, + "step": 91450 + }, + { + "epoch": 12.982256919801278, + "grad_norm": 0.9969324469566345, + "learning_rate": 8.702327892122072e-05, + "loss": 0.04234072864055634, + "step": 91460 + }, + { + "epoch": 12.983676366217175, + "grad_norm": 0.06572148203849792, + "learning_rate": 8.702185947480484e-05, + "loss": 0.04454489350318909, + "step": 91470 + }, + { + "epoch": 12.985095812633073, + "grad_norm": 4.023639678955078, + "learning_rate": 8.702044002838893e-05, + "loss": 0.034791293740272525, + "step": 91480 + }, + { + "epoch": 12.986515259048971, + "grad_norm": 4.597278118133545, + "learning_rate": 8.701902058197303e-05, + "loss": 0.015966880321502685, + "step": 91490 + }, + { + "epoch": 12.987934705464868, + "grad_norm": 
0.13566672801971436, + "learning_rate": 8.701760113555714e-05, + "loss": 0.02301221191883087, + "step": 91500 + }, + { + "epoch": 12.987934705464868, + "eval_accuracy": 0.9842309404209322, + "eval_loss": 0.052828311920166016, + "eval_runtime": 32.743, + "eval_samples_per_second": 480.316, + "eval_steps_per_second": 15.026, + "step": 91500 + }, + { + "epoch": 12.989354151880766, + "grad_norm": 0.3484897017478943, + "learning_rate": 8.701618168914124e-05, + "loss": 0.04626628756523132, + "step": 91510 + }, + { + "epoch": 12.990773598296665, + "grad_norm": 0.4147621989250183, + "learning_rate": 8.701476224272535e-05, + "loss": 0.008229909092187881, + "step": 91520 + }, + { + "epoch": 12.992193044712563, + "grad_norm": 0.07728651911020279, + "learning_rate": 8.701334279630943e-05, + "loss": 0.011543260514736175, + "step": 91530 + }, + { + "epoch": 12.99361249112846, + "grad_norm": 0.470676988363266, + "learning_rate": 8.701192334989354e-05, + "loss": 0.030454790592193602, + "step": 91540 + }, + { + "epoch": 12.995031937544358, + "grad_norm": 15.090569496154785, + "learning_rate": 8.701050390347764e-05, + "loss": 0.03786148726940155, + "step": 91550 + }, + { + "epoch": 12.996451383960256, + "grad_norm": 3.7643697261810303, + "learning_rate": 8.700908445706175e-05, + "loss": 0.01893087774515152, + "step": 91560 + }, + { + "epoch": 12.997870830376153, + "grad_norm": 0.3498643636703491, + "learning_rate": 8.700766501064585e-05, + "loss": 0.006908781081438065, + "step": 91570 + }, + { + "epoch": 12.99929027679205, + "grad_norm": 14.163825035095215, + "learning_rate": 8.700624556422995e-05, + "loss": 0.04151703715324402, + "step": 91580 + }, + { + "epoch": 13.00070972320795, + "grad_norm": 16.969526290893555, + "learning_rate": 8.700482611781406e-05, + "loss": 0.056925737857818605, + "step": 91590 + }, + { + "epoch": 13.002129169623847, + "grad_norm": 0.5581228137016296, + "learning_rate": 8.700340667139816e-05, + "loss": 0.008616255968809128, + "step": 91600 + }, + { + 
"epoch": 13.003548616039744, + "grad_norm": 1.8690402507781982, + "learning_rate": 8.700198722498227e-05, + "loss": 0.015176343917846679, + "step": 91610 + }, + { + "epoch": 13.004968062455642, + "grad_norm": 0.2172848880290985, + "learning_rate": 8.700056777856636e-05, + "loss": 0.012149479985237122, + "step": 91620 + }, + { + "epoch": 13.00638750887154, + "grad_norm": 0.11255660653114319, + "learning_rate": 8.699914833215048e-05, + "loss": 0.030363094806671143, + "step": 91630 + }, + { + "epoch": 13.007806955287437, + "grad_norm": 1.7092961072921753, + "learning_rate": 8.699772888573456e-05, + "loss": 0.010361893475055695, + "step": 91640 + }, + { + "epoch": 13.009226401703335, + "grad_norm": 0.0253172367811203, + "learning_rate": 8.699630943931867e-05, + "loss": 0.026502841711044313, + "step": 91650 + }, + { + "epoch": 13.010645848119234, + "grad_norm": 1.3576223850250244, + "learning_rate": 8.699488999290277e-05, + "loss": 0.019933232665061952, + "step": 91660 + }, + { + "epoch": 13.012065294535132, + "grad_norm": 0.20457276701927185, + "learning_rate": 8.699347054648688e-05, + "loss": 0.041373640298843384, + "step": 91670 + }, + { + "epoch": 13.013484740951029, + "grad_norm": 0.9398382306098938, + "learning_rate": 8.699205110007098e-05, + "loss": 0.029924097657203674, + "step": 91680 + }, + { + "epoch": 13.014904187366927, + "grad_norm": 0.24676869809627533, + "learning_rate": 8.699063165365507e-05, + "loss": 0.02562679350376129, + "step": 91690 + }, + { + "epoch": 13.016323633782825, + "grad_norm": 0.5171564221382141, + "learning_rate": 8.698921220723918e-05, + "loss": 0.0322356641292572, + "step": 91700 + }, + { + "epoch": 13.017743080198722, + "grad_norm": 3.6011831760406494, + "learning_rate": 8.698779276082328e-05, + "loss": 0.010308434069156647, + "step": 91710 + }, + { + "epoch": 13.01916252661462, + "grad_norm": 6.4173808097839355, + "learning_rate": 8.698637331440739e-05, + "loss": 0.031694459915161136, + "step": 91720 + }, + { + "epoch": 
13.020581973030518, + "grad_norm": 3.998835563659668, + "learning_rate": 8.698495386799149e-05, + "loss": 0.0390471339225769, + "step": 91730 + }, + { + "epoch": 13.022001419446417, + "grad_norm": 0.6538220643997192, + "learning_rate": 8.698353442157559e-05, + "loss": 0.02336165904998779, + "step": 91740 + }, + { + "epoch": 13.023420865862313, + "grad_norm": 0.06361285597085953, + "learning_rate": 8.698211497515968e-05, + "loss": 0.028032290935516357, + "step": 91750 + }, + { + "epoch": 13.024840312278211, + "grad_norm": 0.8600550889968872, + "learning_rate": 8.69806955287438e-05, + "loss": 0.03384045958518982, + "step": 91760 + }, + { + "epoch": 13.02625975869411, + "grad_norm": 5.134207248687744, + "learning_rate": 8.697927608232789e-05, + "loss": 0.014662571251392365, + "step": 91770 + }, + { + "epoch": 13.027679205110006, + "grad_norm": 3.7682700157165527, + "learning_rate": 8.6977856635912e-05, + "loss": 0.013019509613513947, + "step": 91780 + }, + { + "epoch": 13.029098651525905, + "grad_norm": 0.17727281153202057, + "learning_rate": 8.69764371894961e-05, + "loss": 0.008013653755187988, + "step": 91790 + }, + { + "epoch": 13.030518097941803, + "grad_norm": 0.04075111076235771, + "learning_rate": 8.69750177430802e-05, + "loss": 0.019158007204532625, + "step": 91800 + }, + { + "epoch": 13.031937544357701, + "grad_norm": 5.8046722412109375, + "learning_rate": 8.697359829666431e-05, + "loss": 0.03163691759109497, + "step": 91810 + }, + { + "epoch": 13.033356990773598, + "grad_norm": 6.866926193237305, + "learning_rate": 8.69721788502484e-05, + "loss": 0.016943201422691345, + "step": 91820 + }, + { + "epoch": 13.034776437189496, + "grad_norm": 1.013522744178772, + "learning_rate": 8.697075940383252e-05, + "loss": 0.019465875625610352, + "step": 91830 + }, + { + "epoch": 13.036195883605394, + "grad_norm": 4.783105373382568, + "learning_rate": 8.69693399574166e-05, + "loss": 0.026390090584754944, + "step": 91840 + }, + { + "epoch": 13.037615330021291, + "grad_norm": 
7.49722146987915, + "learning_rate": 8.696792051100071e-05, + "loss": 0.029819828271865845, + "step": 91850 + }, + { + "epoch": 13.03903477643719, + "grad_norm": 0.009413005784153938, + "learning_rate": 8.696650106458481e-05, + "loss": 0.013858243823051453, + "step": 91860 + }, + { + "epoch": 13.040454222853088, + "grad_norm": 9.378253936767578, + "learning_rate": 8.696508161816892e-05, + "loss": 0.02688908874988556, + "step": 91870 + }, + { + "epoch": 13.041873669268986, + "grad_norm": 1.6610782146453857, + "learning_rate": 8.696366217175302e-05, + "loss": 0.041138678789138794, + "step": 91880 + }, + { + "epoch": 13.043293115684882, + "grad_norm": 0.13649091124534607, + "learning_rate": 8.696224272533712e-05, + "loss": 0.009144684672355652, + "step": 91890 + }, + { + "epoch": 13.04471256210078, + "grad_norm": 0.15896634757518768, + "learning_rate": 8.696082327892123e-05, + "loss": 0.0353781670331955, + "step": 91900 + }, + { + "epoch": 13.046132008516679, + "grad_norm": 8.799978256225586, + "learning_rate": 8.695940383250532e-05, + "loss": 0.03496268391609192, + "step": 91910 + }, + { + "epoch": 13.047551454932576, + "grad_norm": 2.411184310913086, + "learning_rate": 8.695798438608943e-05, + "loss": 0.022661095857620238, + "step": 91920 + }, + { + "epoch": 13.048970901348474, + "grad_norm": 4.598940372467041, + "learning_rate": 8.695656493967353e-05, + "loss": 0.034996187686920165, + "step": 91930 + }, + { + "epoch": 13.050390347764372, + "grad_norm": 1.2868263721466064, + "learning_rate": 8.695514549325763e-05, + "loss": 0.06741084456443787, + "step": 91940 + }, + { + "epoch": 13.05180979418027, + "grad_norm": 5.6947832107543945, + "learning_rate": 8.695372604684173e-05, + "loss": 0.03682016432285309, + "step": 91950 + }, + { + "epoch": 13.053229240596167, + "grad_norm": 3.202509880065918, + "learning_rate": 8.695230660042584e-05, + "loss": 0.015489163994789123, + "step": 91960 + }, + { + "epoch": 13.054648687012065, + "grad_norm": 0.07919025421142578, + 
"learning_rate": 8.695088715400994e-05, + "loss": 0.0048958022147417065, + "step": 91970 + }, + { + "epoch": 13.056068133427964, + "grad_norm": 0.06812314689159393, + "learning_rate": 8.694946770759405e-05, + "loss": 0.02271898239850998, + "step": 91980 + }, + { + "epoch": 13.05748757984386, + "grad_norm": 1.773010015487671, + "learning_rate": 8.694804826117814e-05, + "loss": 0.024782709777355194, + "step": 91990 + }, + { + "epoch": 13.058907026259758, + "grad_norm": 0.07601115107536316, + "learning_rate": 8.694662881476224e-05, + "loss": 0.025223162770271302, + "step": 92000 + }, + { + "epoch": 13.058907026259758, + "eval_accuracy": 0.9814332040440008, + "eval_loss": 0.06536781042814255, + "eval_runtime": 31.0818, + "eval_samples_per_second": 505.988, + "eval_steps_per_second": 15.829, + "step": 92000 + }, + { + "epoch": 13.060326472675657, + "grad_norm": 0.6883729100227356, + "learning_rate": 8.694520936834635e-05, + "loss": 0.02039031982421875, + "step": 92010 + }, + { + "epoch": 13.061745919091555, + "grad_norm": 0.024349577724933624, + "learning_rate": 8.694378992193045e-05, + "loss": 0.016295403242111206, + "step": 92020 + }, + { + "epoch": 13.063165365507452, + "grad_norm": 0.008295398205518723, + "learning_rate": 8.694237047551456e-05, + "loss": 0.010328300297260284, + "step": 92030 + }, + { + "epoch": 13.06458481192335, + "grad_norm": 0.0963822677731514, + "learning_rate": 8.694095102909866e-05, + "loss": 0.03061448335647583, + "step": 92040 + }, + { + "epoch": 13.066004258339248, + "grad_norm": 5.504027843475342, + "learning_rate": 8.693953158268275e-05, + "loss": 0.10789980888366699, + "step": 92050 + }, + { + "epoch": 13.067423704755145, + "grad_norm": 7.110039710998535, + "learning_rate": 8.693811213626685e-05, + "loss": 0.03994630575180054, + "step": 92060 + }, + { + "epoch": 13.068843151171043, + "grad_norm": 0.028146283701062202, + "learning_rate": 8.693669268985096e-05, + "loss": 0.013843922317028046, + "step": 92070 + }, + { + "epoch": 
13.070262597586941, + "grad_norm": 0.307533860206604, + "learning_rate": 8.693527324343506e-05, + "loss": 0.010846273601055145, + "step": 92080 + }, + { + "epoch": 13.07168204400284, + "grad_norm": 0.11381033062934875, + "learning_rate": 8.693385379701917e-05, + "loss": 0.08111209273338318, + "step": 92090 + }, + { + "epoch": 13.073101490418736, + "grad_norm": 0.8268828988075256, + "learning_rate": 8.693243435060327e-05, + "loss": 0.0200673907995224, + "step": 92100 + }, + { + "epoch": 13.074520936834634, + "grad_norm": 0.2431095540523529, + "learning_rate": 8.693101490418737e-05, + "loss": 0.01655070036649704, + "step": 92110 + }, + { + "epoch": 13.075940383250533, + "grad_norm": 2.5469141006469727, + "learning_rate": 8.692959545777148e-05, + "loss": 0.014377766847610473, + "step": 92120 + }, + { + "epoch": 13.07735982966643, + "grad_norm": 5.483829975128174, + "learning_rate": 8.692817601135557e-05, + "loss": 0.05107632875442505, + "step": 92130 + }, + { + "epoch": 13.078779276082328, + "grad_norm": 1.2713391780853271, + "learning_rate": 8.692675656493969e-05, + "loss": 0.021682539582252504, + "step": 92140 + }, + { + "epoch": 13.080198722498226, + "grad_norm": 6.252452850341797, + "learning_rate": 8.692533711852377e-05, + "loss": 0.04136396050453186, + "step": 92150 + }, + { + "epoch": 13.081618168914124, + "grad_norm": 0.012155876494944096, + "learning_rate": 8.692391767210788e-05, + "loss": 0.029287290573120118, + "step": 92160 + }, + { + "epoch": 13.08303761533002, + "grad_norm": 0.014887611381709576, + "learning_rate": 8.692249822569198e-05, + "loss": 0.012705713510513306, + "step": 92170 + }, + { + "epoch": 13.084457061745919, + "grad_norm": 0.02529442124068737, + "learning_rate": 8.692107877927609e-05, + "loss": 0.020171231031417845, + "step": 92180 + }, + { + "epoch": 13.085876508161817, + "grad_norm": 1.0245634317398071, + "learning_rate": 8.69196593328602e-05, + "loss": 0.019703012704849244, + "step": 92190 + }, + { + "epoch": 13.087295954577714, + 
"grad_norm": 1.0482151508331299, + "learning_rate": 8.691823988644428e-05, + "loss": 0.05168004035949707, + "step": 92200 + }, + { + "epoch": 13.088715400993612, + "grad_norm": 0.7686527967453003, + "learning_rate": 8.69168204400284e-05, + "loss": 0.007350246608257294, + "step": 92210 + }, + { + "epoch": 13.09013484740951, + "grad_norm": 0.020817680284380913, + "learning_rate": 8.691540099361249e-05, + "loss": 0.038484251499176024, + "step": 92220 + }, + { + "epoch": 13.091554293825409, + "grad_norm": 11.199978828430176, + "learning_rate": 8.69139815471966e-05, + "loss": 0.04380442500114441, + "step": 92230 + }, + { + "epoch": 13.092973740241305, + "grad_norm": 0.44425156712532043, + "learning_rate": 8.69125621007807e-05, + "loss": 0.05576305985450745, + "step": 92240 + }, + { + "epoch": 13.094393186657204, + "grad_norm": 0.09422028064727783, + "learning_rate": 8.69111426543648e-05, + "loss": 0.01708393394947052, + "step": 92250 + }, + { + "epoch": 13.095812633073102, + "grad_norm": 0.2461843192577362, + "learning_rate": 8.69097232079489e-05, + "loss": 0.027886903285980223, + "step": 92260 + }, + { + "epoch": 13.097232079488998, + "grad_norm": 0.05524897575378418, + "learning_rate": 8.6908303761533e-05, + "loss": 0.01066754013299942, + "step": 92270 + }, + { + "epoch": 13.098651525904897, + "grad_norm": 0.015682321041822433, + "learning_rate": 8.690688431511712e-05, + "loss": 0.011989720910787583, + "step": 92280 + }, + { + "epoch": 13.100070972320795, + "grad_norm": 0.3898911774158478, + "learning_rate": 8.690546486870121e-05, + "loss": 0.0231645867228508, + "step": 92290 + }, + { + "epoch": 13.101490418736693, + "grad_norm": 0.054293807595968246, + "learning_rate": 8.690404542228532e-05, + "loss": 0.0053061418235301975, + "step": 92300 + }, + { + "epoch": 13.10290986515259, + "grad_norm": 0.1619403213262558, + "learning_rate": 8.690262597586941e-05, + "loss": 0.07225641012191772, + "step": 92310 + }, + { + "epoch": 13.104329311568488, + "grad_norm": 
0.12324342131614685, + "learning_rate": 8.690120652945352e-05, + "loss": 0.031182992458343505, + "step": 92320 + }, + { + "epoch": 13.105748757984387, + "grad_norm": 0.034161925315856934, + "learning_rate": 8.689978708303762e-05, + "loss": 0.03507125377655029, + "step": 92330 + }, + { + "epoch": 13.107168204400283, + "grad_norm": 0.6346161365509033, + "learning_rate": 8.689836763662173e-05, + "loss": 0.02153913825750351, + "step": 92340 + }, + { + "epoch": 13.108587650816181, + "grad_norm": 0.7318549156188965, + "learning_rate": 8.689694819020583e-05, + "loss": 0.014379370212554931, + "step": 92350 + }, + { + "epoch": 13.11000709723208, + "grad_norm": 2.773247241973877, + "learning_rate": 8.689552874378992e-05, + "loss": 0.03619228601455689, + "step": 92360 + }, + { + "epoch": 13.111426543647978, + "grad_norm": 4.637772083282471, + "learning_rate": 8.689410929737403e-05, + "loss": 0.038220956921577454, + "step": 92370 + }, + { + "epoch": 13.112845990063875, + "grad_norm": 0.9285563826560974, + "learning_rate": 8.689268985095813e-05, + "loss": 0.05401658415794373, + "step": 92380 + }, + { + "epoch": 13.114265436479773, + "grad_norm": 0.4850822389125824, + "learning_rate": 8.689127040454224e-05, + "loss": 0.030674123764038087, + "step": 92390 + }, + { + "epoch": 13.115684882895671, + "grad_norm": 0.06712858378887177, + "learning_rate": 8.688985095812634e-05, + "loss": 0.048642593622207644, + "step": 92400 + }, + { + "epoch": 13.117104329311568, + "grad_norm": 5.168540000915527, + "learning_rate": 8.688843151171044e-05, + "loss": 0.021758055686950682, + "step": 92410 + }, + { + "epoch": 13.118523775727466, + "grad_norm": 0.37047693133354187, + "learning_rate": 8.688701206529453e-05, + "loss": 0.03085559904575348, + "step": 92420 + }, + { + "epoch": 13.119943222143364, + "grad_norm": 0.7145349383354187, + "learning_rate": 8.688559261887864e-05, + "loss": 0.011856220662593842, + "step": 92430 + }, + { + "epoch": 13.121362668559263, + "grad_norm": 0.41536930203437805, + 
"learning_rate": 8.688417317246274e-05, + "loss": 0.11770002841949463, + "step": 92440 + }, + { + "epoch": 13.12278211497516, + "grad_norm": 1.966375708580017, + "learning_rate": 8.688275372604685e-05, + "loss": 0.010399091243743896, + "step": 92450 + }, + { + "epoch": 13.124201561391057, + "grad_norm": 1.3598144054412842, + "learning_rate": 8.688133427963095e-05, + "loss": 0.033311480283737184, + "step": 92460 + }, + { + "epoch": 13.125621007806956, + "grad_norm": 16.98819923400879, + "learning_rate": 8.687991483321505e-05, + "loss": 0.07230284214019775, + "step": 92470 + }, + { + "epoch": 13.127040454222852, + "grad_norm": 0.3322114646434784, + "learning_rate": 8.687849538679916e-05, + "loss": 0.03124167025089264, + "step": 92480 + }, + { + "epoch": 13.12845990063875, + "grad_norm": 0.3290840983390808, + "learning_rate": 8.687707594038326e-05, + "loss": 0.037420186400413516, + "step": 92490 + }, + { + "epoch": 13.129879347054649, + "grad_norm": 11.666303634643555, + "learning_rate": 8.687565649396737e-05, + "loss": 0.034588441252708435, + "step": 92500 + }, + { + "epoch": 13.129879347054649, + "eval_accuracy": 0.9861384879506581, + "eval_loss": 0.047509148716926575, + "eval_runtime": 31.0917, + "eval_samples_per_second": 505.826, + "eval_steps_per_second": 15.824, + "step": 92500 + }, + { + "epoch": 13.131298793470547, + "grad_norm": 0.64284348487854, + "learning_rate": 8.687423704755145e-05, + "loss": 0.023534731566905977, + "step": 92510 + }, + { + "epoch": 13.132718239886444, + "grad_norm": 1.2833255529403687, + "learning_rate": 8.687281760113556e-05, + "loss": 0.030319365859031677, + "step": 92520 + }, + { + "epoch": 13.134137686302342, + "grad_norm": 1.0851508378982544, + "learning_rate": 8.687139815471966e-05, + "loss": 0.01361302137374878, + "step": 92530 + }, + { + "epoch": 13.13555713271824, + "grad_norm": 0.20210212469100952, + "learning_rate": 8.686997870830377e-05, + "loss": 0.012545563280582428, + "step": 92540 + }, + { + "epoch": 13.136976579134137, 
+ "grad_norm": 0.15086351335048676, + "learning_rate": 8.686855926188787e-05, + "loss": 0.022943997383117677, + "step": 92550 + }, + { + "epoch": 13.138396025550035, + "grad_norm": 8.776838302612305, + "learning_rate": 8.686713981547196e-05, + "loss": 0.05501660704612732, + "step": 92560 + }, + { + "epoch": 13.139815471965933, + "grad_norm": 0.151241272687912, + "learning_rate": 8.686572036905608e-05, + "loss": 0.0084109365940094, + "step": 92570 + }, + { + "epoch": 13.141234918381832, + "grad_norm": 1.0454845428466797, + "learning_rate": 8.686430092264017e-05, + "loss": 0.013821640610694885, + "step": 92580 + }, + { + "epoch": 13.142654364797728, + "grad_norm": 1.373452067375183, + "learning_rate": 8.686288147622428e-05, + "loss": 0.023174571990966796, + "step": 92590 + }, + { + "epoch": 13.144073811213627, + "grad_norm": 0.09910205751657486, + "learning_rate": 8.686146202980838e-05, + "loss": 0.01848770081996918, + "step": 92600 + }, + { + "epoch": 13.145493257629525, + "grad_norm": 5.731260299682617, + "learning_rate": 8.686004258339248e-05, + "loss": 0.01092785894870758, + "step": 92610 + }, + { + "epoch": 13.146912704045421, + "grad_norm": 5.1644415855407715, + "learning_rate": 8.685862313697658e-05, + "loss": 0.034691983461380006, + "step": 92620 + }, + { + "epoch": 13.14833215046132, + "grad_norm": 3.380138397216797, + "learning_rate": 8.685720369056069e-05, + "loss": 0.013407303392887116, + "step": 92630 + }, + { + "epoch": 13.149751596877218, + "grad_norm": 1.7176895141601562, + "learning_rate": 8.685578424414478e-05, + "loss": 0.020493271946907043, + "step": 92640 + }, + { + "epoch": 13.151171043293116, + "grad_norm": 1.9893896579742432, + "learning_rate": 8.68543647977289e-05, + "loss": 0.018180091679096223, + "step": 92650 + }, + { + "epoch": 13.152590489709013, + "grad_norm": 2.009446859359741, + "learning_rate": 8.685294535131299e-05, + "loss": 0.03541998267173767, + "step": 92660 + }, + { + "epoch": 13.154009936124911, + "grad_norm": 
5.200847625732422, + "learning_rate": 8.685152590489709e-05, + "loss": 0.014680585265159607, + "step": 92670 + }, + { + "epoch": 13.15542938254081, + "grad_norm": 6.761829853057861, + "learning_rate": 8.68501064584812e-05, + "loss": 0.012983223795890808, + "step": 92680 + }, + { + "epoch": 13.156848828956706, + "grad_norm": 0.011524581350386143, + "learning_rate": 8.68486870120653e-05, + "loss": 0.008690010756254196, + "step": 92690 + }, + { + "epoch": 13.158268275372604, + "grad_norm": 0.10512061417102814, + "learning_rate": 8.684726756564941e-05, + "loss": 0.011312702298164367, + "step": 92700 + }, + { + "epoch": 13.159687721788503, + "grad_norm": 4.294369220733643, + "learning_rate": 8.68458481192335e-05, + "loss": 0.028549957275390624, + "step": 92710 + }, + { + "epoch": 13.161107168204401, + "grad_norm": 8.672225952148438, + "learning_rate": 8.68444286728176e-05, + "loss": 0.05349311828613281, + "step": 92720 + }, + { + "epoch": 13.162526614620297, + "grad_norm": 3.49619197845459, + "learning_rate": 8.68430092264017e-05, + "loss": 0.043832501769065856, + "step": 92730 + }, + { + "epoch": 13.163946061036196, + "grad_norm": 3.8927085399627686, + "learning_rate": 8.684158977998581e-05, + "loss": 0.024391159415245056, + "step": 92740 + }, + { + "epoch": 13.165365507452094, + "grad_norm": 0.07379254698753357, + "learning_rate": 8.684017033356991e-05, + "loss": 0.010248324275016785, + "step": 92750 + }, + { + "epoch": 13.16678495386799, + "grad_norm": 2.031379461288452, + "learning_rate": 8.683875088715402e-05, + "loss": 0.0023462004959583283, + "step": 92760 + }, + { + "epoch": 13.168204400283889, + "grad_norm": 0.2536148428916931, + "learning_rate": 8.683733144073812e-05, + "loss": 0.02251149117946625, + "step": 92770 + }, + { + "epoch": 13.169623846699787, + "grad_norm": 1.5524146556854248, + "learning_rate": 8.683591199432222e-05, + "loss": 0.013939085602760314, + "step": 92780 + }, + { + "epoch": 13.171043293115686, + "grad_norm": 0.209752157330513, + 
"learning_rate": 8.683449254790633e-05, + "loss": 0.03190666139125824, + "step": 92790 + }, + { + "epoch": 13.172462739531582, + "grad_norm": 0.20207785069942474, + "learning_rate": 8.683307310149042e-05, + "loss": 0.024697883427143096, + "step": 92800 + }, + { + "epoch": 13.17388218594748, + "grad_norm": 0.018222937360405922, + "learning_rate": 8.683165365507453e-05, + "loss": 0.020527932047843932, + "step": 92810 + }, + { + "epoch": 13.175301632363379, + "grad_norm": 0.008755608461797237, + "learning_rate": 8.683023420865862e-05, + "loss": 0.0073832511901855465, + "step": 92820 + }, + { + "epoch": 13.176721078779275, + "grad_norm": 4.745551109313965, + "learning_rate": 8.682881476224273e-05, + "loss": 0.012114915996789932, + "step": 92830 + }, + { + "epoch": 13.178140525195174, + "grad_norm": 0.12335074692964554, + "learning_rate": 8.682739531582683e-05, + "loss": 0.003640429675579071, + "step": 92840 + }, + { + "epoch": 13.179559971611072, + "grad_norm": 0.1021898165345192, + "learning_rate": 8.682597586941094e-05, + "loss": 0.0027994271367788315, + "step": 92850 + }, + { + "epoch": 13.18097941802697, + "grad_norm": 13.2909574508667, + "learning_rate": 8.682455642299504e-05, + "loss": 0.02914237380027771, + "step": 92860 + }, + { + "epoch": 13.182398864442867, + "grad_norm": 0.0916062444448471, + "learning_rate": 8.682313697657913e-05, + "loss": 0.026697584986686708, + "step": 92870 + }, + { + "epoch": 13.183818310858765, + "grad_norm": 0.274517297744751, + "learning_rate": 8.682171753016324e-05, + "loss": 0.008112631738185883, + "step": 92880 + }, + { + "epoch": 13.185237757274663, + "grad_norm": 1.534758448600769, + "learning_rate": 8.682029808374734e-05, + "loss": 0.04348610639572144, + "step": 92890 + }, + { + "epoch": 13.18665720369056, + "grad_norm": 0.2583373785018921, + "learning_rate": 8.681887863733145e-05, + "loss": 0.006182187795639038, + "step": 92900 + }, + { + "epoch": 13.188076650106458, + "grad_norm": 0.037416551262140274, + "learning_rate": 
8.681745919091555e-05, + "loss": 0.04729160368442535, + "step": 92910 + }, + { + "epoch": 13.189496096522356, + "grad_norm": 0.8491297960281372, + "learning_rate": 8.681603974449965e-05, + "loss": 0.005653556436300278, + "step": 92920 + }, + { + "epoch": 13.190915542938255, + "grad_norm": 0.0102075831964612, + "learning_rate": 8.681462029808374e-05, + "loss": 0.020110359787940978, + "step": 92930 + }, + { + "epoch": 13.192334989354151, + "grad_norm": 0.16059798002243042, + "learning_rate": 8.681320085166785e-05, + "loss": 0.013830339908599854, + "step": 92940 + }, + { + "epoch": 13.19375443577005, + "grad_norm": 5.683797836303711, + "learning_rate": 8.681192334989354e-05, + "loss": 0.046293017268180844, + "step": 92950 + }, + { + "epoch": 13.195173882185948, + "grad_norm": 10.07907772064209, + "learning_rate": 8.681050390347765e-05, + "loss": 0.028960409760475158, + "step": 92960 + }, + { + "epoch": 13.196593328601844, + "grad_norm": 8.026473045349121, + "learning_rate": 8.680908445706175e-05, + "loss": 0.024904248118400574, + "step": 92970 + }, + { + "epoch": 13.198012775017743, + "grad_norm": 1.5866436958312988, + "learning_rate": 8.680766501064586e-05, + "loss": 0.0035437196493148804, + "step": 92980 + }, + { + "epoch": 13.199432221433641, + "grad_norm": 0.333400160074234, + "learning_rate": 8.680624556422996e-05, + "loss": 0.0219614177942276, + "step": 92990 + }, + { + "epoch": 13.20085166784954, + "grad_norm": 2.6004791259765625, + "learning_rate": 8.680482611781405e-05, + "loss": 0.020504592359066008, + "step": 93000 + }, + { + "epoch": 13.20085166784954, + "eval_accuracy": 0.9883639600686717, + "eval_loss": 0.04106009751558304, + "eval_runtime": 31.3606, + "eval_samples_per_second": 501.489, + "eval_steps_per_second": 15.688, + "step": 93000 + }, + { + "epoch": 13.202271114265436, + "grad_norm": 3.023986577987671, + "learning_rate": 8.680340667139815e-05, + "loss": 0.02961946725845337, + "step": 93010 + }, + { + "epoch": 13.203690560681334, + "grad_norm": 
8.1243314743042, + "learning_rate": 8.680198722498226e-05, + "loss": 0.08623704910278321, + "step": 93020 + }, + { + "epoch": 13.205110007097232, + "grad_norm": 10.625955581665039, + "learning_rate": 8.680056777856637e-05, + "loss": 0.03337146937847137, + "step": 93030 + }, + { + "epoch": 13.206529453513129, + "grad_norm": 0.01585538126528263, + "learning_rate": 8.679914833215047e-05, + "loss": 0.04821697473526001, + "step": 93040 + }, + { + "epoch": 13.207948899929027, + "grad_norm": 7.857775688171387, + "learning_rate": 8.679772888573457e-05, + "loss": 0.024783408641815184, + "step": 93050 + }, + { + "epoch": 13.209368346344926, + "grad_norm": 1.573050618171692, + "learning_rate": 8.679630943931867e-05, + "loss": 0.012848149240016937, + "step": 93060 + }, + { + "epoch": 13.210787792760824, + "grad_norm": 8.90145206451416, + "learning_rate": 8.679488999290278e-05, + "loss": 0.04379624426364899, + "step": 93070 + }, + { + "epoch": 13.21220723917672, + "grad_norm": 4.176666259765625, + "learning_rate": 8.679347054648687e-05, + "loss": 0.022595225274562834, + "step": 93080 + }, + { + "epoch": 13.213626685592619, + "grad_norm": 3.151440143585205, + "learning_rate": 8.679205110007098e-05, + "loss": 0.013112765550613404, + "step": 93090 + }, + { + "epoch": 13.215046132008517, + "grad_norm": 0.012762556783854961, + "learning_rate": 8.679063165365507e-05, + "loss": 0.010376498848199845, + "step": 93100 + }, + { + "epoch": 13.216465578424414, + "grad_norm": 0.3497304618358612, + "learning_rate": 8.678921220723918e-05, + "loss": 0.022116436064243315, + "step": 93110 + }, + { + "epoch": 13.217885024840312, + "grad_norm": 8.522994995117188, + "learning_rate": 8.678779276082329e-05, + "loss": 0.0177663192152977, + "step": 93120 + }, + { + "epoch": 13.21930447125621, + "grad_norm": 13.146580696105957, + "learning_rate": 8.678637331440739e-05, + "loss": 0.014370155334472657, + "step": 93130 + }, + { + "epoch": 13.220723917672109, + "grad_norm": 0.05478169769048691, + 
"learning_rate": 8.67849538679915e-05, + "loss": 0.015843385457992555, + "step": 93140 + }, + { + "epoch": 13.222143364088005, + "grad_norm": 0.5907186269760132, + "learning_rate": 8.678353442157558e-05, + "loss": 0.014893335103988648, + "step": 93150 + }, + { + "epoch": 13.223562810503903, + "grad_norm": 0.2738039195537567, + "learning_rate": 8.678211497515969e-05, + "loss": 0.01633225381374359, + "step": 93160 + }, + { + "epoch": 13.224982256919802, + "grad_norm": 0.016736086457967758, + "learning_rate": 8.678069552874379e-05, + "loss": 0.008309248834848404, + "step": 93170 + }, + { + "epoch": 13.2264017033357, + "grad_norm": 0.06897296756505966, + "learning_rate": 8.67792760823279e-05, + "loss": 0.028543704748153688, + "step": 93180 + }, + { + "epoch": 13.227821149751597, + "grad_norm": 5.2853851318359375, + "learning_rate": 8.6777856635912e-05, + "loss": 0.015082527697086335, + "step": 93190 + }, + { + "epoch": 13.229240596167495, + "grad_norm": 15.268448829650879, + "learning_rate": 8.67764371894961e-05, + "loss": 0.01713217794895172, + "step": 93200 + }, + { + "epoch": 13.230660042583393, + "grad_norm": 1.7831941843032837, + "learning_rate": 8.677501774308021e-05, + "loss": 0.02639639675617218, + "step": 93210 + }, + { + "epoch": 13.23207948899929, + "grad_norm": 0.28602883219718933, + "learning_rate": 8.67735982966643e-05, + "loss": 0.031729042530059814, + "step": 93220 + }, + { + "epoch": 13.233498935415188, + "grad_norm": 0.7817912101745605, + "learning_rate": 8.677217885024842e-05, + "loss": 0.019844482839107513, + "step": 93230 + }, + { + "epoch": 13.234918381831086, + "grad_norm": 0.062219344079494476, + "learning_rate": 8.677075940383251e-05, + "loss": 0.04821741878986359, + "step": 93240 + }, + { + "epoch": 13.236337828246985, + "grad_norm": 0.8284728527069092, + "learning_rate": 8.676933995741661e-05, + "loss": 0.030682769417762757, + "step": 93250 + }, + { + "epoch": 13.237757274662881, + "grad_norm": 0.8677076101303101, + "learning_rate": 
8.676792051100071e-05, + "loss": 0.007623846083879471, + "step": 93260 + }, + { + "epoch": 13.23917672107878, + "grad_norm": 0.1573665589094162, + "learning_rate": 8.676650106458482e-05, + "loss": 0.04373520612716675, + "step": 93270 + }, + { + "epoch": 13.240596167494678, + "grad_norm": 0.17571642994880676, + "learning_rate": 8.676508161816892e-05, + "loss": 0.04299502968788147, + "step": 93280 + }, + { + "epoch": 13.242015613910574, + "grad_norm": 2.4545087814331055, + "learning_rate": 8.676366217175303e-05, + "loss": 0.03129469752311707, + "step": 93290 + }, + { + "epoch": 13.243435060326473, + "grad_norm": 0.0385487824678421, + "learning_rate": 8.676238466997871e-05, + "loss": 0.08912227153778077, + "step": 93300 + }, + { + "epoch": 13.24485450674237, + "grad_norm": 0.9710257649421692, + "learning_rate": 8.676096522356282e-05, + "loss": 0.020866674184799195, + "step": 93310 + }, + { + "epoch": 13.24627395315827, + "grad_norm": 9.935172080993652, + "learning_rate": 8.675954577714692e-05, + "loss": 0.013743373751640319, + "step": 93320 + }, + { + "epoch": 13.247693399574166, + "grad_norm": 0.5209399461746216, + "learning_rate": 8.675812633073102e-05, + "loss": 0.014395973086357117, + "step": 93330 + }, + { + "epoch": 13.249112845990064, + "grad_norm": 0.09892766177654266, + "learning_rate": 8.675670688431511e-05, + "loss": 0.027313077449798585, + "step": 93340 + }, + { + "epoch": 13.250532292405962, + "grad_norm": 0.27284589409828186, + "learning_rate": 8.675528743789923e-05, + "loss": 0.030682840943336488, + "step": 93350 + }, + { + "epoch": 13.251951738821859, + "grad_norm": 1.8673031330108643, + "learning_rate": 8.675386799148332e-05, + "loss": 0.02939329147338867, + "step": 93360 + }, + { + "epoch": 13.253371185237757, + "grad_norm": 0.022475466132164, + "learning_rate": 8.675244854506743e-05, + "loss": 0.02068745642900467, + "step": 93370 + }, + { + "epoch": 13.254790631653655, + "grad_norm": 0.3351653516292572, + "learning_rate": 8.675102909865153e-05, + 
"loss": 0.04050408899784088, + "step": 93380 + }, + { + "epoch": 13.256210078069554, + "grad_norm": 1.726203203201294, + "learning_rate": 8.674960965223563e-05, + "loss": 0.029091688990592956, + "step": 93390 + }, + { + "epoch": 13.25762952448545, + "grad_norm": 4.862390518188477, + "learning_rate": 8.674819020581974e-05, + "loss": 0.00784970447421074, + "step": 93400 + }, + { + "epoch": 13.259048970901349, + "grad_norm": 2.797797203063965, + "learning_rate": 8.674677075940384e-05, + "loss": 0.025647896528244018, + "step": 93410 + }, + { + "epoch": 13.260468417317247, + "grad_norm": 0.0536530502140522, + "learning_rate": 8.674535131298795e-05, + "loss": 0.026149827241897582, + "step": 93420 + }, + { + "epoch": 13.261887863733143, + "grad_norm": 0.2407999038696289, + "learning_rate": 8.674393186657203e-05, + "loss": 0.029965820908546447, + "step": 93430 + }, + { + "epoch": 13.263307310149042, + "grad_norm": 7.153860569000244, + "learning_rate": 8.674251242015614e-05, + "loss": 0.04421953558921814, + "step": 93440 + }, + { + "epoch": 13.26472675656494, + "grad_norm": 0.01831003837287426, + "learning_rate": 8.674109297374024e-05, + "loss": 0.007175080478191376, + "step": 93450 + }, + { + "epoch": 13.266146202980838, + "grad_norm": 0.4497462809085846, + "learning_rate": 8.673967352732435e-05, + "loss": 0.03221384286880493, + "step": 93460 + }, + { + "epoch": 13.267565649396735, + "grad_norm": 0.14525070786476135, + "learning_rate": 8.673825408090845e-05, + "loss": 0.015599586069583893, + "step": 93470 + }, + { + "epoch": 13.268985095812633, + "grad_norm": 0.11716824769973755, + "learning_rate": 8.673683463449255e-05, + "loss": 0.02416829764842987, + "step": 93480 + }, + { + "epoch": 13.270404542228531, + "grad_norm": 0.3827669322490692, + "learning_rate": 8.673541518807666e-05, + "loss": 0.019712349772453307, + "step": 93490 + }, + { + "epoch": 13.271823988644428, + "grad_norm": 0.3044072091579437, + "learning_rate": 8.673399574166075e-05, + "loss": 
0.01277415156364441, + "step": 93500 + }, + { + "epoch": 13.271823988644428, + "eval_accuracy": 0.9863292427036306, + "eval_loss": 0.04419074207544327, + "eval_runtime": 31.0429, + "eval_samples_per_second": 506.621, + "eval_steps_per_second": 15.849, + "step": 93500 + }, + { + "epoch": 13.273243435060326, + "grad_norm": 0.38797178864479065, + "learning_rate": 8.673257629524486e-05, + "loss": 0.03536054491996765, + "step": 93510 + }, + { + "epoch": 13.274662881476225, + "grad_norm": 0.00916266068816185, + "learning_rate": 8.673115684882896e-05, + "loss": 0.005046528205275536, + "step": 93520 + }, + { + "epoch": 13.276082327892123, + "grad_norm": 2.2260570526123047, + "learning_rate": 8.672973740241306e-05, + "loss": 0.06993077993392945, + "step": 93530 + }, + { + "epoch": 13.27750177430802, + "grad_norm": 0.5128903388977051, + "learning_rate": 8.672831795599716e-05, + "loss": 0.004808105900883675, + "step": 93540 + }, + { + "epoch": 13.278921220723918, + "grad_norm": 0.10632356256246567, + "learning_rate": 8.672689850958127e-05, + "loss": 0.027431467175483705, + "step": 93550 + }, + { + "epoch": 13.280340667139816, + "grad_norm": 0.857125461101532, + "learning_rate": 8.672547906316537e-05, + "loss": 0.0861474871635437, + "step": 93560 + }, + { + "epoch": 13.281760113555713, + "grad_norm": 0.12263695150613785, + "learning_rate": 8.672405961674948e-05, + "loss": 0.030454087257385253, + "step": 93570 + }, + { + "epoch": 13.283179559971611, + "grad_norm": 0.5237917900085449, + "learning_rate": 8.672264017033357e-05, + "loss": 0.004111305624246597, + "step": 93580 + }, + { + "epoch": 13.28459900638751, + "grad_norm": 0.19974131882190704, + "learning_rate": 8.672122072391767e-05, + "loss": 0.023677754402160644, + "step": 93590 + }, + { + "epoch": 13.286018452803408, + "grad_norm": 0.04327812418341637, + "learning_rate": 8.671980127750178e-05, + "loss": 0.00693313330411911, + "step": 93600 + }, + { + "epoch": 13.287437899219304, + "grad_norm": 0.00680148508399725, + 
"learning_rate": 8.671838183108588e-05, + "loss": 0.017087599635124205, + "step": 93610 + }, + { + "epoch": 13.288857345635202, + "grad_norm": 0.0410841628909111, + "learning_rate": 8.671696238466999e-05, + "loss": 0.010396718978881836, + "step": 93620 + }, + { + "epoch": 13.2902767920511, + "grad_norm": 1.3464971780776978, + "learning_rate": 8.671554293825407e-05, + "loss": 0.03353613018989563, + "step": 93630 + }, + { + "epoch": 13.291696238466997, + "grad_norm": 4.748608112335205, + "learning_rate": 8.671412349183819e-05, + "loss": 0.01375894695520401, + "step": 93640 + }, + { + "epoch": 13.293115684882896, + "grad_norm": 0.13896571099758148, + "learning_rate": 8.671270404542228e-05, + "loss": 0.030356216430664062, + "step": 93650 + }, + { + "epoch": 13.294535131298794, + "grad_norm": 0.026233388110995293, + "learning_rate": 8.67112845990064e-05, + "loss": 0.01820582151412964, + "step": 93660 + }, + { + "epoch": 13.295954577714692, + "grad_norm": 0.09100806713104248, + "learning_rate": 8.670986515259049e-05, + "loss": 0.022753316164016723, + "step": 93670 + }, + { + "epoch": 13.297374024130589, + "grad_norm": 0.013288362883031368, + "learning_rate": 8.67084457061746e-05, + "loss": 0.032885891199111936, + "step": 93680 + }, + { + "epoch": 13.298793470546487, + "grad_norm": 5.086958408355713, + "learning_rate": 8.67070262597587e-05, + "loss": 0.05920916795730591, + "step": 93690 + }, + { + "epoch": 13.300212916962385, + "grad_norm": 7.441182613372803, + "learning_rate": 8.67056068133428e-05, + "loss": 0.0234588697552681, + "step": 93700 + }, + { + "epoch": 13.301632363378282, + "grad_norm": 1.1889616250991821, + "learning_rate": 8.670418736692691e-05, + "loss": 0.01023445576429367, + "step": 93710 + }, + { + "epoch": 13.30305180979418, + "grad_norm": 0.1379549503326416, + "learning_rate": 8.6702767920511e-05, + "loss": 0.02517501711845398, + "step": 93720 + }, + { + "epoch": 13.304471256210078, + "grad_norm": 8.683314323425293, + "learning_rate": 
8.670134847409512e-05, + "loss": 0.04826049506664276, + "step": 93730 + }, + { + "epoch": 13.305890702625977, + "grad_norm": 12.325695037841797, + "learning_rate": 8.66999290276792e-05, + "loss": 0.01828770339488983, + "step": 93740 + }, + { + "epoch": 13.307310149041873, + "grad_norm": 0.6374432444572449, + "learning_rate": 8.669850958126331e-05, + "loss": 0.012936475872993469, + "step": 93750 + }, + { + "epoch": 13.308729595457772, + "grad_norm": 3.119178295135498, + "learning_rate": 8.669709013484741e-05, + "loss": 0.02451300173997879, + "step": 93760 + }, + { + "epoch": 13.31014904187367, + "grad_norm": 14.731306076049805, + "learning_rate": 8.669567068843152e-05, + "loss": 0.06916946768760682, + "step": 93770 + }, + { + "epoch": 13.311568488289566, + "grad_norm": 1.009304165840149, + "learning_rate": 8.669425124201562e-05, + "loss": 0.028861042857170106, + "step": 93780 + }, + { + "epoch": 13.312987934705465, + "grad_norm": 0.009057086892426014, + "learning_rate": 8.669283179559971e-05, + "loss": 0.035729244351387024, + "step": 93790 + }, + { + "epoch": 13.314407381121363, + "grad_norm": 4.11885929107666, + "learning_rate": 8.669141234918382e-05, + "loss": 0.013351409137248993, + "step": 93800 + }, + { + "epoch": 13.315826827537261, + "grad_norm": 11.137663841247559, + "learning_rate": 8.668999290276792e-05, + "loss": 0.0382326602935791, + "step": 93810 + }, + { + "epoch": 13.317246273953158, + "grad_norm": 0.0717635303735733, + "learning_rate": 8.668857345635203e-05, + "loss": 0.022524161636829375, + "step": 93820 + }, + { + "epoch": 13.318665720369056, + "grad_norm": 4.912062168121338, + "learning_rate": 8.668715400993613e-05, + "loss": 0.05251979231834412, + "step": 93830 + }, + { + "epoch": 13.320085166784954, + "grad_norm": 0.7680724859237671, + "learning_rate": 8.668573456352023e-05, + "loss": 0.012150004506111145, + "step": 93840 + }, + { + "epoch": 13.321504613200851, + "grad_norm": 0.008376783691346645, + "learning_rate": 8.668431511710432e-05, + 
"loss": 0.036633032560348514, + "step": 93850 + }, + { + "epoch": 13.32292405961675, + "grad_norm": 6.526537895202637, + "learning_rate": 8.668289567068844e-05, + "loss": 0.0329626202583313, + "step": 93860 + }, + { + "epoch": 13.324343506032648, + "grad_norm": 0.5995200276374817, + "learning_rate": 8.668147622427253e-05, + "loss": 0.02415698319673538, + "step": 93870 + }, + { + "epoch": 13.325762952448546, + "grad_norm": 0.16087555885314941, + "learning_rate": 8.668005677785664e-05, + "loss": 0.027556967735290528, + "step": 93880 + }, + { + "epoch": 13.327182398864442, + "grad_norm": 0.6079142093658447, + "learning_rate": 8.667863733144074e-05, + "loss": 0.03587366044521332, + "step": 93890 + }, + { + "epoch": 13.32860184528034, + "grad_norm": 0.10928544402122498, + "learning_rate": 8.667721788502484e-05, + "loss": 0.049797934293746945, + "step": 93900 + }, + { + "epoch": 13.330021291696239, + "grad_norm": 0.17567208409309387, + "learning_rate": 8.667579843860895e-05, + "loss": 0.021293030679225923, + "step": 93910 + }, + { + "epoch": 13.331440738112136, + "grad_norm": 0.13670562207698822, + "learning_rate": 8.667437899219305e-05, + "loss": 0.008856196701526643, + "step": 93920 + }, + { + "epoch": 13.332860184528034, + "grad_norm": 0.24669358134269714, + "learning_rate": 8.667295954577716e-05, + "loss": 0.02744440734386444, + "step": 93930 + }, + { + "epoch": 13.334279630943932, + "grad_norm": 0.15545465052127838, + "learning_rate": 8.667154009936124e-05, + "loss": 0.007605926692485809, + "step": 93940 + }, + { + "epoch": 13.33569907735983, + "grad_norm": 0.403153657913208, + "learning_rate": 8.667012065294535e-05, + "loss": 0.017333367466926576, + "step": 93950 + }, + { + "epoch": 13.337118523775727, + "grad_norm": 0.053572166711091995, + "learning_rate": 8.666870120652945e-05, + "loss": 0.020798361301422118, + "step": 93960 + }, + { + "epoch": 13.338537970191625, + "grad_norm": 2.455256938934326, + "learning_rate": 8.666728176011356e-05, + "loss": 
0.029119691252708434, + "step": 93970 + }, + { + "epoch": 13.339957416607524, + "grad_norm": 11.778632164001465, + "learning_rate": 8.666586231369767e-05, + "loss": 0.07675871849060059, + "step": 93980 + }, + { + "epoch": 13.34137686302342, + "grad_norm": 0.9911078214645386, + "learning_rate": 8.666444286728176e-05, + "loss": 0.005806304514408112, + "step": 93990 + }, + { + "epoch": 13.342796309439318, + "grad_norm": 0.21370607614517212, + "learning_rate": 8.666302342086587e-05, + "loss": 0.017598675191402437, + "step": 94000 + }, + { + "epoch": 13.342796309439318, + "eval_accuracy": 0.9859477331976855, + "eval_loss": 0.05328426882624626, + "eval_runtime": 31.1265, + "eval_samples_per_second": 505.261, + "eval_steps_per_second": 15.806, + "step": 94000 + }, + { + "epoch": 13.344215755855217, + "grad_norm": 0.42926645278930664, + "learning_rate": 8.666160397444996e-05, + "loss": 0.01731862425804138, + "step": 94010 + }, + { + "epoch": 13.345635202271115, + "grad_norm": 0.45466873049736023, + "learning_rate": 8.666018452803408e-05, + "loss": 0.014458924531936646, + "step": 94020 + }, + { + "epoch": 13.347054648687012, + "grad_norm": 0.20981672406196594, + "learning_rate": 8.665876508161817e-05, + "loss": 0.02389024496078491, + "step": 94030 + }, + { + "epoch": 13.34847409510291, + "grad_norm": 0.041170816868543625, + "learning_rate": 8.665734563520228e-05, + "loss": 0.005668449029326439, + "step": 94040 + }, + { + "epoch": 13.349893541518808, + "grad_norm": 0.0988631471991539, + "learning_rate": 8.665592618878637e-05, + "loss": 0.02667955160140991, + "step": 94050 + }, + { + "epoch": 13.351312987934705, + "grad_norm": 11.238661766052246, + "learning_rate": 8.665450674237048e-05, + "loss": 0.03834929466247559, + "step": 94060 + }, + { + "epoch": 13.352732434350603, + "grad_norm": 0.005214661359786987, + "learning_rate": 8.665308729595459e-05, + "loss": 0.023474456369876863, + "step": 94070 + }, + { + "epoch": 13.354151880766501, + "grad_norm": 0.0774008110165596, + 
"learning_rate": 8.665166784953869e-05, + "loss": 0.05068536996841431, + "step": 94080 + }, + { + "epoch": 13.3555713271824, + "grad_norm": 0.35609862208366394, + "learning_rate": 8.66502484031228e-05, + "loss": 0.02888200879096985, + "step": 94090 + }, + { + "epoch": 13.356990773598296, + "grad_norm": 0.4439794719219208, + "learning_rate": 8.664882895670688e-05, + "loss": 0.023132362961769105, + "step": 94100 + }, + { + "epoch": 13.358410220014195, + "grad_norm": 3.6220321655273438, + "learning_rate": 8.664740951029099e-05, + "loss": 0.04306410849094391, + "step": 94110 + }, + { + "epoch": 13.359829666430093, + "grad_norm": 1.7350175380706787, + "learning_rate": 8.664599006387509e-05, + "loss": 0.03134117722511291, + "step": 94120 + }, + { + "epoch": 13.36124911284599, + "grad_norm": 20.512210845947266, + "learning_rate": 8.66445706174592e-05, + "loss": 0.027336391806602477, + "step": 94130 + }, + { + "epoch": 13.362668559261888, + "grad_norm": 0.024507010355591774, + "learning_rate": 8.66431511710433e-05, + "loss": 0.023677150905132293, + "step": 94140 + }, + { + "epoch": 13.364088005677786, + "grad_norm": 0.743320643901825, + "learning_rate": 8.66417317246274e-05, + "loss": 0.02231661379337311, + "step": 94150 + }, + { + "epoch": 13.365507452093684, + "grad_norm": 2.4547150135040283, + "learning_rate": 8.66403122782115e-05, + "loss": 0.016991636157035826, + "step": 94160 + }, + { + "epoch": 13.36692689850958, + "grad_norm": 2.07804274559021, + "learning_rate": 8.66388928317956e-05, + "loss": 0.01103825718164444, + "step": 94170 + }, + { + "epoch": 13.36834634492548, + "grad_norm": 0.6872746348381042, + "learning_rate": 8.663747338537971e-05, + "loss": 0.01877520829439163, + "step": 94180 + }, + { + "epoch": 13.369765791341377, + "grad_norm": 0.15541785955429077, + "learning_rate": 8.663605393896381e-05, + "loss": 0.006646855175495148, + "step": 94190 + }, + { + "epoch": 13.371185237757274, + "grad_norm": 0.5737395286560059, + "learning_rate": 
8.663463449254791e-05, + "loss": 0.02360581010580063, + "step": 94200 + }, + { + "epoch": 13.372604684173172, + "grad_norm": 1.936894416809082, + "learning_rate": 8.6633215046132e-05, + "loss": 0.09669648408889771, + "step": 94210 + }, + { + "epoch": 13.37402413058907, + "grad_norm": 0.7344012260437012, + "learning_rate": 8.663179559971612e-05, + "loss": 0.00841248333454132, + "step": 94220 + }, + { + "epoch": 13.375443577004969, + "grad_norm": 1.448033094406128, + "learning_rate": 8.663037615330021e-05, + "loss": 0.016919110715389252, + "step": 94230 + }, + { + "epoch": 13.376863023420865, + "grad_norm": 0.5463376641273499, + "learning_rate": 8.662895670688433e-05, + "loss": 0.00848272666335106, + "step": 94240 + }, + { + "epoch": 13.378282469836764, + "grad_norm": 1.1138267517089844, + "learning_rate": 8.662753726046842e-05, + "loss": 0.019846946001052856, + "step": 94250 + }, + { + "epoch": 13.379701916252662, + "grad_norm": 3.6157217025756836, + "learning_rate": 8.662611781405252e-05, + "loss": 0.04727371633052826, + "step": 94260 + }, + { + "epoch": 13.381121362668559, + "grad_norm": 0.05200067535042763, + "learning_rate": 8.662469836763663e-05, + "loss": 0.05219040513038635, + "step": 94270 + }, + { + "epoch": 13.382540809084457, + "grad_norm": 0.8348428010940552, + "learning_rate": 8.662327892122073e-05, + "loss": 0.005376967415213585, + "step": 94280 + }, + { + "epoch": 13.383960255500355, + "grad_norm": 8.173595428466797, + "learning_rate": 8.662185947480484e-05, + "loss": 0.07077938914299012, + "step": 94290 + }, + { + "epoch": 13.385379701916253, + "grad_norm": 6.709331512451172, + "learning_rate": 8.662044002838892e-05, + "loss": 0.05493432879447937, + "step": 94300 + }, + { + "epoch": 13.38679914833215, + "grad_norm": 1.8870387077331543, + "learning_rate": 8.661902058197303e-05, + "loss": 0.044583475589752196, + "step": 94310 + }, + { + "epoch": 13.388218594748048, + "grad_norm": 1.3507314920425415, + "learning_rate": 8.661760113555713e-05, + "loss": 
0.015715819597244263, + "step": 94320 + }, + { + "epoch": 13.389638041163947, + "grad_norm": 0.0798988863825798, + "learning_rate": 8.661618168914124e-05, + "loss": 0.01828896105289459, + "step": 94330 + }, + { + "epoch": 13.391057487579843, + "grad_norm": 4.10715913772583, + "learning_rate": 8.661476224272534e-05, + "loss": 0.017719167470932006, + "step": 94340 + }, + { + "epoch": 13.392476933995741, + "grad_norm": 0.8093958497047424, + "learning_rate": 8.661334279630944e-05, + "loss": 0.04272227585315704, + "step": 94350 + }, + { + "epoch": 13.39389638041164, + "grad_norm": 0.11310017108917236, + "learning_rate": 8.661192334989355e-05, + "loss": 0.016726674139499666, + "step": 94360 + }, + { + "epoch": 13.395315826827538, + "grad_norm": 0.01135955099016428, + "learning_rate": 8.661050390347765e-05, + "loss": 0.013713881373405457, + "step": 94370 + }, + { + "epoch": 13.396735273243435, + "grad_norm": 1.1644693613052368, + "learning_rate": 8.660908445706176e-05, + "loss": 0.004590839147567749, + "step": 94380 + }, + { + "epoch": 13.398154719659333, + "grad_norm": 13.32076358795166, + "learning_rate": 8.660766501064585e-05, + "loss": 0.030149951577186584, + "step": 94390 + }, + { + "epoch": 13.399574166075231, + "grad_norm": 1.9010841846466064, + "learning_rate": 8.660624556422997e-05, + "loss": 0.03028929829597473, + "step": 94400 + }, + { + "epoch": 13.400993612491128, + "grad_norm": 15.902448654174805, + "learning_rate": 8.660482611781405e-05, + "loss": 0.07759106159210205, + "step": 94410 + }, + { + "epoch": 13.402413058907026, + "grad_norm": 4.793128490447998, + "learning_rate": 8.660340667139816e-05, + "loss": 0.05131710171699524, + "step": 94420 + }, + { + "epoch": 13.403832505322924, + "grad_norm": 2.036078929901123, + "learning_rate": 8.660198722498226e-05, + "loss": 0.004698502644896507, + "step": 94430 + }, + { + "epoch": 13.405251951738823, + "grad_norm": 0.16924896836280823, + "learning_rate": 8.660056777856637e-05, + "loss": 0.005888786166906357, + 
"step": 94440 + }, + { + "epoch": 13.40667139815472, + "grad_norm": 0.2803502380847931, + "learning_rate": 8.659914833215047e-05, + "loss": 0.02985817790031433, + "step": 94450 + }, + { + "epoch": 13.408090844570618, + "grad_norm": 10.484100341796875, + "learning_rate": 8.659772888573456e-05, + "loss": 0.0780340552330017, + "step": 94460 + }, + { + "epoch": 13.409510290986516, + "grad_norm": 0.08539588004350662, + "learning_rate": 8.659630943931867e-05, + "loss": 0.012192347645759582, + "step": 94470 + }, + { + "epoch": 13.410929737402412, + "grad_norm": 0.2572302222251892, + "learning_rate": 8.659488999290277e-05, + "loss": 0.008727478981018066, + "step": 94480 + }, + { + "epoch": 13.41234918381831, + "grad_norm": 0.186640664935112, + "learning_rate": 8.659347054648688e-05, + "loss": 0.058405518531799316, + "step": 94490 + }, + { + "epoch": 13.413768630234209, + "grad_norm": 0.7377075552940369, + "learning_rate": 8.659205110007098e-05, + "loss": 0.04591574370861053, + "step": 94500 + }, + { + "epoch": 13.413768630234209, + "eval_accuracy": 0.97990716602022, + "eval_loss": 0.06794610619544983, + "eval_runtime": 31.1242, + "eval_samples_per_second": 505.298, + "eval_steps_per_second": 15.808, + "step": 94500 + }, + { + "epoch": 13.415188076650107, + "grad_norm": 0.030627934262156487, + "learning_rate": 8.659063165365508e-05, + "loss": 0.022979114949703217, + "step": 94510 + }, + { + "epoch": 13.416607523066004, + "grad_norm": 3.4133455753326416, + "learning_rate": 8.658921220723917e-05, + "loss": 0.023091521859169007, + "step": 94520 + }, + { + "epoch": 13.418026969481902, + "grad_norm": 0.8833838701248169, + "learning_rate": 8.658779276082329e-05, + "loss": 0.015490736067295074, + "step": 94530 + }, + { + "epoch": 13.4194464158978, + "grad_norm": 0.6688404679298401, + "learning_rate": 8.658637331440738e-05, + "loss": 0.013762807846069336, + "step": 94540 + }, + { + "epoch": 13.420865862313697, + "grad_norm": 1.5333127975463867, + "learning_rate": 
8.65849538679915e-05, + "loss": 0.016186425089836122, + "step": 94550 + }, + { + "epoch": 13.422285308729595, + "grad_norm": 0.24415715038776398, + "learning_rate": 8.658353442157559e-05, + "loss": 0.014823892712593078, + "step": 94560 + }, + { + "epoch": 13.423704755145494, + "grad_norm": 7.537097930908203, + "learning_rate": 8.658211497515969e-05, + "loss": 0.06450334787368775, + "step": 94570 + }, + { + "epoch": 13.425124201561392, + "grad_norm": 0.8393383622169495, + "learning_rate": 8.65806955287438e-05, + "loss": 0.03933723270893097, + "step": 94580 + }, + { + "epoch": 13.426543647977288, + "grad_norm": 5.340954303741455, + "learning_rate": 8.65792760823279e-05, + "loss": 0.02385113388299942, + "step": 94590 + }, + { + "epoch": 13.427963094393187, + "grad_norm": 2.525688648223877, + "learning_rate": 8.657785663591201e-05, + "loss": 0.07299022674560547, + "step": 94600 + }, + { + "epoch": 13.429382540809085, + "grad_norm": 0.3474077582359314, + "learning_rate": 8.657643718949609e-05, + "loss": 0.03391514718532562, + "step": 94610 + }, + { + "epoch": 13.430801987224982, + "grad_norm": 0.13079656660556793, + "learning_rate": 8.65750177430802e-05, + "loss": 0.0268646240234375, + "step": 94620 + }, + { + "epoch": 13.43222143364088, + "grad_norm": 0.34679096937179565, + "learning_rate": 8.65735982966643e-05, + "loss": 0.021389296650886534, + "step": 94630 + }, + { + "epoch": 13.433640880056778, + "grad_norm": 0.4974942207336426, + "learning_rate": 8.657217885024841e-05, + "loss": 0.003970606997609138, + "step": 94640 + }, + { + "epoch": 13.435060326472676, + "grad_norm": 4.566431045532227, + "learning_rate": 8.657075940383251e-05, + "loss": 0.009872384369373322, + "step": 94650 + }, + { + "epoch": 13.436479772888573, + "grad_norm": 0.22306831181049347, + "learning_rate": 8.65693399574166e-05, + "loss": 0.01926995664834976, + "step": 94660 + }, + { + "epoch": 13.437899219304471, + "grad_norm": 0.18319228291511536, + "learning_rate": 8.656792051100072e-05, + "loss": 
0.005128199979662895, + "step": 94670 + }, + { + "epoch": 13.43931866572037, + "grad_norm": 1.5891705751419067, + "learning_rate": 8.656650106458481e-05, + "loss": 0.03338150680065155, + "step": 94680 + }, + { + "epoch": 13.440738112136266, + "grad_norm": 0.11429005116224289, + "learning_rate": 8.656508161816892e-05, + "loss": 0.0355482429265976, + "step": 94690 + }, + { + "epoch": 13.442157558552164, + "grad_norm": 0.0894903615117073, + "learning_rate": 8.656366217175302e-05, + "loss": 0.007524222135543823, + "step": 94700 + }, + { + "epoch": 13.443577004968063, + "grad_norm": 0.026772212237119675, + "learning_rate": 8.656224272533713e-05, + "loss": 0.024022915959358217, + "step": 94710 + }, + { + "epoch": 13.444996451383961, + "grad_norm": 3.8690943717956543, + "learning_rate": 8.656082327892122e-05, + "loss": 0.02425812780857086, + "step": 94720 + }, + { + "epoch": 13.446415897799858, + "grad_norm": 3.525021553039551, + "learning_rate": 8.655940383250533e-05, + "loss": 0.0785856544971466, + "step": 94730 + }, + { + "epoch": 13.447835344215756, + "grad_norm": 0.6416861414909363, + "learning_rate": 8.655798438608942e-05, + "loss": 0.018032850325107576, + "step": 94740 + }, + { + "epoch": 13.449254790631654, + "grad_norm": 0.18148157000541687, + "learning_rate": 8.655656493967354e-05, + "loss": 0.016194966435432435, + "step": 94750 + }, + { + "epoch": 13.45067423704755, + "grad_norm": 6.061134338378906, + "learning_rate": 8.655514549325763e-05, + "loss": 0.06400618553161622, + "step": 94760 + }, + { + "epoch": 13.452093683463449, + "grad_norm": 2.7746856212615967, + "learning_rate": 8.655372604684173e-05, + "loss": 0.017115673422813414, + "step": 94770 + }, + { + "epoch": 13.453513129879347, + "grad_norm": 0.9594934582710266, + "learning_rate": 8.655230660042584e-05, + "loss": 0.03562757968902588, + "step": 94780 + }, + { + "epoch": 13.454932576295246, + "grad_norm": 13.996777534484863, + "learning_rate": 8.655088715400994e-05, + "loss": 0.02590930163860321, + 
"step": 94790 + }, + { + "epoch": 13.456352022711142, + "grad_norm": 0.16600783169269562, + "learning_rate": 8.654946770759405e-05, + "loss": 0.018148021399974824, + "step": 94800 + }, + { + "epoch": 13.45777146912704, + "grad_norm": 1.3046753406524658, + "learning_rate": 8.654804826117815e-05, + "loss": 0.003467951714992523, + "step": 94810 + }, + { + "epoch": 13.459190915542939, + "grad_norm": 0.07563643157482147, + "learning_rate": 8.654662881476224e-05, + "loss": 0.016790592670440675, + "step": 94820 + }, + { + "epoch": 13.460610361958835, + "grad_norm": 0.4039790630340576, + "learning_rate": 8.654520936834634e-05, + "loss": 0.024634437263011934, + "step": 94830 + }, + { + "epoch": 13.462029808374734, + "grad_norm": 0.04403165355324745, + "learning_rate": 8.654378992193045e-05, + "loss": 0.02208777964115143, + "step": 94840 + }, + { + "epoch": 13.463449254790632, + "grad_norm": 4.115147113800049, + "learning_rate": 8.654237047551455e-05, + "loss": 0.008800669014453888, + "step": 94850 + }, + { + "epoch": 13.46486870120653, + "grad_norm": 0.6078055500984192, + "learning_rate": 8.654095102909866e-05, + "loss": 0.04899186193943024, + "step": 94860 + }, + { + "epoch": 13.466288147622427, + "grad_norm": 3.0943551063537598, + "learning_rate": 8.653953158268276e-05, + "loss": 0.04084070026874542, + "step": 94870 + }, + { + "epoch": 13.467707594038325, + "grad_norm": 0.12317100912332535, + "learning_rate": 8.653811213626686e-05, + "loss": 0.0065141826868057254, + "step": 94880 + }, + { + "epoch": 13.469127040454223, + "grad_norm": 1.6963871717453003, + "learning_rate": 8.653669268985097e-05, + "loss": 0.021698565781116487, + "step": 94890 + }, + { + "epoch": 13.47054648687012, + "grad_norm": 8.212491035461426, + "learning_rate": 8.653527324343506e-05, + "loss": 0.07611912488937378, + "step": 94900 + }, + { + "epoch": 13.471965933286018, + "grad_norm": 0.40329378843307495, + "learning_rate": 8.653385379701918e-05, + "loss": 0.020160472393035887, + "step": 94910 + }, + { 
+ "epoch": 13.473385379701917, + "grad_norm": 0.9991295337677002, + "learning_rate": 8.653243435060326e-05, + "loss": 0.0287177711725235, + "step": 94920 + }, + { + "epoch": 13.474804826117815, + "grad_norm": 8.761367797851562, + "learning_rate": 8.653101490418737e-05, + "loss": 0.05888093113899231, + "step": 94930 + }, + { + "epoch": 13.476224272533711, + "grad_norm": 0.6481591463088989, + "learning_rate": 8.652959545777147e-05, + "loss": 0.019050560891628265, + "step": 94940 + }, + { + "epoch": 13.47764371894961, + "grad_norm": 0.06678199023008347, + "learning_rate": 8.652817601135558e-05, + "loss": 0.052511191368103026, + "step": 94950 + }, + { + "epoch": 13.479063165365508, + "grad_norm": 0.018576741218566895, + "learning_rate": 8.652675656493968e-05, + "loss": 0.037995684146881106, + "step": 94960 + }, + { + "epoch": 13.480482611781405, + "grad_norm": 0.029905835166573524, + "learning_rate": 8.652533711852377e-05, + "loss": 0.03668951392173767, + "step": 94970 + }, + { + "epoch": 13.481902058197303, + "grad_norm": 7.639389991760254, + "learning_rate": 8.652391767210788e-05, + "loss": 0.08303702473640442, + "step": 94980 + }, + { + "epoch": 13.483321504613201, + "grad_norm": 0.044263552874326706, + "learning_rate": 8.652249822569198e-05, + "loss": 0.04615318775177002, + "step": 94990 + }, + { + "epoch": 13.4847409510291, + "grad_norm": 6.359818458557129, + "learning_rate": 8.652107877927609e-05, + "loss": 0.01875213086605072, + "step": 95000 + }, + { + "epoch": 13.4847409510291, + "eval_accuracy": 0.9851211292681376, + "eval_loss": 0.052980098873376846, + "eval_runtime": 32.4787, + "eval_samples_per_second": 484.225, + "eval_steps_per_second": 15.148, + "step": 95000 + }, + { + "epoch": 13.486160397444996, + "grad_norm": 4.879207134246826, + "learning_rate": 8.651965933286019e-05, + "loss": 0.05217199921607971, + "step": 95010 + }, + { + "epoch": 13.487579843860894, + "grad_norm": 1.1770670413970947, + "learning_rate": 8.651823988644429e-05, + "loss": 
0.012905190885066985, + "step": 95020 + }, + { + "epoch": 13.488999290276793, + "grad_norm": 0.13578958809375763, + "learning_rate": 8.651682044002838e-05, + "loss": 0.024967202544212343, + "step": 95030 + }, + { + "epoch": 13.490418736692689, + "grad_norm": 0.02280462346971035, + "learning_rate": 8.65154009936125e-05, + "loss": 0.00866934135556221, + "step": 95040 + }, + { + "epoch": 13.491838183108587, + "grad_norm": 1.2240324020385742, + "learning_rate": 8.651398154719659e-05, + "loss": 0.014538370072841644, + "step": 95050 + }, + { + "epoch": 13.493257629524486, + "grad_norm": 1.3486486673355103, + "learning_rate": 8.65125621007807e-05, + "loss": 0.035960334539413455, + "step": 95060 + }, + { + "epoch": 13.494677075940384, + "grad_norm": 0.38771259784698486, + "learning_rate": 8.65111426543648e-05, + "loss": 0.020747166872024537, + "step": 95070 + }, + { + "epoch": 13.49609652235628, + "grad_norm": 0.48206791281700134, + "learning_rate": 8.65097232079489e-05, + "loss": 0.037840792536735536, + "step": 95080 + }, + { + "epoch": 13.497515968772179, + "grad_norm": 8.286971092224121, + "learning_rate": 8.650830376153301e-05, + "loss": 0.0744866132736206, + "step": 95090 + }, + { + "epoch": 13.498935415188077, + "grad_norm": 4.623902797698975, + "learning_rate": 8.65068843151171e-05, + "loss": 0.027271512150764465, + "step": 95100 + }, + { + "epoch": 13.500354861603974, + "grad_norm": 0.2418900579214096, + "learning_rate": 8.650546486870122e-05, + "loss": 0.007887350022792816, + "step": 95110 + }, + { + "epoch": 13.501774308019872, + "grad_norm": 0.0539235882461071, + "learning_rate": 8.650404542228531e-05, + "loss": 0.011406297981739043, + "step": 95120 + }, + { + "epoch": 13.50319375443577, + "grad_norm": 0.07959360629320145, + "learning_rate": 8.650262597586941e-05, + "loss": 0.007848554849624633, + "step": 95130 + }, + { + "epoch": 13.504613200851669, + "grad_norm": 0.01729063130915165, + "learning_rate": 8.650120652945351e-05, + "loss": 0.028363698720932008, + 
"step": 95140 + }, + { + "epoch": 13.506032647267565, + "grad_norm": 1.3998548984527588, + "learning_rate": 8.649978708303762e-05, + "loss": 0.008569182455539703, + "step": 95150 + }, + { + "epoch": 13.507452093683463, + "grad_norm": 0.3003092408180237, + "learning_rate": 8.649836763662172e-05, + "loss": 0.02904551327228546, + "step": 95160 + }, + { + "epoch": 13.508871540099362, + "grad_norm": 7.425189971923828, + "learning_rate": 8.649694819020583e-05, + "loss": 0.018622586131095888, + "step": 95170 + }, + { + "epoch": 13.510290986515258, + "grad_norm": 0.015005892142653465, + "learning_rate": 8.649552874378993e-05, + "loss": 0.006870198249816895, + "step": 95180 + }, + { + "epoch": 13.511710432931157, + "grad_norm": 0.02674536220729351, + "learning_rate": 8.649410929737402e-05, + "loss": 0.0023563139140605925, + "step": 95190 + }, + { + "epoch": 13.513129879347055, + "grad_norm": 0.13404254615306854, + "learning_rate": 8.649268985095813e-05, + "loss": 0.009416007995605468, + "step": 95200 + }, + { + "epoch": 13.514549325762953, + "grad_norm": 0.2148085981607437, + "learning_rate": 8.649127040454223e-05, + "loss": 0.019373995065689088, + "step": 95210 + }, + { + "epoch": 13.51596877217885, + "grad_norm": 0.11902511119842529, + "learning_rate": 8.648985095812634e-05, + "loss": 0.02425927072763443, + "step": 95220 + }, + { + "epoch": 13.517388218594748, + "grad_norm": 0.9130202531814575, + "learning_rate": 8.648843151171043e-05, + "loss": 0.0049303267151117325, + "step": 95230 + }, + { + "epoch": 13.518807665010646, + "grad_norm": 0.2348577082157135, + "learning_rate": 8.648701206529454e-05, + "loss": 0.03883711695671081, + "step": 95240 + }, + { + "epoch": 13.520227111426543, + "grad_norm": 0.02371363900601864, + "learning_rate": 8.648559261887863e-05, + "loss": 0.049692931771278384, + "step": 95250 + }, + { + "epoch": 13.521646557842441, + "grad_norm": 1.7642638683319092, + "learning_rate": 8.648417317246275e-05, + "loss": 0.026677578687667847, + "step": 95260 + 
}, + { + "epoch": 13.52306600425834, + "grad_norm": 0.6842426657676697, + "learning_rate": 8.648275372604686e-05, + "loss": 0.0024492625147104264, + "step": 95270 + }, + { + "epoch": 13.524485450674238, + "grad_norm": 1.9417275190353394, + "learning_rate": 8.648133427963094e-05, + "loss": 0.005500277504324913, + "step": 95280 + }, + { + "epoch": 13.525904897090134, + "grad_norm": 0.02574915811419487, + "learning_rate": 8.647991483321505e-05, + "loss": 0.006903509795665741, + "step": 95290 + }, + { + "epoch": 13.527324343506033, + "grad_norm": 0.14615212380886078, + "learning_rate": 8.647849538679915e-05, + "loss": 0.014113113284111023, + "step": 95300 + }, + { + "epoch": 13.528743789921931, + "grad_norm": 0.8143966197967529, + "learning_rate": 8.647707594038326e-05, + "loss": 0.020416779816150664, + "step": 95310 + }, + { + "epoch": 13.530163236337827, + "grad_norm": 0.25222253799438477, + "learning_rate": 8.647565649396736e-05, + "loss": 0.08081262707710266, + "step": 95320 + }, + { + "epoch": 13.531582682753726, + "grad_norm": 2.65850830078125, + "learning_rate": 8.647423704755145e-05, + "loss": 0.009225034713745117, + "step": 95330 + }, + { + "epoch": 13.533002129169624, + "grad_norm": 0.4264056384563446, + "learning_rate": 8.647281760113555e-05, + "loss": 0.018319667875766756, + "step": 95340 + }, + { + "epoch": 13.534421575585522, + "grad_norm": 0.144210085272789, + "learning_rate": 8.647139815471966e-05, + "loss": 0.06945294141769409, + "step": 95350 + }, + { + "epoch": 13.535841022001419, + "grad_norm": 0.4107845723628998, + "learning_rate": 8.647012065294535e-05, + "loss": 0.038562634587287904, + "step": 95360 + }, + { + "epoch": 13.537260468417317, + "grad_norm": 6.5514912605285645, + "learning_rate": 8.646870120652946e-05, + "loss": 0.01930883377790451, + "step": 95370 + }, + { + "epoch": 13.538679914833216, + "grad_norm": 0.014782809652388096, + "learning_rate": 8.646728176011356e-05, + "loss": 0.06311725378036499, + "step": 95380 + }, + { + "epoch": 
13.540099361249112, + "grad_norm": 0.554907500743866, + "learning_rate": 8.646586231369767e-05, + "loss": 0.04180940389633179, + "step": 95390 + }, + { + "epoch": 13.54151880766501, + "grad_norm": 1.3440015316009521, + "learning_rate": 8.646444286728176e-05, + "loss": 0.04394156336784363, + "step": 95400 + }, + { + "epoch": 13.542938254080909, + "grad_norm": 10.791940689086914, + "learning_rate": 8.646302342086586e-05, + "loss": 0.05378941297531128, + "step": 95410 + }, + { + "epoch": 13.544357700496807, + "grad_norm": 0.39911729097366333, + "learning_rate": 8.646160397444997e-05, + "loss": 0.0075015932321548465, + "step": 95420 + }, + { + "epoch": 13.545777146912704, + "grad_norm": 0.045785821974277496, + "learning_rate": 8.646018452803407e-05, + "loss": 0.029005283117294313, + "step": 95430 + }, + { + "epoch": 13.547196593328602, + "grad_norm": 0.010025689378380775, + "learning_rate": 8.645876508161818e-05, + "loss": 0.026248654723167418, + "step": 95440 + }, + { + "epoch": 13.5486160397445, + "grad_norm": 2.720860004425049, + "learning_rate": 8.645734563520228e-05, + "loss": 0.004437939077615738, + "step": 95450 + }, + { + "epoch": 13.550035486160397, + "grad_norm": 4.066869735717773, + "learning_rate": 8.645592618878638e-05, + "loss": 0.01580238789319992, + "step": 95460 + }, + { + "epoch": 13.551454932576295, + "grad_norm": 0.23954829573631287, + "learning_rate": 8.645450674237047e-05, + "loss": 0.011441273987293244, + "step": 95470 + }, + { + "epoch": 13.552874378992193, + "grad_norm": 12.648468017578125, + "learning_rate": 8.645308729595458e-05, + "loss": 0.024931295216083525, + "step": 95480 + }, + { + "epoch": 13.554293825408092, + "grad_norm": 0.7105810046195984, + "learning_rate": 8.645166784953868e-05, + "loss": 0.029153740406036376, + "step": 95490 + }, + { + "epoch": 13.555713271823988, + "grad_norm": 0.045782580971717834, + "learning_rate": 8.645024840312279e-05, + "loss": 0.01563691794872284, + "step": 95500 + }, + { + "epoch": 13.555713271823988, + 
"eval_accuracy": 0.9883003751510142, + "eval_loss": 0.04102558270096779, + "eval_runtime": 31.3169, + "eval_samples_per_second": 502.189, + "eval_steps_per_second": 15.71, + "step": 95500 + }, + { + "epoch": 13.557132718239886, + "grad_norm": 1.4033604860305786, + "learning_rate": 8.644882895670689e-05, + "loss": 0.03285573124885559, + "step": 95510 + }, + { + "epoch": 13.558552164655785, + "grad_norm": 1.4092446565628052, + "learning_rate": 8.644740951029099e-05, + "loss": 0.005036625638604164, + "step": 95520 + }, + { + "epoch": 13.559971611071681, + "grad_norm": 0.03376542776823044, + "learning_rate": 8.64459900638751e-05, + "loss": 0.021902407705783843, + "step": 95530 + }, + { + "epoch": 13.56139105748758, + "grad_norm": 0.022708173841238022, + "learning_rate": 8.64445706174592e-05, + "loss": 0.022810864448547363, + "step": 95540 + }, + { + "epoch": 13.562810503903478, + "grad_norm": 1.8710218667984009, + "learning_rate": 8.64431511710433e-05, + "loss": 0.029252752661705017, + "step": 95550 + }, + { + "epoch": 13.564229950319376, + "grad_norm": 0.5325424075126648, + "learning_rate": 8.644173172462739e-05, + "loss": 0.04581056237220764, + "step": 95560 + }, + { + "epoch": 13.565649396735273, + "grad_norm": 12.632488250732422, + "learning_rate": 8.64403122782115e-05, + "loss": 0.04354757368564606, + "step": 95570 + }, + { + "epoch": 13.567068843151171, + "grad_norm": 1.2110655307769775, + "learning_rate": 8.64388928317956e-05, + "loss": 0.021378204226493835, + "step": 95580 + }, + { + "epoch": 13.56848828956707, + "grad_norm": 0.4826580882072449, + "learning_rate": 8.643747338537971e-05, + "loss": 0.019974629580974578, + "step": 95590 + }, + { + "epoch": 13.569907735982966, + "grad_norm": 0.2284783273935318, + "learning_rate": 8.643605393896381e-05, + "loss": 0.04820126593112946, + "step": 95600 + }, + { + "epoch": 13.571327182398864, + "grad_norm": 1.714429497718811, + "learning_rate": 8.64346344925479e-05, + "loss": 0.0038776598870754243, + "step": 95610 + }, 
+ { + "epoch": 13.572746628814762, + "grad_norm": 5.034719944000244, + "learning_rate": 8.643321504613202e-05, + "loss": 0.03729407787322998, + "step": 95620 + }, + { + "epoch": 13.57416607523066, + "grad_norm": 0.28962454199790955, + "learning_rate": 8.643179559971611e-05, + "loss": 0.01688399910926819, + "step": 95630 + }, + { + "epoch": 13.575585521646557, + "grad_norm": 0.36011746525764465, + "learning_rate": 8.643037615330022e-05, + "loss": 0.038062408566474915, + "step": 95640 + }, + { + "epoch": 13.577004968062456, + "grad_norm": 1.5610812902450562, + "learning_rate": 8.642895670688432e-05, + "loss": 0.06204630136489868, + "step": 95650 + }, + { + "epoch": 13.578424414478354, + "grad_norm": 0.0474555566906929, + "learning_rate": 8.642753726046842e-05, + "loss": 0.002244003117084503, + "step": 95660 + }, + { + "epoch": 13.57984386089425, + "grad_norm": 0.05775681510567665, + "learning_rate": 8.642611781405252e-05, + "loss": 0.023053470253944396, + "step": 95670 + }, + { + "epoch": 13.581263307310149, + "grad_norm": 0.07157592475414276, + "learning_rate": 8.642469836763663e-05, + "loss": 0.02459646463394165, + "step": 95680 + }, + { + "epoch": 13.582682753726047, + "grad_norm": 0.05639144778251648, + "learning_rate": 8.642327892122072e-05, + "loss": 0.009374721348285675, + "step": 95690 + }, + { + "epoch": 13.584102200141945, + "grad_norm": 0.32128065824508667, + "learning_rate": 8.642185947480483e-05, + "loss": 0.010814450681209564, + "step": 95700 + }, + { + "epoch": 13.585521646557842, + "grad_norm": 2.1439898014068604, + "learning_rate": 8.642044002838893e-05, + "loss": 0.03374948799610138, + "step": 95710 + }, + { + "epoch": 13.58694109297374, + "grad_norm": 0.643042266368866, + "learning_rate": 8.641902058197303e-05, + "loss": 0.017416924238204956, + "step": 95720 + }, + { + "epoch": 13.588360539389639, + "grad_norm": 0.3461391031742096, + "learning_rate": 8.641760113555714e-05, + "loss": 0.03184410333633423, + "step": 95730 + }, + { + "epoch": 
13.589779985805535, + "grad_norm": 0.12013855576515198, + "learning_rate": 8.641618168914124e-05, + "loss": 0.029846155643463136, + "step": 95740 + }, + { + "epoch": 13.591199432221433, + "grad_norm": 2.5400469303131104, + "learning_rate": 8.641476224272535e-05, + "loss": 0.07584856748580933, + "step": 95750 + }, + { + "epoch": 13.592618878637332, + "grad_norm": 2.2745046615600586, + "learning_rate": 8.641334279630945e-05, + "loss": 0.014420546591281891, + "step": 95760 + }, + { + "epoch": 13.59403832505323, + "grad_norm": 0.3573761284351349, + "learning_rate": 8.641192334989354e-05, + "loss": 0.04467960298061371, + "step": 95770 + }, + { + "epoch": 13.595457771469126, + "grad_norm": 2.67861270904541, + "learning_rate": 8.641050390347764e-05, + "loss": 0.018556292355060577, + "step": 95780 + }, + { + "epoch": 13.596877217885025, + "grad_norm": 0.11460601538419724, + "learning_rate": 8.640908445706175e-05, + "loss": 0.01947108209133148, + "step": 95790 + }, + { + "epoch": 13.598296664300923, + "grad_norm": 2.6254916191101074, + "learning_rate": 8.640766501064585e-05, + "loss": 0.09533782601356507, + "step": 95800 + }, + { + "epoch": 13.59971611071682, + "grad_norm": 0.07714466750621796, + "learning_rate": 8.640624556422996e-05, + "loss": 0.011156822741031646, + "step": 95810 + }, + { + "epoch": 13.601135557132718, + "grad_norm": 0.8347751498222351, + "learning_rate": 8.640482611781406e-05, + "loss": 0.01339704990386963, + "step": 95820 + }, + { + "epoch": 13.602555003548616, + "grad_norm": 0.10340499132871628, + "learning_rate": 8.640340667139815e-05, + "loss": 0.03355931043624878, + "step": 95830 + }, + { + "epoch": 13.603974449964515, + "grad_norm": 1.814827561378479, + "learning_rate": 8.640198722498227e-05, + "loss": 0.018925678730010987, + "step": 95840 + }, + { + "epoch": 13.605393896380411, + "grad_norm": 0.0566093884408474, + "learning_rate": 8.640056777856636e-05, + "loss": 0.023309621214866637, + "step": 95850 + }, + { + "epoch": 13.60681334279631, + 
"grad_norm": 10.556805610656738, + "learning_rate": 8.639914833215047e-05, + "loss": 0.03882618546485901, + "step": 95860 + }, + { + "epoch": 13.608232789212208, + "grad_norm": 0.1373138278722763, + "learning_rate": 8.639772888573456e-05, + "loss": 0.030927222967147828, + "step": 95870 + }, + { + "epoch": 13.609652235628104, + "grad_norm": 0.23542313277721405, + "learning_rate": 8.639630943931867e-05, + "loss": 0.008216166496276855, + "step": 95880 + }, + { + "epoch": 13.611071682044003, + "grad_norm": 0.5358021855354309, + "learning_rate": 8.639488999290277e-05, + "loss": 0.026577457785606384, + "step": 95890 + }, + { + "epoch": 13.6124911284599, + "grad_norm": 0.3513150215148926, + "learning_rate": 8.639347054648688e-05, + "loss": 0.03022733926773071, + "step": 95900 + }, + { + "epoch": 13.6139105748758, + "grad_norm": 3.255770683288574, + "learning_rate": 8.639205110007097e-05, + "loss": 0.04010620713233948, + "step": 95910 + }, + { + "epoch": 13.615330021291696, + "grad_norm": 0.07835977524518967, + "learning_rate": 8.639063165365507e-05, + "loss": 0.0524819552898407, + "step": 95920 + }, + { + "epoch": 13.616749467707594, + "grad_norm": 7.067140579223633, + "learning_rate": 8.638921220723918e-05, + "loss": 0.007290441542863846, + "step": 95930 + }, + { + "epoch": 13.618168914123492, + "grad_norm": 0.5542891621589661, + "learning_rate": 8.638779276082328e-05, + "loss": 0.034179630875587466, + "step": 95940 + }, + { + "epoch": 13.619588360539389, + "grad_norm": 0.25055670738220215, + "learning_rate": 8.638637331440739e-05, + "loss": 0.016138841211795808, + "step": 95950 + }, + { + "epoch": 13.621007806955287, + "grad_norm": 1.4587302207946777, + "learning_rate": 8.638495386799149e-05, + "loss": 0.06437674760818482, + "step": 95960 + }, + { + "epoch": 13.622427253371185, + "grad_norm": 0.1022627130150795, + "learning_rate": 8.638353442157559e-05, + "loss": 0.019828467071056365, + "step": 95970 + }, + { + "epoch": 13.623846699787084, + "grad_norm": 
4.383076190948486, + "learning_rate": 8.638211497515968e-05, + "loss": 0.045711484551429746, + "step": 95980 + }, + { + "epoch": 13.62526614620298, + "grad_norm": 0.9741289019584656, + "learning_rate": 8.63806955287438e-05, + "loss": 0.044488522410392764, + "step": 95990 + }, + { + "epoch": 13.626685592618879, + "grad_norm": 0.208095520734787, + "learning_rate": 8.637927608232789e-05, + "loss": 0.04469639658927917, + "step": 96000 + }, + { + "epoch": 13.626685592618879, + "eval_accuracy": 0.9734215044191518, + "eval_loss": 0.10497359931468964, + "eval_runtime": 30.8281, + "eval_samples_per_second": 510.151, + "eval_steps_per_second": 15.959, + "step": 96000 + }, + { + "epoch": 13.628105039034777, + "grad_norm": 6.797325611114502, + "learning_rate": 8.6377856635912e-05, + "loss": 0.036408495903015134, + "step": 96010 + }, + { + "epoch": 13.629524485450673, + "grad_norm": 1.130789875984192, + "learning_rate": 8.63764371894961e-05, + "loss": 0.017700037360191344, + "step": 96020 + }, + { + "epoch": 13.630943931866572, + "grad_norm": 3.6554105281829834, + "learning_rate": 8.63750177430802e-05, + "loss": 0.038382101058959964, + "step": 96030 + }, + { + "epoch": 13.63236337828247, + "grad_norm": 3.8934245109558105, + "learning_rate": 8.637359829666431e-05, + "loss": 0.07922664880752564, + "step": 96040 + }, + { + "epoch": 13.633782824698368, + "grad_norm": 0.1729537695646286, + "learning_rate": 8.63721788502484e-05, + "loss": 0.03476465344429016, + "step": 96050 + }, + { + "epoch": 13.635202271114265, + "grad_norm": 8.660470008850098, + "learning_rate": 8.637075940383252e-05, + "loss": 0.04056967496871948, + "step": 96060 + }, + { + "epoch": 13.636621717530163, + "grad_norm": 1.6571571826934814, + "learning_rate": 8.63693399574166e-05, + "loss": 0.01973864436149597, + "step": 96070 + }, + { + "epoch": 13.638041163946061, + "grad_norm": 0.2835308015346527, + "learning_rate": 8.636792051100071e-05, + "loss": 0.015158508718013764, + "step": 96080 + }, + { + "epoch": 
13.639460610361958, + "grad_norm": 0.2651826739311218, + "learning_rate": 8.636650106458481e-05, + "loss": 0.006721274554729461, + "step": 96090 + }, + { + "epoch": 13.640880056777856, + "grad_norm": 0.014933490194380283, + "learning_rate": 8.636508161816892e-05, + "loss": 0.027929207682609557, + "step": 96100 + }, + { + "epoch": 13.642299503193755, + "grad_norm": 0.16669167578220367, + "learning_rate": 8.636366217175302e-05, + "loss": 0.016313910484313965, + "step": 96110 + }, + { + "epoch": 13.643718949609653, + "grad_norm": 2.4867875576019287, + "learning_rate": 8.636224272533713e-05, + "loss": 0.02824159860610962, + "step": 96120 + }, + { + "epoch": 13.64513839602555, + "grad_norm": 0.00703821936622262, + "learning_rate": 8.636082327892123e-05, + "loss": 0.01784539073705673, + "step": 96130 + }, + { + "epoch": 13.646557842441448, + "grad_norm": 11.87582015991211, + "learning_rate": 8.635940383250532e-05, + "loss": 0.02379693239927292, + "step": 96140 + }, + { + "epoch": 13.647977288857346, + "grad_norm": 0.019314516335725784, + "learning_rate": 8.635798438608943e-05, + "loss": 0.012368345260620117, + "step": 96150 + }, + { + "epoch": 13.649396735273243, + "grad_norm": 5.645758152008057, + "learning_rate": 8.635656493967353e-05, + "loss": 0.03507188558578491, + "step": 96160 + }, + { + "epoch": 13.650816181689141, + "grad_norm": 5.865706443786621, + "learning_rate": 8.635514549325764e-05, + "loss": 0.057054382562637326, + "step": 96170 + }, + { + "epoch": 13.65223562810504, + "grad_norm": 0.0784079059958458, + "learning_rate": 8.635372604684173e-05, + "loss": 0.006159195676445961, + "step": 96180 + }, + { + "epoch": 13.653655074520938, + "grad_norm": 0.8433167338371277, + "learning_rate": 8.635230660042584e-05, + "loss": 0.011216971278190612, + "step": 96190 + }, + { + "epoch": 13.655074520936834, + "grad_norm": 0.18653298914432526, + "learning_rate": 8.635088715400993e-05, + "loss": 0.03910190463066101, + "step": 96200 + }, + { + "epoch": 13.656493967352732, + 
"grad_norm": 0.7993428111076355, + "learning_rate": 8.634946770759404e-05, + "loss": 0.004709036275744438, + "step": 96210 + }, + { + "epoch": 13.65791341376863, + "grad_norm": 0.08979519456624985, + "learning_rate": 8.634804826117816e-05, + "loss": 0.028900161385536194, + "step": 96220 + }, + { + "epoch": 13.659332860184527, + "grad_norm": 0.010041122324764729, + "learning_rate": 8.634662881476224e-05, + "loss": 0.02693893015384674, + "step": 96230 + }, + { + "epoch": 13.660752306600425, + "grad_norm": 0.7909981608390808, + "learning_rate": 8.634520936834635e-05, + "loss": 0.008231808245182038, + "step": 96240 + }, + { + "epoch": 13.662171753016324, + "grad_norm": 8.569683074951172, + "learning_rate": 8.634378992193045e-05, + "loss": 0.027000784873962402, + "step": 96250 + }, + { + "epoch": 13.663591199432222, + "grad_norm": 1.8035515546798706, + "learning_rate": 8.634237047551456e-05, + "loss": 0.013083310425281524, + "step": 96260 + }, + { + "epoch": 13.665010645848119, + "grad_norm": 6.626659393310547, + "learning_rate": 8.634095102909866e-05, + "loss": 0.02948010265827179, + "step": 96270 + }, + { + "epoch": 13.666430092264017, + "grad_norm": 7.713639259338379, + "learning_rate": 8.633953158268275e-05, + "loss": 0.018895019590854645, + "step": 96280 + }, + { + "epoch": 13.667849538679915, + "grad_norm": 0.03560718894004822, + "learning_rate": 8.633811213626685e-05, + "loss": 0.02730792760848999, + "step": 96290 + }, + { + "epoch": 13.669268985095812, + "grad_norm": 0.49820488691329956, + "learning_rate": 8.633669268985096e-05, + "loss": 0.04874304831027985, + "step": 96300 + }, + { + "epoch": 13.67068843151171, + "grad_norm": 0.038449477404356, + "learning_rate": 8.633527324343507e-05, + "loss": 0.050292950868606565, + "step": 96310 + }, + { + "epoch": 13.672107877927608, + "grad_norm": 10.892190933227539, + "learning_rate": 8.633385379701917e-05, + "loss": 0.05344282388687134, + "step": 96320 + }, + { + "epoch": 13.673527324343507, + "grad_norm": 
0.24301016330718994, + "learning_rate": 8.633243435060327e-05, + "loss": 0.024064372479915618, + "step": 96330 + }, + { + "epoch": 13.674946770759403, + "grad_norm": 0.02121180109679699, + "learning_rate": 8.633101490418737e-05, + "loss": 0.01656196266412735, + "step": 96340 + }, + { + "epoch": 13.676366217175302, + "grad_norm": 0.4311664402484894, + "learning_rate": 8.632959545777148e-05, + "loss": 0.01601491868495941, + "step": 96350 + }, + { + "epoch": 13.6777856635912, + "grad_norm": 0.7602726221084595, + "learning_rate": 8.632817601135557e-05, + "loss": 0.01864083707332611, + "step": 96360 + }, + { + "epoch": 13.679205110007096, + "grad_norm": 11.445473670959473, + "learning_rate": 8.632675656493968e-05, + "loss": 0.043167969584465025, + "step": 96370 + }, + { + "epoch": 13.680624556422995, + "grad_norm": 0.05061354488134384, + "learning_rate": 8.632533711852377e-05, + "loss": 0.012322446703910828, + "step": 96380 + }, + { + "epoch": 13.682044002838893, + "grad_norm": 0.009072613902390003, + "learning_rate": 8.632391767210788e-05, + "loss": 0.02431444972753525, + "step": 96390 + }, + { + "epoch": 13.683463449254791, + "grad_norm": 0.07909449934959412, + "learning_rate": 8.632249822569199e-05, + "loss": 0.01688341349363327, + "step": 96400 + }, + { + "epoch": 13.684882895670688, + "grad_norm": 9.46364974975586, + "learning_rate": 8.632107877927609e-05, + "loss": 0.028126460313796998, + "step": 96410 + }, + { + "epoch": 13.686302342086586, + "grad_norm": 7.244299411773682, + "learning_rate": 8.63196593328602e-05, + "loss": 0.03135330080986023, + "step": 96420 + }, + { + "epoch": 13.687721788502484, + "grad_norm": 7.555941104888916, + "learning_rate": 8.631823988644428e-05, + "loss": 0.040532243251800534, + "step": 96430 + }, + { + "epoch": 13.689141234918381, + "grad_norm": 1.6134663820266724, + "learning_rate": 8.631682044002839e-05, + "loss": 0.06105220913887024, + "step": 96440 + }, + { + "epoch": 13.69056068133428, + "grad_norm": 0.026344293728470802, + 
"learning_rate": 8.631540099361249e-05, + "loss": 0.03273046612739563, + "step": 96450 + }, + { + "epoch": 13.691980127750178, + "grad_norm": 3.042829990386963, + "learning_rate": 8.63139815471966e-05, + "loss": 0.012820084393024445, + "step": 96460 + }, + { + "epoch": 13.693399574166076, + "grad_norm": 0.07684643566608429, + "learning_rate": 8.63125621007807e-05, + "loss": 0.012618523836135865, + "step": 96470 + }, + { + "epoch": 13.694819020581972, + "grad_norm": 0.09263870865106583, + "learning_rate": 8.631114265436481e-05, + "loss": 0.01064542829990387, + "step": 96480 + }, + { + "epoch": 13.69623846699787, + "grad_norm": 5.022707939147949, + "learning_rate": 8.630972320794891e-05, + "loss": 0.01942266374826431, + "step": 96490 + }, + { + "epoch": 13.697657913413769, + "grad_norm": 1.1834547519683838, + "learning_rate": 8.6308303761533e-05, + "loss": 0.03992840051651001, + "step": 96500 + }, + { + "epoch": 13.697657913413769, + "eval_accuracy": 0.9791441470083296, + "eval_loss": 0.07206237316131592, + "eval_runtime": 32.6855, + "eval_samples_per_second": 481.161, + "eval_steps_per_second": 15.053, + "step": 96500 + }, + { + "epoch": 13.699077359829666, + "grad_norm": 0.13294564187526703, + "learning_rate": 8.630688431511712e-05, + "loss": 0.05398126244544983, + "step": 96510 + }, + { + "epoch": 13.700496806245564, + "grad_norm": 1.2916076183319092, + "learning_rate": 8.630546486870121e-05, + "loss": 0.03420543372631073, + "step": 96520 + }, + { + "epoch": 13.701916252661462, + "grad_norm": 0.049955952912569046, + "learning_rate": 8.630404542228532e-05, + "loss": 0.019472208619117738, + "step": 96530 + }, + { + "epoch": 13.70333569907736, + "grad_norm": 0.1284303069114685, + "learning_rate": 8.630262597586941e-05, + "loss": 0.02896394729614258, + "step": 96540 + }, + { + "epoch": 13.704755145493257, + "grad_norm": 0.5136931538581848, + "learning_rate": 8.630120652945352e-05, + "loss": 0.031944602727890015, + "step": 96550 + }, + { + "epoch": 13.706174591909155, 
+ "grad_norm": 1.1831598281860352, + "learning_rate": 8.629978708303762e-05, + "loss": 0.010644739121198654, + "step": 96560 + }, + { + "epoch": 13.707594038325054, + "grad_norm": 0.017192769795656204, + "learning_rate": 8.629836763662173e-05, + "loss": 0.013148699700832368, + "step": 96570 + }, + { + "epoch": 13.70901348474095, + "grad_norm": 0.10975632816553116, + "learning_rate": 8.629694819020582e-05, + "loss": 0.01649189442396164, + "step": 96580 + }, + { + "epoch": 13.710432931156848, + "grad_norm": 1.0271081924438477, + "learning_rate": 8.629552874378992e-05, + "loss": 0.0365587055683136, + "step": 96590 + }, + { + "epoch": 13.711852377572747, + "grad_norm": 6.755058765411377, + "learning_rate": 8.629410929737403e-05, + "loss": 0.02149779498577118, + "step": 96600 + }, + { + "epoch": 13.713271823988645, + "grad_norm": 0.44150310754776, + "learning_rate": 8.629268985095813e-05, + "loss": 0.016236330568790435, + "step": 96610 + }, + { + "epoch": 13.714691270404542, + "grad_norm": 0.09499189257621765, + "learning_rate": 8.629127040454224e-05, + "loss": 0.023830153048038483, + "step": 96620 + }, + { + "epoch": 13.71611071682044, + "grad_norm": 0.6429286003112793, + "learning_rate": 8.628985095812634e-05, + "loss": 0.024831396341323853, + "step": 96630 + }, + { + "epoch": 13.717530163236338, + "grad_norm": 0.37857896089553833, + "learning_rate": 8.628843151171044e-05, + "loss": 0.04653356075286865, + "step": 96640 + }, + { + "epoch": 13.718949609652235, + "grad_norm": 0.8553446531295776, + "learning_rate": 8.628701206529453e-05, + "loss": 0.016309410333633423, + "step": 96650 + }, + { + "epoch": 13.720369056068133, + "grad_norm": 12.546866416931152, + "learning_rate": 8.628559261887864e-05, + "loss": 0.04090455174446106, + "step": 96660 + }, + { + "epoch": 13.721788502484031, + "grad_norm": 0.5049958229064941, + "learning_rate": 8.628417317246274e-05, + "loss": 0.053654146194458005, + "step": 96670 + }, + { + "epoch": 13.72320794889993, + "grad_norm": 
0.32195183634757996, + "learning_rate": 8.628275372604685e-05, + "loss": 0.0933415949344635, + "step": 96680 + }, + { + "epoch": 13.724627395315826, + "grad_norm": 2.5842678546905518, + "learning_rate": 8.628133427963095e-05, + "loss": 0.028504377603530882, + "step": 96690 + }, + { + "epoch": 13.726046841731725, + "grad_norm": 0.4025324583053589, + "learning_rate": 8.627991483321505e-05, + "loss": 0.04696339964866638, + "step": 96700 + }, + { + "epoch": 13.727466288147623, + "grad_norm": 0.9358816742897034, + "learning_rate": 8.627849538679916e-05, + "loss": 0.02000337541103363, + "step": 96710 + }, + { + "epoch": 13.72888573456352, + "grad_norm": 0.13171012699604034, + "learning_rate": 8.627707594038326e-05, + "loss": 0.012757310271263122, + "step": 96720 + }, + { + "epoch": 13.730305180979418, + "grad_norm": 4.228630065917969, + "learning_rate": 8.627565649396737e-05, + "loss": 0.016465026140213012, + "step": 96730 + }, + { + "epoch": 13.731724627395316, + "grad_norm": 1.9195640087127686, + "learning_rate": 8.627423704755145e-05, + "loss": 0.06258389949798585, + "step": 96740 + }, + { + "epoch": 13.733144073811214, + "grad_norm": 0.1192382425069809, + "learning_rate": 8.627281760113556e-05, + "loss": 0.02845693230628967, + "step": 96750 + }, + { + "epoch": 13.73456352022711, + "grad_norm": 0.35611721873283386, + "learning_rate": 8.627139815471966e-05, + "loss": 0.024792948365211488, + "step": 96760 + }, + { + "epoch": 13.735982966643009, + "grad_norm": 0.27649229764938354, + "learning_rate": 8.626997870830377e-05, + "loss": 0.027475398778915406, + "step": 96770 + }, + { + "epoch": 13.737402413058907, + "grad_norm": 0.01736115850508213, + "learning_rate": 8.626855926188787e-05, + "loss": 0.0017727486789226531, + "step": 96780 + }, + { + "epoch": 13.738821859474804, + "grad_norm": 0.0943591445684433, + "learning_rate": 8.626713981547196e-05, + "loss": 0.053753846883773805, + "step": 96790 + }, + { + "epoch": 13.740241305890702, + "grad_norm": 5.441102981567383, + 
"learning_rate": 8.626572036905607e-05, + "loss": 0.018227586150169374, + "step": 96800 + }, + { + "epoch": 13.7416607523066, + "grad_norm": 0.41876062750816345, + "learning_rate": 8.626430092264017e-05, + "loss": 0.048694318532943724, + "step": 96810 + }, + { + "epoch": 13.743080198722499, + "grad_norm": 2.1366820335388184, + "learning_rate": 8.626288147622428e-05, + "loss": 0.020703762769699097, + "step": 96820 + }, + { + "epoch": 13.744499645138395, + "grad_norm": 0.06722243875265121, + "learning_rate": 8.626146202980838e-05, + "loss": 0.04667206406593323, + "step": 96830 + }, + { + "epoch": 13.745919091554294, + "grad_norm": 5.052031517028809, + "learning_rate": 8.626004258339249e-05, + "loss": 0.04078722596168518, + "step": 96840 + }, + { + "epoch": 13.747338537970192, + "grad_norm": 4.313608646392822, + "learning_rate": 8.625862313697658e-05, + "loss": 0.02702971696853638, + "step": 96850 + }, + { + "epoch": 13.748757984386089, + "grad_norm": 11.140143394470215, + "learning_rate": 8.625720369056069e-05, + "loss": 0.05737009048461914, + "step": 96860 + }, + { + "epoch": 13.750177430801987, + "grad_norm": 1.1045628786087036, + "learning_rate": 8.625578424414478e-05, + "loss": 0.017936806380748748, + "step": 96870 + }, + { + "epoch": 13.751596877217885, + "grad_norm": 0.9916929006576538, + "learning_rate": 8.62543647977289e-05, + "loss": 0.036751154065132144, + "step": 96880 + }, + { + "epoch": 13.753016323633783, + "grad_norm": 0.5851067900657654, + "learning_rate": 8.625294535131299e-05, + "loss": 0.06546493172645569, + "step": 96890 + }, + { + "epoch": 13.75443577004968, + "grad_norm": 2.6111655235290527, + "learning_rate": 8.625152590489709e-05, + "loss": 0.010462664812803269, + "step": 96900 + }, + { + "epoch": 13.755855216465578, + "grad_norm": 0.018237149342894554, + "learning_rate": 8.62501064584812e-05, + "loss": 0.01750764548778534, + "step": 96910 + }, + { + "epoch": 13.757274662881477, + "grad_norm": 0.44217807054519653, + "learning_rate": 
8.62486870120653e-05, + "loss": 0.02455083727836609, + "step": 96920 + }, + { + "epoch": 13.758694109297373, + "grad_norm": 0.14735212922096252, + "learning_rate": 8.624726756564941e-05, + "loss": 0.05003917217254639, + "step": 96930 + }, + { + "epoch": 13.760113555713271, + "grad_norm": 8.448341369628906, + "learning_rate": 8.62458481192335e-05, + "loss": 0.011260174214839935, + "step": 96940 + }, + { + "epoch": 13.76153300212917, + "grad_norm": 0.5719733834266663, + "learning_rate": 8.62444286728176e-05, + "loss": 0.05195282101631164, + "step": 96950 + }, + { + "epoch": 13.762952448545068, + "grad_norm": 0.014794730581343174, + "learning_rate": 8.62430092264017e-05, + "loss": 0.02123808115720749, + "step": 96960 + }, + { + "epoch": 13.764371894960965, + "grad_norm": 0.8494911789894104, + "learning_rate": 8.624158977998581e-05, + "loss": 0.00920439586043358, + "step": 96970 + }, + { + "epoch": 13.765791341376863, + "grad_norm": 0.963769793510437, + "learning_rate": 8.624017033356991e-05, + "loss": 0.03247123658657074, + "step": 96980 + }, + { + "epoch": 13.767210787792761, + "grad_norm": 0.07355058938264847, + "learning_rate": 8.623875088715402e-05, + "loss": 0.015551319718360901, + "step": 96990 + }, + { + "epoch": 13.768630234208658, + "grad_norm": 0.03591252863407135, + "learning_rate": 8.623733144073812e-05, + "loss": 0.014192771911621094, + "step": 97000 + }, + { + "epoch": 13.768630234208658, + "eval_accuracy": 0.9810516945380555, + "eval_loss": 0.06501225382089615, + "eval_runtime": 32.7725, + "eval_samples_per_second": 479.884, + "eval_steps_per_second": 15.013, + "step": 97000 + }, + { + "epoch": 13.770049680624556, + "grad_norm": 0.28048276901245117, + "learning_rate": 8.623591199432221e-05, + "loss": 0.04204888939857483, + "step": 97010 + }, + { + "epoch": 13.771469127040454, + "grad_norm": 0.03193013370037079, + "learning_rate": 8.623449254790633e-05, + "loss": 0.03201920092105866, + "step": 97020 + }, + { + "epoch": 13.772888573456353, + "grad_norm": 
0.08718361705541611, + "learning_rate": 8.623307310149042e-05, + "loss": 0.018675795197486876, + "step": 97030 + }, + { + "epoch": 13.77430801987225, + "grad_norm": 1.2164140939712524, + "learning_rate": 8.623165365507453e-05, + "loss": 0.028575024008750914, + "step": 97040 + }, + { + "epoch": 13.775727466288147, + "grad_norm": 12.365880966186523, + "learning_rate": 8.623023420865862e-05, + "loss": 0.0183505117893219, + "step": 97050 + }, + { + "epoch": 13.777146912704046, + "grad_norm": 4.400852680206299, + "learning_rate": 8.622881476224273e-05, + "loss": 0.00443883091211319, + "step": 97060 + }, + { + "epoch": 13.778566359119942, + "grad_norm": 0.044489286839962006, + "learning_rate": 8.622739531582683e-05, + "loss": 0.006819433718919754, + "step": 97070 + }, + { + "epoch": 13.77998580553584, + "grad_norm": 0.26168087124824524, + "learning_rate": 8.622597586941094e-05, + "loss": 0.031000277400016783, + "step": 97080 + }, + { + "epoch": 13.781405251951739, + "grad_norm": 0.05729484185576439, + "learning_rate": 8.622455642299503e-05, + "loss": 0.016029319167137145, + "step": 97090 + }, + { + "epoch": 13.782824698367637, + "grad_norm": 0.1679743528366089, + "learning_rate": 8.622313697657913e-05, + "loss": 0.007387077063322067, + "step": 97100 + }, + { + "epoch": 13.784244144783534, + "grad_norm": 1.0637929439544678, + "learning_rate": 8.622171753016324e-05, + "loss": 0.034793981909751893, + "step": 97110 + }, + { + "epoch": 13.785663591199432, + "grad_norm": 0.06055055558681488, + "learning_rate": 8.622029808374734e-05, + "loss": 0.0071873687207698825, + "step": 97120 + }, + { + "epoch": 13.78708303761533, + "grad_norm": 6.253574371337891, + "learning_rate": 8.621887863733145e-05, + "loss": 0.02324788421392441, + "step": 97130 + }, + { + "epoch": 13.788502484031227, + "grad_norm": 8.996084213256836, + "learning_rate": 8.621745919091555e-05, + "loss": 0.02676660418510437, + "step": 97140 + }, + { + "epoch": 13.789921930447125, + "grad_norm": 0.2644156217575073, + 
"learning_rate": 8.621603974449966e-05, + "loss": 0.00823134183883667, + "step": 97150 + }, + { + "epoch": 13.791341376863024, + "grad_norm": 7.25373649597168, + "learning_rate": 8.621462029808374e-05, + "loss": 0.02458227276802063, + "step": 97160 + }, + { + "epoch": 13.792760823278922, + "grad_norm": 13.660566329956055, + "learning_rate": 8.621320085166785e-05, + "loss": 0.027688038349151612, + "step": 97170 + }, + { + "epoch": 13.794180269694818, + "grad_norm": 0.9016245007514954, + "learning_rate": 8.621178140525195e-05, + "loss": 0.02340591847896576, + "step": 97180 + }, + { + "epoch": 13.795599716110717, + "grad_norm": 0.5798392295837402, + "learning_rate": 8.621036195883606e-05, + "loss": 0.024847133457660674, + "step": 97190 + }, + { + "epoch": 13.797019162526615, + "grad_norm": 0.1002761498093605, + "learning_rate": 8.620894251242016e-05, + "loss": 0.05150417685508728, + "step": 97200 + }, + { + "epoch": 13.798438608942512, + "grad_norm": 0.3393736481666565, + "learning_rate": 8.620752306600426e-05, + "loss": 0.07004774808883667, + "step": 97210 + }, + { + "epoch": 13.79985805535841, + "grad_norm": 7.55497407913208, + "learning_rate": 8.620610361958837e-05, + "loss": 0.008413270115852356, + "step": 97220 + }, + { + "epoch": 13.801277501774308, + "grad_norm": 1.928196668624878, + "learning_rate": 8.620468417317247e-05, + "loss": 0.02218780517578125, + "step": 97230 + }, + { + "epoch": 13.802696948190206, + "grad_norm": 2.2993032932281494, + "learning_rate": 8.620326472675658e-05, + "loss": 0.022586297988891602, + "step": 97240 + }, + { + "epoch": 13.804116394606103, + "grad_norm": 0.03581158444285393, + "learning_rate": 8.620184528034067e-05, + "loss": 0.00915946438908577, + "step": 97250 + }, + { + "epoch": 13.805535841022001, + "grad_norm": 0.1405903398990631, + "learning_rate": 8.620042583392477e-05, + "loss": 0.016153512895107268, + "step": 97260 + }, + { + "epoch": 13.8069552874379, + "grad_norm": 2.959599018096924, + "learning_rate": 
8.619900638750887e-05, + "loss": 0.02759789824485779, + "step": 97270 + }, + { + "epoch": 13.808374733853796, + "grad_norm": 0.08808876574039459, + "learning_rate": 8.619758694109298e-05, + "loss": 0.05625124573707581, + "step": 97280 + }, + { + "epoch": 13.809794180269694, + "grad_norm": 3.4986207485198975, + "learning_rate": 8.619616749467708e-05, + "loss": 0.009423717856407166, + "step": 97290 + }, + { + "epoch": 13.811213626685593, + "grad_norm": 0.2367706447839737, + "learning_rate": 8.619474804826119e-05, + "loss": 0.03007735013961792, + "step": 97300 + }, + { + "epoch": 13.812633073101491, + "grad_norm": 1.6623224020004272, + "learning_rate": 8.619332860184528e-05, + "loss": 0.0030352432280778886, + "step": 97310 + }, + { + "epoch": 13.814052519517388, + "grad_norm": 0.6825453639030457, + "learning_rate": 8.619190915542938e-05, + "loss": 0.015154258906841278, + "step": 97320 + }, + { + "epoch": 13.815471965933286, + "grad_norm": 0.1828623265028, + "learning_rate": 8.619048970901349e-05, + "loss": 0.0347985178232193, + "step": 97330 + }, + { + "epoch": 13.816891412349184, + "grad_norm": 0.72845458984375, + "learning_rate": 8.618907026259759e-05, + "loss": 0.016802479326725007, + "step": 97340 + }, + { + "epoch": 13.81831085876508, + "grad_norm": 0.04905217885971069, + "learning_rate": 8.61876508161817e-05, + "loss": 0.028523996472358704, + "step": 97350 + }, + { + "epoch": 13.819730305180979, + "grad_norm": 0.05003027990460396, + "learning_rate": 8.618623136976579e-05, + "loss": 0.03651518523693085, + "step": 97360 + }, + { + "epoch": 13.821149751596877, + "grad_norm": 0.8443085551261902, + "learning_rate": 8.61848119233499e-05, + "loss": 0.06068788170814514, + "step": 97370 + }, + { + "epoch": 13.822569198012776, + "grad_norm": 3.9706737995147705, + "learning_rate": 8.6183392476934e-05, + "loss": 0.009664274752140045, + "step": 97380 + }, + { + "epoch": 13.823988644428672, + "grad_norm": 7.973998546600342, + "learning_rate": 8.61819730305181e-05, + "loss": 
0.019925667345523833, + "step": 97390 + }, + { + "epoch": 13.82540809084457, + "grad_norm": 0.016353856772184372, + "learning_rate": 8.61805535841022e-05, + "loss": 0.012330979853868485, + "step": 97400 + }, + { + "epoch": 13.826827537260469, + "grad_norm": 0.08897315710783005, + "learning_rate": 8.61791341376863e-05, + "loss": 0.011504063010215759, + "step": 97410 + }, + { + "epoch": 13.828246983676365, + "grad_norm": 0.01945200376212597, + "learning_rate": 8.617771469127041e-05, + "loss": 0.04555101692676544, + "step": 97420 + }, + { + "epoch": 13.829666430092264, + "grad_norm": 10.132003784179688, + "learning_rate": 8.617629524485451e-05, + "loss": 0.02212253361940384, + "step": 97430 + }, + { + "epoch": 13.831085876508162, + "grad_norm": 0.14945967495441437, + "learning_rate": 8.617487579843862e-05, + "loss": 0.01767318695783615, + "step": 97440 + }, + { + "epoch": 13.83250532292406, + "grad_norm": 3.7053380012512207, + "learning_rate": 8.617345635202272e-05, + "loss": 0.020854970812797545, + "step": 97450 + }, + { + "epoch": 13.833924769339957, + "grad_norm": 5.3743109703063965, + "learning_rate": 8.617203690560681e-05, + "loss": 0.041898411512374875, + "step": 97460 + }, + { + "epoch": 13.835344215755855, + "grad_norm": 0.31361621618270874, + "learning_rate": 8.617061745919091e-05, + "loss": 0.010599453002214432, + "step": 97470 + }, + { + "epoch": 13.836763662171753, + "grad_norm": 0.018387148156762123, + "learning_rate": 8.616919801277502e-05, + "loss": 0.014518235623836518, + "step": 97480 + }, + { + "epoch": 13.83818310858765, + "grad_norm": 0.4590878188610077, + "learning_rate": 8.616777856635912e-05, + "loss": 0.032176035642623904, + "step": 97490 + }, + { + "epoch": 13.839602555003548, + "grad_norm": 0.02756396494805813, + "learning_rate": 8.616635911994323e-05, + "loss": 0.02949948012828827, + "step": 97500 + }, + { + "epoch": 13.839602555003548, + "eval_accuracy": 0.9776181089845488, + "eval_loss": 0.0747826024889946, + "eval_runtime": 30.7964, + 
"eval_samples_per_second": 510.676, + "eval_steps_per_second": 15.976, + "step": 97500 + }, + { + "epoch": 13.841022001419446, + "grad_norm": 0.6445552706718445, + "learning_rate": 8.616493967352734e-05, + "loss": 0.05971266627311707, + "step": 97510 + }, + { + "epoch": 13.842441447835345, + "grad_norm": 0.09805190563201904, + "learning_rate": 8.616352022711142e-05, + "loss": 0.024461531639099122, + "step": 97520 + }, + { + "epoch": 13.843860894251241, + "grad_norm": 6.539288520812988, + "learning_rate": 8.616210078069554e-05, + "loss": 0.019382362067699433, + "step": 97530 + }, + { + "epoch": 13.84528034066714, + "grad_norm": 1.607917070388794, + "learning_rate": 8.616068133427963e-05, + "loss": 0.011100460588932038, + "step": 97540 + }, + { + "epoch": 13.846699787083038, + "grad_norm": 0.9553514719009399, + "learning_rate": 8.615926188786374e-05, + "loss": 0.041931447386741635, + "step": 97550 + }, + { + "epoch": 13.848119233498934, + "grad_norm": 0.7937570214271545, + "learning_rate": 8.615784244144784e-05, + "loss": 0.026648005843162535, + "step": 97560 + }, + { + "epoch": 13.849538679914833, + "grad_norm": 0.1172422394156456, + "learning_rate": 8.615642299503194e-05, + "loss": 0.019187642633914946, + "step": 97570 + }, + { + "epoch": 13.850958126330731, + "grad_norm": 0.2581722140312195, + "learning_rate": 8.615500354861604e-05, + "loss": 0.021943846344947816, + "step": 97580 + }, + { + "epoch": 13.85237757274663, + "grad_norm": 1.9560277462005615, + "learning_rate": 8.615358410220015e-05, + "loss": 0.025108674168586732, + "step": 97590 + }, + { + "epoch": 13.853797019162526, + "grad_norm": 0.5822705626487732, + "learning_rate": 8.615216465578424e-05, + "loss": 0.03498307466506958, + "step": 97600 + }, + { + "epoch": 13.855216465578424, + "grad_norm": 3.110405921936035, + "learning_rate": 8.615074520936836e-05, + "loss": 0.030993205308914185, + "step": 97610 + }, + { + "epoch": 13.856635911994323, + "grad_norm": 5.9054741859436035, + "learning_rate": 
8.614932576295245e-05, + "loss": 0.028361022472381592, + "step": 97620 + }, + { + "epoch": 13.858055358410219, + "grad_norm": 0.049585871398448944, + "learning_rate": 8.614790631653655e-05, + "loss": 0.016482987999916078, + "step": 97630 + }, + { + "epoch": 13.859474804826117, + "grad_norm": 2.339905261993408, + "learning_rate": 8.614648687012066e-05, + "loss": 0.03827682137489319, + "step": 97640 + }, + { + "epoch": 13.860894251242016, + "grad_norm": 0.04161804914474487, + "learning_rate": 8.614506742370476e-05, + "loss": 0.02619813084602356, + "step": 97650 + }, + { + "epoch": 13.862313697657914, + "grad_norm": 0.275062620639801, + "learning_rate": 8.614364797728887e-05, + "loss": 0.03152703642845154, + "step": 97660 + }, + { + "epoch": 13.86373314407381, + "grad_norm": 2.660701036453247, + "learning_rate": 8.614222853087295e-05, + "loss": 0.024106189608573914, + "step": 97670 + }, + { + "epoch": 13.865152590489709, + "grad_norm": 5.393669605255127, + "learning_rate": 8.614080908445706e-05, + "loss": 0.029433247447013856, + "step": 97680 + }, + { + "epoch": 13.866572036905607, + "grad_norm": 2.5080268383026123, + "learning_rate": 8.613938963804116e-05, + "loss": 0.048150715231895444, + "step": 97690 + }, + { + "epoch": 13.867991483321505, + "grad_norm": 1.032168984413147, + "learning_rate": 8.613797019162527e-05, + "loss": 0.00849594920873642, + "step": 97700 + }, + { + "epoch": 13.869410929737402, + "grad_norm": 0.4703840911388397, + "learning_rate": 8.613655074520938e-05, + "loss": 0.022085925936698912, + "step": 97710 + }, + { + "epoch": 13.8708303761533, + "grad_norm": 0.8159363865852356, + "learning_rate": 8.613513129879347e-05, + "loss": 0.023894134163856506, + "step": 97720 + }, + { + "epoch": 13.872249822569199, + "grad_norm": 1.7675553560256958, + "learning_rate": 8.613371185237758e-05, + "loss": 0.03863615989685058, + "step": 97730 + }, + { + "epoch": 13.873669268985095, + "grad_norm": 0.1795162409543991, + "learning_rate": 8.613229240596168e-05, + 
"loss": 0.047685247659683225, + "step": 97740 + }, + { + "epoch": 13.875088715400993, + "grad_norm": 0.5392060875892639, + "learning_rate": 8.613087295954579e-05, + "loss": 0.031848755478858945, + "step": 97750 + }, + { + "epoch": 13.876508161816892, + "grad_norm": 0.1371711939573288, + "learning_rate": 8.612945351312988e-05, + "loss": 0.003922409191727638, + "step": 97760 + }, + { + "epoch": 13.87792760823279, + "grad_norm": 13.345478057861328, + "learning_rate": 8.612803406671398e-05, + "loss": 0.026858839392662048, + "step": 97770 + }, + { + "epoch": 13.879347054648687, + "grad_norm": 1.7686020135879517, + "learning_rate": 8.612661462029808e-05, + "loss": 0.028010132908821105, + "step": 97780 + }, + { + "epoch": 13.880766501064585, + "grad_norm": 11.41087532043457, + "learning_rate": 8.612519517388219e-05, + "loss": 0.039809000492095944, + "step": 97790 + }, + { + "epoch": 13.882185947480483, + "grad_norm": 1.6744792461395264, + "learning_rate": 8.61237757274663e-05, + "loss": 0.0045928433537483215, + "step": 97800 + }, + { + "epoch": 13.88360539389638, + "grad_norm": 0.4395209848880768, + "learning_rate": 8.61223562810504e-05, + "loss": 0.01665394753217697, + "step": 97810 + }, + { + "epoch": 13.885024840312278, + "grad_norm": 1.1791696548461914, + "learning_rate": 8.61209368346345e-05, + "loss": 0.016723376512527467, + "step": 97820 + }, + { + "epoch": 13.886444286728176, + "grad_norm": 0.04103924706578255, + "learning_rate": 8.611951738821859e-05, + "loss": 0.03735288977622986, + "step": 97830 + }, + { + "epoch": 13.887863733144075, + "grad_norm": 0.374519407749176, + "learning_rate": 8.61180979418027e-05, + "loss": 0.03749838471412659, + "step": 97840 + }, + { + "epoch": 13.889283179559971, + "grad_norm": 3.321499824523926, + "learning_rate": 8.61166784953868e-05, + "loss": 0.014933030307292938, + "step": 97850 + }, + { + "epoch": 13.89070262597587, + "grad_norm": 1.2417290210723877, + "learning_rate": 8.611525904897091e-05, + "loss": 0.02480054199695587, + 
"step": 97860 + }, + { + "epoch": 13.892122072391768, + "grad_norm": 8.301517486572266, + "learning_rate": 8.611383960255501e-05, + "loss": 0.03353300094604492, + "step": 97870 + }, + { + "epoch": 13.893541518807664, + "grad_norm": 1.6440887451171875, + "learning_rate": 8.61124201561391e-05, + "loss": 0.029333028197288512, + "step": 97880 + }, + { + "epoch": 13.894960965223563, + "grad_norm": 0.038577500730752945, + "learning_rate": 8.611100070972322e-05, + "loss": 0.008193275332450867, + "step": 97890 + }, + { + "epoch": 13.896380411639461, + "grad_norm": 11.59971809387207, + "learning_rate": 8.610958126330731e-05, + "loss": 0.01720527410507202, + "step": 97900 + }, + { + "epoch": 13.89779985805536, + "grad_norm": 0.11012128740549088, + "learning_rate": 8.610816181689143e-05, + "loss": 0.028680214285850526, + "step": 97910 + }, + { + "epoch": 13.899219304471256, + "grad_norm": 0.0679960697889328, + "learning_rate": 8.610674237047552e-05, + "loss": 0.015310463309288026, + "step": 97920 + }, + { + "epoch": 13.900638750887154, + "grad_norm": 1.3655829429626465, + "learning_rate": 8.610532292405962e-05, + "loss": 0.03698239922523498, + "step": 97930 + }, + { + "epoch": 13.902058197303052, + "grad_norm": 0.7326348423957825, + "learning_rate": 8.610390347764372e-05, + "loss": 0.011951953172683716, + "step": 97940 + }, + { + "epoch": 13.903477643718949, + "grad_norm": 0.3693626821041107, + "learning_rate": 8.610248403122783e-05, + "loss": 0.02224213778972626, + "step": 97950 + }, + { + "epoch": 13.904897090134847, + "grad_norm": 0.18457281589508057, + "learning_rate": 8.610106458481193e-05, + "loss": 0.04400915801525116, + "step": 97960 + }, + { + "epoch": 13.906316536550746, + "grad_norm": 0.31119829416275024, + "learning_rate": 8.609964513839604e-05, + "loss": 0.025654134154319764, + "step": 97970 + }, + { + "epoch": 13.907735982966644, + "grad_norm": 0.9079119563102722, + "learning_rate": 8.609822569198013e-05, + "loss": 0.020143988728523254, + "step": 97980 + }, + { 
+ "epoch": 13.90915542938254, + "grad_norm": 14.945606231689453, + "learning_rate": 8.609680624556423e-05, + "loss": 0.028208765387535095, + "step": 97990 + }, + { + "epoch": 13.910574875798439, + "grad_norm": 0.9307637214660645, + "learning_rate": 8.609538679914834e-05, + "loss": 0.0067070737481117245, + "step": 98000 + }, + { + "epoch": 13.910574875798439, + "eval_accuracy": 0.9830864119030965, + "eval_loss": 0.06170034408569336, + "eval_runtime": 32.3419, + "eval_samples_per_second": 486.273, + "eval_steps_per_second": 15.212, + "step": 98000 + }, + { + "epoch": 13.911994322214337, + "grad_norm": 0.025303415954113007, + "learning_rate": 8.609396735273244e-05, + "loss": 0.027243292331695555, + "step": 98010 + }, + { + "epoch": 13.913413768630233, + "grad_norm": 2.292128086090088, + "learning_rate": 8.609254790631655e-05, + "loss": 0.025096246600151063, + "step": 98020 + }, + { + "epoch": 13.914833215046132, + "grad_norm": 1.6361472606658936, + "learning_rate": 8.609112845990063e-05, + "loss": 0.014097224175930022, + "step": 98030 + }, + { + "epoch": 13.91625266146203, + "grad_norm": 0.0516996905207634, + "learning_rate": 8.608970901348475e-05, + "loss": 0.036923548579216, + "step": 98040 + }, + { + "epoch": 13.917672107877928, + "grad_norm": 2.1145389080047607, + "learning_rate": 8.608828956706884e-05, + "loss": 0.019678601622581483, + "step": 98050 + }, + { + "epoch": 13.919091554293825, + "grad_norm": 6.481072425842285, + "learning_rate": 8.608687012065295e-05, + "loss": 0.01554270088672638, + "step": 98060 + }, + { + "epoch": 13.920511000709723, + "grad_norm": 0.5275385975837708, + "learning_rate": 8.608545067423705e-05, + "loss": 0.05277801156044006, + "step": 98070 + }, + { + "epoch": 13.921930447125622, + "grad_norm": 0.02168644405901432, + "learning_rate": 8.608403122782115e-05, + "loss": 0.019489049911499023, + "step": 98080 + }, + { + "epoch": 13.923349893541518, + "grad_norm": 0.02501189522445202, + "learning_rate": 8.608261178140526e-05, + "loss": 
0.08635119199752808, + "step": 98090 + }, + { + "epoch": 13.924769339957416, + "grad_norm": 0.07429813593626022, + "learning_rate": 8.608119233498936e-05, + "loss": 0.025414294004440306, + "step": 98100 + }, + { + "epoch": 13.926188786373315, + "grad_norm": 13.504375457763672, + "learning_rate": 8.607977288857347e-05, + "loss": 0.04383018016815186, + "step": 98110 + }, + { + "epoch": 13.927608232789213, + "grad_norm": 0.5328422784805298, + "learning_rate": 8.607835344215757e-05, + "loss": 0.015777355432510375, + "step": 98120 + }, + { + "epoch": 13.92902767920511, + "grad_norm": 0.07831300050020218, + "learning_rate": 8.607693399574166e-05, + "loss": 0.027874544262886047, + "step": 98130 + }, + { + "epoch": 13.930447125621008, + "grad_norm": 1.3268852233886719, + "learning_rate": 8.607551454932576e-05, + "loss": 0.04068158566951752, + "step": 98140 + }, + { + "epoch": 13.931866572036906, + "grad_norm": 3.3505053520202637, + "learning_rate": 8.607409510290987e-05, + "loss": 0.013634760677814484, + "step": 98150 + }, + { + "epoch": 13.933286018452803, + "grad_norm": 0.1177496388554573, + "learning_rate": 8.607267565649397e-05, + "loss": 0.012508463859558106, + "step": 98160 + }, + { + "epoch": 13.934705464868701, + "grad_norm": 0.4416254162788391, + "learning_rate": 8.607125621007808e-05, + "loss": 0.025961068272590638, + "step": 98170 + }, + { + "epoch": 13.9361249112846, + "grad_norm": 7.925416946411133, + "learning_rate": 8.606983676366218e-05, + "loss": 0.03427475094795227, + "step": 98180 + }, + { + "epoch": 13.937544357700498, + "grad_norm": 0.23578715324401855, + "learning_rate": 8.606841731724627e-05, + "loss": 0.02209036499261856, + "step": 98190 + }, + { + "epoch": 13.938963804116394, + "grad_norm": 0.14721371233463287, + "learning_rate": 8.606699787083038e-05, + "loss": 0.10272301435470581, + "step": 98200 + }, + { + "epoch": 13.940383250532292, + "grad_norm": 3.374873161315918, + "learning_rate": 8.606557842441448e-05, + "loss": 0.020183584094047545, + 
"step": 98210 + }, + { + "epoch": 13.94180269694819, + "grad_norm": 8.487064361572266, + "learning_rate": 8.606415897799859e-05, + "loss": 0.032809144258499144, + "step": 98220 + }, + { + "epoch": 13.943222143364087, + "grad_norm": 1.1708842515945435, + "learning_rate": 8.606273953158269e-05, + "loss": 0.053074246644973753, + "step": 98230 + }, + { + "epoch": 13.944641589779986, + "grad_norm": 21.59556007385254, + "learning_rate": 8.606132008516679e-05, + "loss": 0.10838912725448609, + "step": 98240 + }, + { + "epoch": 13.946061036195884, + "grad_norm": 0.26949456334114075, + "learning_rate": 8.605990063875089e-05, + "loss": 0.020542636513710022, + "step": 98250 + }, + { + "epoch": 13.947480482611782, + "grad_norm": 1.2219308614730835, + "learning_rate": 8.6058481192335e-05, + "loss": 0.022973744571208952, + "step": 98260 + }, + { + "epoch": 13.948899929027679, + "grad_norm": 2.3675146102905273, + "learning_rate": 8.60570617459191e-05, + "loss": 0.03723432123661041, + "step": 98270 + }, + { + "epoch": 13.950319375443577, + "grad_norm": 0.1154785230755806, + "learning_rate": 8.60556422995032e-05, + "loss": 0.03642845153808594, + "step": 98280 + }, + { + "epoch": 13.951738821859475, + "grad_norm": 1.0712108612060547, + "learning_rate": 8.60542228530873e-05, + "loss": 0.03615443110466003, + "step": 98290 + }, + { + "epoch": 13.953158268275372, + "grad_norm": 0.4232095777988434, + "learning_rate": 8.60528034066714e-05, + "loss": 0.02514898478984833, + "step": 98300 + }, + { + "epoch": 13.95457771469127, + "grad_norm": 0.08924500644207001, + "learning_rate": 8.605138396025551e-05, + "loss": 0.02497561275959015, + "step": 98310 + }, + { + "epoch": 13.955997161107168, + "grad_norm": 0.053555138409137726, + "learning_rate": 8.604996451383961e-05, + "loss": 0.010297687351703643, + "step": 98320 + }, + { + "epoch": 13.957416607523067, + "grad_norm": 0.3748902380466461, + "learning_rate": 8.604854506742372e-05, + "loss": 0.007654424756765366, + "step": 98330 + }, + { + 
"epoch": 13.958836053938963, + "grad_norm": 8.75135326385498, + "learning_rate": 8.60471256210078e-05, + "loss": 0.03046306371688843, + "step": 98340 + }, + { + "epoch": 13.960255500354862, + "grad_norm": 0.07814659923315048, + "learning_rate": 8.604570617459191e-05, + "loss": 0.006948675215244293, + "step": 98350 + }, + { + "epoch": 13.96167494677076, + "grad_norm": 0.058386798948049545, + "learning_rate": 8.604428672817601e-05, + "loss": 0.04257642328739166, + "step": 98360 + }, + { + "epoch": 13.963094393186656, + "grad_norm": 1.3675897121429443, + "learning_rate": 8.604286728176012e-05, + "loss": 0.03919885754585266, + "step": 98370 + }, + { + "epoch": 13.964513839602555, + "grad_norm": 10.515870094299316, + "learning_rate": 8.604144783534422e-05, + "loss": 0.03455580770969391, + "step": 98380 + }, + { + "epoch": 13.965933286018453, + "grad_norm": 6.172956466674805, + "learning_rate": 8.604002838892832e-05, + "loss": 0.03574661910533905, + "step": 98390 + }, + { + "epoch": 13.967352732434351, + "grad_norm": 0.2484154850244522, + "learning_rate": 8.603860894251243e-05, + "loss": 0.01784258782863617, + "step": 98400 + }, + { + "epoch": 13.968772178850248, + "grad_norm": 0.031984057277441025, + "learning_rate": 8.603718949609652e-05, + "loss": 0.010730892419815063, + "step": 98410 + }, + { + "epoch": 13.970191625266146, + "grad_norm": 0.11616003513336182, + "learning_rate": 8.603577004968064e-05, + "loss": 0.014920552074909211, + "step": 98420 + }, + { + "epoch": 13.971611071682045, + "grad_norm": 0.31973591446876526, + "learning_rate": 8.603435060326473e-05, + "loss": 0.022470778226852416, + "step": 98430 + }, + { + "epoch": 13.973030518097941, + "grad_norm": 0.07785550504922867, + "learning_rate": 8.603293115684883e-05, + "loss": 0.022737446427345275, + "step": 98440 + }, + { + "epoch": 13.97444996451384, + "grad_norm": 9.522161483764648, + "learning_rate": 8.603151171043293e-05, + "loss": 0.05277225971221924, + "step": 98450 + }, + { + "epoch": 
13.975869410929738, + "grad_norm": 0.9376153349876404, + "learning_rate": 8.603009226401704e-05, + "loss": 0.03830648064613342, + "step": 98460 + }, + { + "epoch": 13.977288857345636, + "grad_norm": 0.26860079169273376, + "learning_rate": 8.602867281760114e-05, + "loss": 0.004900941625237465, + "step": 98470 + }, + { + "epoch": 13.978708303761533, + "grad_norm": 0.4131862223148346, + "learning_rate": 8.602725337118525e-05, + "loss": 0.022539208829402923, + "step": 98480 + }, + { + "epoch": 13.98012775017743, + "grad_norm": 0.473401814699173, + "learning_rate": 8.602583392476934e-05, + "loss": 0.02035791575908661, + "step": 98490 + }, + { + "epoch": 13.98154719659333, + "grad_norm": 0.4627954661846161, + "learning_rate": 8.602441447835344e-05, + "loss": 0.019416412711143492, + "step": 98500 + }, + { + "epoch": 13.98154719659333, + "eval_accuracy": 0.9834679214090418, + "eval_loss": 0.05638109892606735, + "eval_runtime": 31.4517, + "eval_samples_per_second": 500.036, + "eval_steps_per_second": 15.643, + "step": 98500 + }, + { + "epoch": 13.982966643009226, + "grad_norm": 0.14699269831180573, + "learning_rate": 8.602299503193755e-05, + "loss": 0.013596628606319428, + "step": 98510 + }, + { + "epoch": 13.984386089425124, + "grad_norm": 0.6649786233901978, + "learning_rate": 8.602157558552165e-05, + "loss": 0.02714385688304901, + "step": 98520 + }, + { + "epoch": 13.985805535841022, + "grad_norm": 4.457527160644531, + "learning_rate": 8.602015613910576e-05, + "loss": 0.03538078665733337, + "step": 98530 + }, + { + "epoch": 13.98722498225692, + "grad_norm": 12.919422149658203, + "learning_rate": 8.601873669268984e-05, + "loss": 0.04029761552810669, + "step": 98540 + }, + { + "epoch": 13.988644428672817, + "grad_norm": 0.2152722179889679, + "learning_rate": 8.601731724627396e-05, + "loss": 0.0575924813747406, + "step": 98550 + }, + { + "epoch": 13.990063875088715, + "grad_norm": 4.616553783416748, + "learning_rate": 8.601589779985805e-05, + "loss": 0.018635259568691255, + 
"step": 98560 + }, + { + "epoch": 13.991483321504614, + "grad_norm": 6.969879150390625, + "learning_rate": 8.601447835344216e-05, + "loss": 0.018483972549438475, + "step": 98570 + }, + { + "epoch": 13.99290276792051, + "grad_norm": 0.6723334789276123, + "learning_rate": 8.601305890702626e-05, + "loss": 0.00979367196559906, + "step": 98580 + }, + { + "epoch": 13.994322214336409, + "grad_norm": 1.566240906715393, + "learning_rate": 8.601163946061037e-05, + "loss": 0.007228370010852814, + "step": 98590 + }, + { + "epoch": 13.995741660752307, + "grad_norm": 1.199708104133606, + "learning_rate": 8.601022001419447e-05, + "loss": 0.016369998455047607, + "step": 98600 + }, + { + "epoch": 13.997161107168205, + "grad_norm": 0.20838220417499542, + "learning_rate": 8.600880056777857e-05, + "loss": 0.023891144990921022, + "step": 98610 + }, + { + "epoch": 13.998580553584102, + "grad_norm": 0.1336381584405899, + "learning_rate": 8.600738112136268e-05, + "loss": 0.028340649604797364, + "step": 98620 + }, + { + "epoch": 14.0, + "grad_norm": 0.5767505168914795, + "learning_rate": 8.600596167494678e-05, + "loss": 0.00616534873843193, + "step": 98630 + }, + { + "epoch": 14.001419446415898, + "grad_norm": 7.575362205505371, + "learning_rate": 8.600454222853089e-05, + "loss": 0.044139009714126584, + "step": 98640 + }, + { + "epoch": 14.002838892831795, + "grad_norm": 0.02623675763607025, + "learning_rate": 8.600312278211497e-05, + "loss": 0.017933820188045502, + "step": 98650 + }, + { + "epoch": 14.004258339247693, + "grad_norm": 0.4991348087787628, + "learning_rate": 8.600170333569908e-05, + "loss": 0.02468074709177017, + "step": 98660 + }, + { + "epoch": 14.005677785663591, + "grad_norm": 0.03932322561740875, + "learning_rate": 8.600028388928318e-05, + "loss": 0.04342496395111084, + "step": 98670 + }, + { + "epoch": 14.00709723207949, + "grad_norm": 7.684366226196289, + "learning_rate": 8.599886444286729e-05, + "loss": 0.05863423347473144, + "step": 98680 + }, + { + "epoch": 
14.008516678495386, + "grad_norm": 0.6191020607948303, + "learning_rate": 8.599744499645139e-05, + "loss": 0.03286792933940887, + "step": 98690 + }, + { + "epoch": 14.009936124911285, + "grad_norm": 12.611638069152832, + "learning_rate": 8.599602555003548e-05, + "loss": 0.046319156885147095, + "step": 98700 + }, + { + "epoch": 14.011355571327183, + "grad_norm": 0.5372052192687988, + "learning_rate": 8.59946061036196e-05, + "loss": 0.01128637120127678, + "step": 98710 + }, + { + "epoch": 14.01277501774308, + "grad_norm": 1.9387632608413696, + "learning_rate": 8.599318665720369e-05, + "loss": 0.025460246205329894, + "step": 98720 + }, + { + "epoch": 14.014194464158978, + "grad_norm": 6.6452555656433105, + "learning_rate": 8.59917672107878e-05, + "loss": 0.015543276071548462, + "step": 98730 + }, + { + "epoch": 14.015613910574876, + "grad_norm": 0.17176109552383423, + "learning_rate": 8.59903477643719e-05, + "loss": 0.013405351340770722, + "step": 98740 + }, + { + "epoch": 14.017033356990774, + "grad_norm": 0.30507367849349976, + "learning_rate": 8.5988928317956e-05, + "loss": 0.02677640914916992, + "step": 98750 + }, + { + "epoch": 14.01845280340667, + "grad_norm": 0.07649233937263489, + "learning_rate": 8.59875088715401e-05, + "loss": 0.004966056346893311, + "step": 98760 + }, + { + "epoch": 14.01987224982257, + "grad_norm": 0.1671622395515442, + "learning_rate": 8.59860894251242e-05, + "loss": 0.06767297387123108, + "step": 98770 + }, + { + "epoch": 14.021291696238467, + "grad_norm": 2.377171516418457, + "learning_rate": 8.59846699787083e-05, + "loss": 0.027815097570419313, + "step": 98780 + }, + { + "epoch": 14.022711142654364, + "grad_norm": 0.3002987802028656, + "learning_rate": 8.598325053229241e-05, + "loss": 0.040025681257247925, + "step": 98790 + }, + { + "epoch": 14.024130589070262, + "grad_norm": 0.2978041470050812, + "learning_rate": 8.598183108587651e-05, + "loss": 0.005738424882292747, + "step": 98800 + }, + { + "epoch": 14.02555003548616, + 
"grad_norm": 0.4054115116596222, + "learning_rate": 8.598041163946061e-05, + "loss": 0.057332354784011844, + "step": 98810 + }, + { + "epoch": 14.026969481902059, + "grad_norm": 6.931030750274658, + "learning_rate": 8.597899219304472e-05, + "loss": 0.004104747623205185, + "step": 98820 + }, + { + "epoch": 14.028388928317955, + "grad_norm": 0.2317405343055725, + "learning_rate": 8.597757274662882e-05, + "loss": 0.00666595995426178, + "step": 98830 + }, + { + "epoch": 14.029808374733854, + "grad_norm": 0.5171235799789429, + "learning_rate": 8.597615330021293e-05, + "loss": 0.04026064872741699, + "step": 98840 + }, + { + "epoch": 14.031227821149752, + "grad_norm": 0.126663938164711, + "learning_rate": 8.597473385379701e-05, + "loss": 0.012223343551158904, + "step": 98850 + }, + { + "epoch": 14.032647267565649, + "grad_norm": 11.934538841247559, + "learning_rate": 8.597331440738112e-05, + "loss": 0.02199336588382721, + "step": 98860 + }, + { + "epoch": 14.034066713981547, + "grad_norm": 3.310514450073242, + "learning_rate": 8.597189496096522e-05, + "loss": 0.01725810319185257, + "step": 98870 + }, + { + "epoch": 14.035486160397445, + "grad_norm": 6.979939937591553, + "learning_rate": 8.597047551454933e-05, + "loss": 0.04737822115421295, + "step": 98880 + }, + { + "epoch": 14.036905606813344, + "grad_norm": 0.2535474896430969, + "learning_rate": 8.596905606813343e-05, + "loss": 0.026235219836235047, + "step": 98890 + }, + { + "epoch": 14.03832505322924, + "grad_norm": 0.12438727170228958, + "learning_rate": 8.596763662171753e-05, + "loss": 0.054426532983779904, + "step": 98900 + }, + { + "epoch": 14.039744499645138, + "grad_norm": 13.687928199768066, + "learning_rate": 8.596621717530164e-05, + "loss": 0.05139861702919006, + "step": 98910 + }, + { + "epoch": 14.041163946061037, + "grad_norm": 0.013612110167741776, + "learning_rate": 8.596479772888573e-05, + "loss": 0.047477427124977115, + "step": 98920 + }, + { + "epoch": 14.042583392476933, + "grad_norm": 
3.528958559036255, + "learning_rate": 8.596337828246985e-05, + "loss": 0.0062147751450538635, + "step": 98930 + }, + { + "epoch": 14.044002838892832, + "grad_norm": 8.854484558105469, + "learning_rate": 8.596195883605394e-05, + "loss": 0.021570898592472076, + "step": 98940 + }, + { + "epoch": 14.04542228530873, + "grad_norm": 3.4759981632232666, + "learning_rate": 8.596053938963805e-05, + "loss": 0.07649534344673156, + "step": 98950 + }, + { + "epoch": 14.046841731724628, + "grad_norm": 1.6767079830169678, + "learning_rate": 8.595911994322214e-05, + "loss": 0.03111596405506134, + "step": 98960 + }, + { + "epoch": 14.048261178140525, + "grad_norm": 0.6731406450271606, + "learning_rate": 8.595770049680625e-05, + "loss": 0.006647625565528869, + "step": 98970 + }, + { + "epoch": 14.049680624556423, + "grad_norm": 1.234702229499817, + "learning_rate": 8.595628105039035e-05, + "loss": 0.09047362208366394, + "step": 98980 + }, + { + "epoch": 14.051100070972321, + "grad_norm": 2.347928524017334, + "learning_rate": 8.595486160397446e-05, + "loss": 0.02055363208055496, + "step": 98990 + }, + { + "epoch": 14.052519517388218, + "grad_norm": 0.2850717604160309, + "learning_rate": 8.595344215755857e-05, + "loss": 0.008768963813781738, + "step": 99000 + }, + { + "epoch": 14.052519517388218, + "eval_accuracy": 0.9847396197621924, + "eval_loss": 0.054401837289333344, + "eval_runtime": 30.7732, + "eval_samples_per_second": 511.061, + "eval_steps_per_second": 15.988, + "step": 99000 + }, + { + "epoch": 14.053938963804116, + "grad_norm": 8.854166984558105, + "learning_rate": 8.595202271114265e-05, + "loss": 0.01207282543182373, + "step": 99010 + }, + { + "epoch": 14.055358410220014, + "grad_norm": 0.08972510695457458, + "learning_rate": 8.595060326472676e-05, + "loss": 0.026438263058662415, + "step": 99020 + }, + { + "epoch": 14.056777856635913, + "grad_norm": 0.09185374528169632, + "learning_rate": 8.594918381831086e-05, + "loss": 0.006578221917152405, + "step": 99030 + }, + { + 
"epoch": 14.05819730305181, + "grad_norm": 0.0555814653635025, + "learning_rate": 8.594776437189497e-05, + "loss": 0.021403425931930543, + "step": 99040 + }, + { + "epoch": 14.059616749467708, + "grad_norm": 1.687256932258606, + "learning_rate": 8.594634492547907e-05, + "loss": 0.008669185638427734, + "step": 99050 + }, + { + "epoch": 14.061036195883606, + "grad_norm": 0.04107533022761345, + "learning_rate": 8.594492547906317e-05, + "loss": 0.06898521780967712, + "step": 99060 + }, + { + "epoch": 14.062455642299502, + "grad_norm": 2.3766849040985107, + "learning_rate": 8.594350603264726e-05, + "loss": 0.03251543939113617, + "step": 99070 + }, + { + "epoch": 14.0638750887154, + "grad_norm": 0.07620040327310562, + "learning_rate": 8.594208658623137e-05, + "loss": 0.027515420317649843, + "step": 99080 + }, + { + "epoch": 14.065294535131299, + "grad_norm": 0.17723844945430756, + "learning_rate": 8.594066713981549e-05, + "loss": 0.0391448974609375, + "step": 99090 + }, + { + "epoch": 14.066713981547197, + "grad_norm": 0.40072786808013916, + "learning_rate": 8.593924769339958e-05, + "loss": 0.016369281709194182, + "step": 99100 + }, + { + "epoch": 14.068133427963094, + "grad_norm": 20.97338104248047, + "learning_rate": 8.593782824698368e-05, + "loss": 0.03046499490737915, + "step": 99110 + }, + { + "epoch": 14.069552874378992, + "grad_norm": 0.13278134167194366, + "learning_rate": 8.593640880056778e-05, + "loss": 0.0066719576716423035, + "step": 99120 + }, + { + "epoch": 14.07097232079489, + "grad_norm": 1.1554104089736938, + "learning_rate": 8.593498935415189e-05, + "loss": 0.028399959206581116, + "step": 99130 + }, + { + "epoch": 14.072391767210787, + "grad_norm": 10.434669494628906, + "learning_rate": 8.593356990773599e-05, + "loss": 0.01027916967868805, + "step": 99140 + }, + { + "epoch": 14.073811213626685, + "grad_norm": 0.4504542946815491, + "learning_rate": 8.59321504613201e-05, + "loss": 0.00998721569776535, + "step": 99150 + }, + { + "epoch": 
14.075230660042584, + "grad_norm": 2.134850263595581, + "learning_rate": 8.593073101490418e-05, + "loss": 0.044223248958587646, + "step": 99160 + }, + { + "epoch": 14.076650106458482, + "grad_norm": 0.6842259168624878, + "learning_rate": 8.592931156848829e-05, + "loss": 0.022297856211662293, + "step": 99170 + }, + { + "epoch": 14.078069552874378, + "grad_norm": 0.15943333506584167, + "learning_rate": 8.59278921220724e-05, + "loss": 0.0330827534198761, + "step": 99180 + }, + { + "epoch": 14.079488999290277, + "grad_norm": 0.26099467277526855, + "learning_rate": 8.59264726756565e-05, + "loss": 0.015528751909732819, + "step": 99190 + }, + { + "epoch": 14.080908445706175, + "grad_norm": 7.665553092956543, + "learning_rate": 8.592505322924061e-05, + "loss": 0.01843828409910202, + "step": 99200 + }, + { + "epoch": 14.082327892122072, + "grad_norm": 0.1282263994216919, + "learning_rate": 8.59236337828247e-05, + "loss": 0.005658290535211563, + "step": 99210 + }, + { + "epoch": 14.08374733853797, + "grad_norm": 3.389772653579712, + "learning_rate": 8.59222143364088e-05, + "loss": 0.009044589102268219, + "step": 99220 + }, + { + "epoch": 14.085166784953868, + "grad_norm": 1.2471179962158203, + "learning_rate": 8.59207948899929e-05, + "loss": 0.03725117146968841, + "step": 99230 + }, + { + "epoch": 14.086586231369767, + "grad_norm": 0.07664031535387039, + "learning_rate": 8.591937544357701e-05, + "loss": 0.014812260866165161, + "step": 99240 + }, + { + "epoch": 14.088005677785663, + "grad_norm": 0.05161131173372269, + "learning_rate": 8.591795599716111e-05, + "loss": 0.022272004187107085, + "step": 99250 + }, + { + "epoch": 14.089425124201561, + "grad_norm": 0.9902065992355347, + "learning_rate": 8.591653655074521e-05, + "loss": 0.03187122941017151, + "step": 99260 + }, + { + "epoch": 14.09084457061746, + "grad_norm": 6.050073623657227, + "learning_rate": 8.591511710432932e-05, + "loss": 0.011229197680950164, + "step": 99270 + }, + { + "epoch": 14.092264017033356, + 
"grad_norm": 2.0064609050750732, + "learning_rate": 8.591369765791342e-05, + "loss": 0.007860420644283295, + "step": 99280 + }, + { + "epoch": 14.093683463449254, + "grad_norm": 0.3100337088108063, + "learning_rate": 8.591227821149753e-05, + "loss": 0.008608365058898925, + "step": 99290 + }, + { + "epoch": 14.095102909865153, + "grad_norm": 0.5801976919174194, + "learning_rate": 8.591085876508162e-05, + "loss": 0.044814455509185794, + "step": 99300 + }, + { + "epoch": 14.096522356281051, + "grad_norm": 0.1426689773797989, + "learning_rate": 8.590943931866574e-05, + "loss": 0.013057531416416168, + "step": 99310 + }, + { + "epoch": 14.097941802696948, + "grad_norm": 0.8306183218955994, + "learning_rate": 8.590801987224982e-05, + "loss": 0.02446906715631485, + "step": 99320 + }, + { + "epoch": 14.099361249112846, + "grad_norm": 0.06384455412626266, + "learning_rate": 8.590660042583393e-05, + "loss": 0.012030948698520661, + "step": 99330 + }, + { + "epoch": 14.100780695528744, + "grad_norm": 1.4948430061340332, + "learning_rate": 8.590518097941803e-05, + "loss": 0.03669027090072632, + "step": 99340 + }, + { + "epoch": 14.10220014194464, + "grad_norm": 2.81807279586792, + "learning_rate": 8.590376153300214e-05, + "loss": 0.0058967161923646925, + "step": 99350 + }, + { + "epoch": 14.103619588360539, + "grad_norm": 0.1990390419960022, + "learning_rate": 8.590234208658624e-05, + "loss": 0.01164540946483612, + "step": 99360 + }, + { + "epoch": 14.105039034776437, + "grad_norm": 0.062245313078165054, + "learning_rate": 8.590092264017033e-05, + "loss": 0.006242537871003151, + "step": 99370 + }, + { + "epoch": 14.106458481192336, + "grad_norm": 6.959029197692871, + "learning_rate": 8.589950319375444e-05, + "loss": 0.03449139297008515, + "step": 99380 + }, + { + "epoch": 14.107877927608232, + "grad_norm": 0.16301681101322174, + "learning_rate": 8.589808374733854e-05, + "loss": 0.022822481393814088, + "step": 99390 + }, + { + "epoch": 14.10929737402413, + "grad_norm": 
0.8691009283065796, + "learning_rate": 8.589666430092265e-05, + "loss": 0.01241927444934845, + "step": 99400 + }, + { + "epoch": 14.110716820440029, + "grad_norm": 2.5208559036254883, + "learning_rate": 8.589524485450675e-05, + "loss": 0.005818301066756249, + "step": 99410 + }, + { + "epoch": 14.112136266855925, + "grad_norm": 0.32502809166908264, + "learning_rate": 8.589382540809085e-05, + "loss": 0.006208596378564834, + "step": 99420 + }, + { + "epoch": 14.113555713271824, + "grad_norm": 0.0438520573079586, + "learning_rate": 8.589254790631655e-05, + "loss": 0.034111449122428895, + "step": 99430 + }, + { + "epoch": 14.114975159687722, + "grad_norm": 6.138270854949951, + "learning_rate": 8.589112845990064e-05, + "loss": 0.0335618793964386, + "step": 99440 + }, + { + "epoch": 14.11639460610362, + "grad_norm": 0.47917288541793823, + "learning_rate": 8.588970901348474e-05, + "loss": 0.006820973753929138, + "step": 99450 + }, + { + "epoch": 14.117814052519517, + "grad_norm": 0.24871699512004852, + "learning_rate": 8.588828956706885e-05, + "loss": 0.010762445628643036, + "step": 99460 + }, + { + "epoch": 14.119233498935415, + "grad_norm": 0.26667341589927673, + "learning_rate": 8.588687012065295e-05, + "loss": 0.031206348538398744, + "step": 99470 + }, + { + "epoch": 14.120652945351313, + "grad_norm": 4.056185245513916, + "learning_rate": 8.588545067423706e-05, + "loss": 0.018537884950637816, + "step": 99480 + }, + { + "epoch": 14.12207239176721, + "grad_norm": 0.356323778629303, + "learning_rate": 8.588403122782114e-05, + "loss": 0.025170964002609254, + "step": 99490 + }, + { + "epoch": 14.123491838183108, + "grad_norm": 0.5766083002090454, + "learning_rate": 8.588261178140525e-05, + "loss": 0.07454286813735962, + "step": 99500 + }, + { + "epoch": 14.123491838183108, + "eval_accuracy": 0.9858205633623705, + "eval_loss": 0.05155513063073158, + "eval_runtime": 31.5063, + "eval_samples_per_second": 499.169, + "eval_steps_per_second": 15.616, + "step": 99500 + }, + { + 
"epoch": 14.124911284599007, + "grad_norm": 0.03237629681825638, + "learning_rate": 8.588119233498935e-05, + "loss": 0.009366624057292938, + "step": 99510 + }, + { + "epoch": 14.126330731014905, + "grad_norm": 0.24052318930625916, + "learning_rate": 8.587977288857346e-05, + "loss": 0.00688902735710144, + "step": 99520 + }, + { + "epoch": 14.127750177430801, + "grad_norm": 3.427706480026245, + "learning_rate": 8.587835344215756e-05, + "loss": 0.010054501891136169, + "step": 99530 + }, + { + "epoch": 14.1291696238467, + "grad_norm": 0.6839547157287598, + "learning_rate": 8.587693399574166e-05, + "loss": 0.005801101773977279, + "step": 99540 + }, + { + "epoch": 14.130589070262598, + "grad_norm": 3.000164270401001, + "learning_rate": 8.587551454932577e-05, + "loss": 0.0038481026887893675, + "step": 99550 + }, + { + "epoch": 14.132008516678495, + "grad_norm": 1.2292991876602173, + "learning_rate": 8.587409510290987e-05, + "loss": 0.052852863073349, + "step": 99560 + }, + { + "epoch": 14.133427963094393, + "grad_norm": 0.055449653416872025, + "learning_rate": 8.587267565649398e-05, + "loss": 0.003483012318611145, + "step": 99570 + }, + { + "epoch": 14.134847409510291, + "grad_norm": 0.10466715693473816, + "learning_rate": 8.587125621007807e-05, + "loss": 0.03381909728050232, + "step": 99580 + }, + { + "epoch": 14.13626685592619, + "grad_norm": 0.04983215406537056, + "learning_rate": 8.586983676366217e-05, + "loss": 0.013138096034526824, + "step": 99590 + }, + { + "epoch": 14.137686302342086, + "grad_norm": 0.3967151939868927, + "learning_rate": 8.586841731724627e-05, + "loss": 0.01961733549833298, + "step": 99600 + }, + { + "epoch": 14.139105748757984, + "grad_norm": 0.10708726942539215, + "learning_rate": 8.586699787083038e-05, + "loss": 0.03929960429668426, + "step": 99610 + }, + { + "epoch": 14.140525195173883, + "grad_norm": 0.630052387714386, + "learning_rate": 8.586557842441448e-05, + "loss": 0.029604125022888183, + "step": 99620 + }, + { + "epoch": 
14.14194464158978, + "grad_norm": 0.006481459829956293, + "learning_rate": 8.586415897799859e-05, + "loss": 0.044744950532913205, + "step": 99630 + }, + { + "epoch": 14.143364088005677, + "grad_norm": 0.10336804389953613, + "learning_rate": 8.586273953158269e-05, + "loss": 0.022768531739711762, + "step": 99640 + }, + { + "epoch": 14.144783534421576, + "grad_norm": 4.770381450653076, + "learning_rate": 8.586132008516678e-05, + "loss": 0.015540549159049987, + "step": 99650 + }, + { + "epoch": 14.146202980837474, + "grad_norm": 0.225839301943779, + "learning_rate": 8.58599006387509e-05, + "loss": 0.007052184641361236, + "step": 99660 + }, + { + "epoch": 14.14762242725337, + "grad_norm": 13.46019172668457, + "learning_rate": 8.585848119233499e-05, + "loss": 0.031240320205688475, + "step": 99670 + }, + { + "epoch": 14.149041873669269, + "grad_norm": 0.5248197317123413, + "learning_rate": 8.58570617459191e-05, + "loss": 0.06163159608840942, + "step": 99680 + }, + { + "epoch": 14.150461320085167, + "grad_norm": 0.07162128388881683, + "learning_rate": 8.58556422995032e-05, + "loss": 0.027028301358222963, + "step": 99690 + }, + { + "epoch": 14.151880766501064, + "grad_norm": 0.01644066721200943, + "learning_rate": 8.58542228530873e-05, + "loss": 0.01212691217660904, + "step": 99700 + }, + { + "epoch": 14.153300212916962, + "grad_norm": 0.15020127594470978, + "learning_rate": 8.58528034066714e-05, + "loss": 0.009765591472387314, + "step": 99710 + }, + { + "epoch": 14.15471965933286, + "grad_norm": 2.6016457080841064, + "learning_rate": 8.58513839602555e-05, + "loss": 0.010732583701610565, + "step": 99720 + }, + { + "epoch": 14.156139105748759, + "grad_norm": 0.014076477847993374, + "learning_rate": 8.58499645138396e-05, + "loss": 0.013157817721366882, + "step": 99730 + }, + { + "epoch": 14.157558552164655, + "grad_norm": 0.020325670018792152, + "learning_rate": 8.584854506742371e-05, + "loss": 0.007947267591953277, + "step": 99740 + }, + { + "epoch": 14.158977998580554, + 
"grad_norm": 4.29239559173584, + "learning_rate": 8.584712562100781e-05, + "loss": 0.04670752882957459, + "step": 99750 + }, + { + "epoch": 14.160397444996452, + "grad_norm": 0.648195743560791, + "learning_rate": 8.584570617459191e-05, + "loss": 0.03870112299919128, + "step": 99760 + }, + { + "epoch": 14.161816891412348, + "grad_norm": 6.478837966918945, + "learning_rate": 8.584428672817602e-05, + "loss": 0.029291576147079466, + "step": 99770 + }, + { + "epoch": 14.163236337828247, + "grad_norm": 0.534260630607605, + "learning_rate": 8.584286728176012e-05, + "loss": 0.010391275584697723, + "step": 99780 + }, + { + "epoch": 14.164655784244145, + "grad_norm": 0.26018503308296204, + "learning_rate": 8.584144783534423e-05, + "loss": 0.00734531432390213, + "step": 99790 + }, + { + "epoch": 14.166075230660043, + "grad_norm": 0.3219590187072754, + "learning_rate": 8.584002838892831e-05, + "loss": 0.004718105867505073, + "step": 99800 + }, + { + "epoch": 14.16749467707594, + "grad_norm": 0.05225837603211403, + "learning_rate": 8.583860894251242e-05, + "loss": 0.005737151578068733, + "step": 99810 + }, + { + "epoch": 14.168914123491838, + "grad_norm": 2.183525562286377, + "learning_rate": 8.583718949609652e-05, + "loss": 0.01032693013548851, + "step": 99820 + }, + { + "epoch": 14.170333569907736, + "grad_norm": 0.5399150252342224, + "learning_rate": 8.583577004968063e-05, + "loss": 0.006520085036754608, + "step": 99830 + }, + { + "epoch": 14.171753016323633, + "grad_norm": 0.18672649562358856, + "learning_rate": 8.583435060326473e-05, + "loss": 0.005176125466823578, + "step": 99840 + }, + { + "epoch": 14.173172462739531, + "grad_norm": 0.05119100213050842, + "learning_rate": 8.583293115684883e-05, + "loss": 0.023728413879871367, + "step": 99850 + }, + { + "epoch": 14.17459190915543, + "grad_norm": 0.018034903332591057, + "learning_rate": 8.583151171043294e-05, + "loss": 0.016659101843833922, + "step": 99860 + }, + { + "epoch": 14.176011355571328, + "grad_norm": 
2.1384027004241943, + "learning_rate": 8.583009226401703e-05, + "loss": 0.037726446986198425, + "step": 99870 + }, + { + "epoch": 14.177430801987224, + "grad_norm": 2.5266449451446533, + "learning_rate": 8.582867281760114e-05, + "loss": 0.025884675979614257, + "step": 99880 + }, + { + "epoch": 14.178850248403123, + "grad_norm": 0.28664296865463257, + "learning_rate": 8.582725337118524e-05, + "loss": 0.026016849279403686, + "step": 99890 + }, + { + "epoch": 14.180269694819021, + "grad_norm": 0.27895665168762207, + "learning_rate": 8.582583392476934e-05, + "loss": 0.0053959134966135025, + "step": 99900 + }, + { + "epoch": 14.181689141234918, + "grad_norm": 0.005411416757851839, + "learning_rate": 8.582441447835344e-05, + "loss": 0.010221479833126068, + "step": 99910 + }, + { + "epoch": 14.183108587650816, + "grad_norm": 0.020072845742106438, + "learning_rate": 8.582299503193755e-05, + "loss": 0.020439541339874266, + "step": 99920 + }, + { + "epoch": 14.184528034066714, + "grad_norm": 0.9487782716751099, + "learning_rate": 8.582157558552165e-05, + "loss": 0.03342333137989044, + "step": 99930 + }, + { + "epoch": 14.185947480482612, + "grad_norm": 3.2585058212280273, + "learning_rate": 8.582015613910576e-05, + "loss": 0.009980223327875137, + "step": 99940 + }, + { + "epoch": 14.187366926898509, + "grad_norm": 2.015963315963745, + "learning_rate": 8.581873669268987e-05, + "loss": 0.039945772290229796, + "step": 99950 + }, + { + "epoch": 14.188786373314407, + "grad_norm": 4.911747932434082, + "learning_rate": 8.581731724627395e-05, + "loss": 0.04019148051738739, + "step": 99960 + }, + { + "epoch": 14.190205819730306, + "grad_norm": 1.3588132858276367, + "learning_rate": 8.581589779985806e-05, + "loss": 0.03294050395488739, + "step": 99970 + }, + { + "epoch": 14.191625266146202, + "grad_norm": 8.521455764770508, + "learning_rate": 8.581447835344216e-05, + "loss": 0.026378729939460756, + "step": 99980 + }, + { + "epoch": 14.1930447125621, + "grad_norm": 1.8677093982696533, 
+ "learning_rate": 8.581305890702627e-05, + "loss": 0.01372207999229431, + "step": 99990 + }, + { + "epoch": 14.194464158977999, + "grad_norm": 0.10951168835163116, + "learning_rate": 8.581163946061037e-05, + "loss": 0.012156614661216735, + "step": 100000 + }, + { + "epoch": 14.194464158977999, + "eval_accuracy": 0.9814967889616583, + "eval_loss": 0.07002508640289307, + "eval_runtime": 30.1341, + "eval_samples_per_second": 521.9, + "eval_steps_per_second": 16.327, + "step": 100000 + }, + { + "epoch": 14.195883605393897, + "grad_norm": 4.687601566314697, + "learning_rate": 8.581022001419446e-05, + "loss": 0.08939755558967591, + "step": 100010 + }, + { + "epoch": 14.197303051809794, + "grad_norm": 14.153443336486816, + "learning_rate": 8.580880056777856e-05, + "loss": 0.0787558138370514, + "step": 100020 + }, + { + "epoch": 14.198722498225692, + "grad_norm": 0.00876574032008648, + "learning_rate": 8.580738112136267e-05, + "loss": 0.020130300521850587, + "step": 100030 + }, + { + "epoch": 14.20014194464159, + "grad_norm": 13.658935546875, + "learning_rate": 8.580596167494678e-05, + "loss": 0.01373588889837265, + "step": 100040 + }, + { + "epoch": 14.201561391057487, + "grad_norm": 0.07329968363046646, + "learning_rate": 8.580454222853088e-05, + "loss": 0.0472787082195282, + "step": 100050 + }, + { + "epoch": 14.202980837473385, + "grad_norm": 4.477980613708496, + "learning_rate": 8.580312278211498e-05, + "loss": 0.05152733325958252, + "step": 100060 + }, + { + "epoch": 14.204400283889283, + "grad_norm": 0.26643145084381104, + "learning_rate": 8.580170333569908e-05, + "loss": 0.06452604532241821, + "step": 100070 + }, + { + "epoch": 14.205819730305182, + "grad_norm": 0.09822005778551102, + "learning_rate": 8.580028388928319e-05, + "loss": 0.017590297758579253, + "step": 100080 + }, + { + "epoch": 14.207239176721078, + "grad_norm": 0.04513048753142357, + "learning_rate": 8.579886444286728e-05, + "loss": 0.04057992696762085, + "step": 100090 + }, + { + "epoch": 
14.208658623136976, + "grad_norm": 1.4454002380371094, + "learning_rate": 8.57974449964514e-05, + "loss": 0.01286405324935913, + "step": 100100 + }, + { + "epoch": 14.210078069552875, + "grad_norm": 0.030584536492824554, + "learning_rate": 8.579602555003548e-05, + "loss": 0.02225019484758377, + "step": 100110 + }, + { + "epoch": 14.211497515968771, + "grad_norm": 4.225281715393066, + "learning_rate": 8.579460610361959e-05, + "loss": 0.008702501654624939, + "step": 100120 + }, + { + "epoch": 14.21291696238467, + "grad_norm": 0.37821707129478455, + "learning_rate": 8.57931866572037e-05, + "loss": 0.024403400719165802, + "step": 100130 + }, + { + "epoch": 14.214336408800568, + "grad_norm": 13.270866394042969, + "learning_rate": 8.57917672107878e-05, + "loss": 0.035308724641799925, + "step": 100140 + }, + { + "epoch": 14.215755855216466, + "grad_norm": 0.15924280881881714, + "learning_rate": 8.579034776437191e-05, + "loss": 0.00895916372537613, + "step": 100150 + }, + { + "epoch": 14.217175301632363, + "grad_norm": 0.23295794427394867, + "learning_rate": 8.578892831795599e-05, + "loss": 0.04624794721603394, + "step": 100160 + }, + { + "epoch": 14.218594748048261, + "grad_norm": 0.36088430881500244, + "learning_rate": 8.57875088715401e-05, + "loss": 0.011353875696659087, + "step": 100170 + }, + { + "epoch": 14.22001419446416, + "grad_norm": 0.03815864771604538, + "learning_rate": 8.57860894251242e-05, + "loss": 0.05593507289886475, + "step": 100180 + }, + { + "epoch": 14.221433640880056, + "grad_norm": 0.9573842287063599, + "learning_rate": 8.578466997870831e-05, + "loss": 0.041238024830818176, + "step": 100190 + }, + { + "epoch": 14.222853087295954, + "grad_norm": 2.3269762992858887, + "learning_rate": 8.578325053229241e-05, + "loss": 0.04610580801963806, + "step": 100200 + }, + { + "epoch": 14.224272533711853, + "grad_norm": 3.7672483921051025, + "learning_rate": 8.578183108587651e-05, + "loss": 0.01693085432052612, + "step": 100210 + }, + { + "epoch": 
14.22569198012775, + "grad_norm": 16.042234420776367, + "learning_rate": 8.578041163946062e-05, + "loss": 0.04853768944740296, + "step": 100220 + }, + { + "epoch": 14.227111426543647, + "grad_norm": 9.958683967590332, + "learning_rate": 8.577899219304472e-05, + "loss": 0.012343405932188033, + "step": 100230 + }, + { + "epoch": 14.228530872959546, + "grad_norm": 0.010709248483181, + "learning_rate": 8.577757274662883e-05, + "loss": 0.028152650594711302, + "step": 100240 + }, + { + "epoch": 14.229950319375444, + "grad_norm": 6.041823863983154, + "learning_rate": 8.577615330021292e-05, + "loss": 0.0360468327999115, + "step": 100250 + }, + { + "epoch": 14.231369765791342, + "grad_norm": 1.348669409751892, + "learning_rate": 8.577473385379702e-05, + "loss": 0.011957320570945739, + "step": 100260 + }, + { + "epoch": 14.232789212207239, + "grad_norm": 2.1172597408294678, + "learning_rate": 8.577331440738112e-05, + "loss": 0.010528762638568879, + "step": 100270 + }, + { + "epoch": 14.234208658623137, + "grad_norm": 3.0857114791870117, + "learning_rate": 8.577189496096523e-05, + "loss": 0.032012763619422915, + "step": 100280 + }, + { + "epoch": 14.235628105039035, + "grad_norm": 0.1845846027135849, + "learning_rate": 8.577047551454933e-05, + "loss": 0.04174632728099823, + "step": 100290 + }, + { + "epoch": 14.237047551454932, + "grad_norm": 1.4138368368148804, + "learning_rate": 8.576905606813344e-05, + "loss": 0.00865376740694046, + "step": 100300 + }, + { + "epoch": 14.23846699787083, + "grad_norm": 0.2461751401424408, + "learning_rate": 8.576763662171754e-05, + "loss": 0.005132092162966728, + "step": 100310 + }, + { + "epoch": 14.239886444286729, + "grad_norm": 1.0818413496017456, + "learning_rate": 8.576621717530163e-05, + "loss": 0.027364933490753175, + "step": 100320 + }, + { + "epoch": 14.241305890702627, + "grad_norm": 0.22002390027046204, + "learning_rate": 8.576479772888574e-05, + "loss": 0.026398837566375732, + "step": 100330 + }, + { + "epoch": 
14.242725337118523, + "grad_norm": 1.0248297452926636, + "learning_rate": 8.576337828246984e-05, + "loss": 0.005221531540155411, + "step": 100340 + }, + { + "epoch": 14.244144783534422, + "grad_norm": 2.140570878982544, + "learning_rate": 8.576195883605395e-05, + "loss": 0.041142240166664124, + "step": 100350 + }, + { + "epoch": 14.24556422995032, + "grad_norm": 0.22018738090991974, + "learning_rate": 8.576053938963805e-05, + "loss": 0.018037812411785127, + "step": 100360 + }, + { + "epoch": 14.246983676366217, + "grad_norm": 0.2746393084526062, + "learning_rate": 8.575911994322215e-05, + "loss": 0.016584034264087676, + "step": 100370 + }, + { + "epoch": 14.248403122782115, + "grad_norm": 1.3459296226501465, + "learning_rate": 8.575770049680624e-05, + "loss": 0.010618263483047485, + "step": 100380 + }, + { + "epoch": 14.249822569198013, + "grad_norm": 0.019148923456668854, + "learning_rate": 8.575628105039035e-05, + "loss": 0.06375048160552979, + "step": 100390 + }, + { + "epoch": 14.251242015613911, + "grad_norm": 0.5293266177177429, + "learning_rate": 8.575486160397445e-05, + "loss": 0.03687954843044281, + "step": 100400 + }, + { + "epoch": 14.252661462029808, + "grad_norm": 1.7792558670043945, + "learning_rate": 8.575344215755856e-05, + "loss": 0.011453892290592193, + "step": 100410 + }, + { + "epoch": 14.254080908445706, + "grad_norm": 0.006813056766986847, + "learning_rate": 8.575202271114266e-05, + "loss": 0.005444584414362907, + "step": 100420 + }, + { + "epoch": 14.255500354861605, + "grad_norm": 2.6219913959503174, + "learning_rate": 8.575060326472676e-05, + "loss": 0.021393463015556335, + "step": 100430 + }, + { + "epoch": 14.256919801277501, + "grad_norm": 6.097723007202148, + "learning_rate": 8.574918381831087e-05, + "loss": 0.027876609563827516, + "step": 100440 + }, + { + "epoch": 14.2583392476934, + "grad_norm": 0.08926290273666382, + "learning_rate": 8.574776437189497e-05, + "loss": 0.004753031581640243, + "step": 100450 + }, + { + "epoch": 
14.259758694109298, + "grad_norm": 0.09094270318746567, + "learning_rate": 8.574634492547908e-05, + "loss": 0.013828447461128235, + "step": 100460 + }, + { + "epoch": 14.261178140525196, + "grad_norm": 6.245047569274902, + "learning_rate": 8.574492547906316e-05, + "loss": 0.04343651533126831, + "step": 100470 + }, + { + "epoch": 14.262597586941093, + "grad_norm": 0.009788050316274166, + "learning_rate": 8.574350603264727e-05, + "loss": 0.0223904013633728, + "step": 100480 + }, + { + "epoch": 14.264017033356991, + "grad_norm": 0.4701679050922394, + "learning_rate": 8.574208658623137e-05, + "loss": 0.008665598928928375, + "step": 100490 + }, + { + "epoch": 14.26543647977289, + "grad_norm": 1.7598905563354492, + "learning_rate": 8.574066713981548e-05, + "loss": 0.018955098092556, + "step": 100500 + }, + { + "epoch": 14.26543647977289, + "eval_accuracy": 0.986837922044891, + "eval_loss": 0.04708363860845566, + "eval_runtime": 30.9519, + "eval_samples_per_second": 508.111, + "eval_steps_per_second": 15.896, + "step": 100500 + }, + { + "epoch": 14.266855926188786, + "grad_norm": 0.8167769908905029, + "learning_rate": 8.573924769339958e-05, + "loss": 0.006450720131397247, + "step": 100510 + }, + { + "epoch": 14.268275372604684, + "grad_norm": 6.334622859954834, + "learning_rate": 8.573782824698367e-05, + "loss": 0.01875331699848175, + "step": 100520 + }, + { + "epoch": 14.269694819020582, + "grad_norm": 0.37761762738227844, + "learning_rate": 8.573640880056779e-05, + "loss": 0.035563099384307864, + "step": 100530 + }, + { + "epoch": 14.27111426543648, + "grad_norm": 7.297554016113281, + "learning_rate": 8.573498935415188e-05, + "loss": 0.027794861793518068, + "step": 100540 + }, + { + "epoch": 14.272533711852377, + "grad_norm": 0.4307068884372711, + "learning_rate": 8.5733569907736e-05, + "loss": 0.012988004088401794, + "step": 100550 + }, + { + "epoch": 14.273953158268275, + "grad_norm": 1.8834869861602783, + "learning_rate": 8.573215046132009e-05, + "loss": 
0.01791456192731857, + "step": 100560 + }, + { + "epoch": 14.275372604684174, + "grad_norm": 0.041106026619672775, + "learning_rate": 8.573073101490419e-05, + "loss": 0.003982153534889221, + "step": 100570 + }, + { + "epoch": 14.27679205110007, + "grad_norm": 3.069298267364502, + "learning_rate": 8.572931156848829e-05, + "loss": 0.014299359917640687, + "step": 100580 + }, + { + "epoch": 14.278211497515969, + "grad_norm": 0.061462994664907455, + "learning_rate": 8.57278921220724e-05, + "loss": 0.056102311611175536, + "step": 100590 + }, + { + "epoch": 14.279630943931867, + "grad_norm": 9.913806915283203, + "learning_rate": 8.57264726756565e-05, + "loss": 0.035783016681671144, + "step": 100600 + }, + { + "epoch": 14.281050390347765, + "grad_norm": 0.4509928524494171, + "learning_rate": 8.57250532292406e-05, + "loss": 0.007971011102199554, + "step": 100610 + }, + { + "epoch": 14.282469836763662, + "grad_norm": 0.8236384987831116, + "learning_rate": 8.57236337828247e-05, + "loss": 0.013647985458374024, + "step": 100620 + }, + { + "epoch": 14.28388928317956, + "grad_norm": 1.0165517330169678, + "learning_rate": 8.57222143364088e-05, + "loss": 0.012246866524219514, + "step": 100630 + }, + { + "epoch": 14.285308729595458, + "grad_norm": 4.174709320068359, + "learning_rate": 8.572079488999291e-05, + "loss": 0.0925281286239624, + "step": 100640 + }, + { + "epoch": 14.286728176011355, + "grad_norm": 0.11360863596200943, + "learning_rate": 8.571937544357701e-05, + "loss": 0.004844916984438896, + "step": 100650 + }, + { + "epoch": 14.288147622427253, + "grad_norm": 16.6639347076416, + "learning_rate": 8.571795599716112e-05, + "loss": 0.037261354923248294, + "step": 100660 + }, + { + "epoch": 14.289567068843152, + "grad_norm": 0.04352438077330589, + "learning_rate": 8.571653655074522e-05, + "loss": 0.015262427926063537, + "step": 100670 + }, + { + "epoch": 14.29098651525905, + "grad_norm": 1.37468421459198, + "learning_rate": 8.571511710432931e-05, + "loss": 
0.017841906845569612, + "step": 100680 + }, + { + "epoch": 14.292405961674946, + "grad_norm": 0.07316924631595612, + "learning_rate": 8.571369765791341e-05, + "loss": 0.02864176332950592, + "step": 100690 + }, + { + "epoch": 14.293825408090845, + "grad_norm": 1.4973715543746948, + "learning_rate": 8.571227821149752e-05, + "loss": 0.029614627361297607, + "step": 100700 + }, + { + "epoch": 14.295244854506743, + "grad_norm": 0.6571174263954163, + "learning_rate": 8.571085876508162e-05, + "loss": 0.00811554342508316, + "step": 100710 + }, + { + "epoch": 14.29666430092264, + "grad_norm": 0.6326042413711548, + "learning_rate": 8.570943931866573e-05, + "loss": 0.012401780486106873, + "step": 100720 + }, + { + "epoch": 14.298083747338538, + "grad_norm": 10.682669639587402, + "learning_rate": 8.570801987224983e-05, + "loss": 0.04365946650505066, + "step": 100730 + }, + { + "epoch": 14.299503193754436, + "grad_norm": 0.2900744676589966, + "learning_rate": 8.570660042583393e-05, + "loss": 0.022507643699645995, + "step": 100740 + }, + { + "epoch": 14.300922640170334, + "grad_norm": 1.0781408548355103, + "learning_rate": 8.570518097941804e-05, + "loss": 0.036583822965621945, + "step": 100750 + }, + { + "epoch": 14.302342086586231, + "grad_norm": 3.8918097019195557, + "learning_rate": 8.570376153300213e-05, + "loss": 0.048116791248321536, + "step": 100760 + }, + { + "epoch": 14.30376153300213, + "grad_norm": 12.145566940307617, + "learning_rate": 8.570234208658624e-05, + "loss": 0.037739336490631104, + "step": 100770 + }, + { + "epoch": 14.305180979418028, + "grad_norm": 5.226789951324463, + "learning_rate": 8.570092264017033e-05, + "loss": 0.011161120235919952, + "step": 100780 + }, + { + "epoch": 14.306600425833924, + "grad_norm": 7.265553951263428, + "learning_rate": 8.569950319375444e-05, + "loss": 0.039224272966384886, + "step": 100790 + }, + { + "epoch": 14.308019872249822, + "grad_norm": 0.03872114047408104, + "learning_rate": 8.569808374733854e-05, + "loss": 
0.03458241820335388, + "step": 100800 + }, + { + "epoch": 14.30943931866572, + "grad_norm": 0.05727313831448555, + "learning_rate": 8.569666430092265e-05, + "loss": 0.060028254985809326, + "step": 100810 + }, + { + "epoch": 14.310858765081619, + "grad_norm": 5.03977632522583, + "learning_rate": 8.569524485450675e-05, + "loss": 0.018303254246711732, + "step": 100820 + }, + { + "epoch": 14.312278211497516, + "grad_norm": 0.09965986013412476, + "learning_rate": 8.569382540809084e-05, + "loss": 0.059241455793380735, + "step": 100830 + }, + { + "epoch": 14.313697657913414, + "grad_norm": 0.1746777594089508, + "learning_rate": 8.569240596167495e-05, + "loss": 0.025620871782302858, + "step": 100840 + }, + { + "epoch": 14.315117104329312, + "grad_norm": 0.008593921549618244, + "learning_rate": 8.569098651525905e-05, + "loss": 0.01247217133641243, + "step": 100850 + }, + { + "epoch": 14.316536550745209, + "grad_norm": 1.2999850511550903, + "learning_rate": 8.568956706884316e-05, + "loss": 0.02095559537410736, + "step": 100860 + }, + { + "epoch": 14.317955997161107, + "grad_norm": 0.6758670806884766, + "learning_rate": 8.568814762242726e-05, + "loss": 0.02123214602470398, + "step": 100870 + }, + { + "epoch": 14.319375443577005, + "grad_norm": 12.219331741333008, + "learning_rate": 8.568672817601136e-05, + "loss": 0.010732834041118623, + "step": 100880 + }, + { + "epoch": 14.320794889992904, + "grad_norm": 2.747171640396118, + "learning_rate": 8.568530872959545e-05, + "loss": 0.008057169616222382, + "step": 100890 + }, + { + "epoch": 14.3222143364088, + "grad_norm": 13.715052604675293, + "learning_rate": 8.568388928317956e-05, + "loss": 0.035181736946105956, + "step": 100900 + }, + { + "epoch": 14.323633782824698, + "grad_norm": 4.738697528839111, + "learning_rate": 8.568246983676366e-05, + "loss": 0.04868959188461304, + "step": 100910 + }, + { + "epoch": 14.325053229240597, + "grad_norm": 0.3947387635707855, + "learning_rate": 8.568105039034777e-05, + "loss": 
0.0034671925008296967, + "step": 100920 + }, + { + "epoch": 14.326472675656493, + "grad_norm": 1.7262924909591675, + "learning_rate": 8.567963094393187e-05, + "loss": 0.03679071366786957, + "step": 100930 + }, + { + "epoch": 14.327892122072392, + "grad_norm": 5.729528427124023, + "learning_rate": 8.567821149751597e-05, + "loss": 0.03186109662055969, + "step": 100940 + }, + { + "epoch": 14.32931156848829, + "grad_norm": 10.659774780273438, + "learning_rate": 8.567679205110008e-05, + "loss": 0.05918008089065552, + "step": 100950 + }, + { + "epoch": 14.330731014904188, + "grad_norm": 0.5046432614326477, + "learning_rate": 8.567537260468418e-05, + "loss": 0.04549593031406403, + "step": 100960 + }, + { + "epoch": 14.332150461320085, + "grad_norm": 21.737239837646484, + "learning_rate": 8.567395315826829e-05, + "loss": 0.06181545257568359, + "step": 100970 + }, + { + "epoch": 14.333569907735983, + "grad_norm": 0.727480411529541, + "learning_rate": 8.567253371185237e-05, + "loss": 0.0723923921585083, + "step": 100980 + }, + { + "epoch": 14.334989354151881, + "grad_norm": 8.854576110839844, + "learning_rate": 8.567111426543648e-05, + "loss": 0.02641754150390625, + "step": 100990 + }, + { + "epoch": 14.336408800567778, + "grad_norm": 10.398995399475098, + "learning_rate": 8.566969481902058e-05, + "loss": 0.008688996732234954, + "step": 101000 + }, + { + "epoch": 14.336408800567778, + "eval_accuracy": 0.9741209385133847, + "eval_loss": 0.10290984809398651, + "eval_runtime": 30.9993, + "eval_samples_per_second": 507.334, + "eval_steps_per_second": 15.871, + "step": 101000 + }, + { + "epoch": 14.337828246983676, + "grad_norm": 2.215137243270874, + "learning_rate": 8.566827537260469e-05, + "loss": 0.024131688475608825, + "step": 101010 + }, + { + "epoch": 14.339247693399575, + "grad_norm": 0.03819451108574867, + "learning_rate": 8.566685592618879e-05, + "loss": 0.06235557794570923, + "step": 101020 + }, + { + "epoch": 14.340667139815473, + "grad_norm": 4.079460144042969, + 
"learning_rate": 8.56654364797729e-05, + "loss": 0.009826949238777161, + "step": 101030 + }, + { + "epoch": 14.34208658623137, + "grad_norm": 2.0592312812805176, + "learning_rate": 8.5664017033357e-05, + "loss": 0.022684365510940552, + "step": 101040 + }, + { + "epoch": 14.343506032647268, + "grad_norm": 0.500636637210846, + "learning_rate": 8.56625975869411e-05, + "loss": 0.0208365797996521, + "step": 101050 + }, + { + "epoch": 14.344925479063166, + "grad_norm": 0.7281819581985474, + "learning_rate": 8.56611781405252e-05, + "loss": 0.009615353494882583, + "step": 101060 + }, + { + "epoch": 14.346344925479062, + "grad_norm": 1.5502413511276245, + "learning_rate": 8.56597586941093e-05, + "loss": 0.08351185917854309, + "step": 101070 + }, + { + "epoch": 14.34776437189496, + "grad_norm": 0.9243219494819641, + "learning_rate": 8.565833924769341e-05, + "loss": 0.0019005615264177322, + "step": 101080 + }, + { + "epoch": 14.349183818310859, + "grad_norm": 0.04442301765084267, + "learning_rate": 8.56569198012775e-05, + "loss": 0.013065469264984132, + "step": 101090 + }, + { + "epoch": 14.350603264726757, + "grad_norm": 0.8222534656524658, + "learning_rate": 8.565550035486161e-05, + "loss": 0.019597794115543365, + "step": 101100 + }, + { + "epoch": 14.352022711142654, + "grad_norm": 1.4117416143417358, + "learning_rate": 8.56540809084457e-05, + "loss": 0.02556995749473572, + "step": 101110 + }, + { + "epoch": 14.353442157558552, + "grad_norm": 1.0706210136413574, + "learning_rate": 8.565266146202982e-05, + "loss": 0.025179722905158998, + "step": 101120 + }, + { + "epoch": 14.35486160397445, + "grad_norm": 0.07932776212692261, + "learning_rate": 8.565124201561391e-05, + "loss": 0.04019379019737244, + "step": 101130 + }, + { + "epoch": 14.356281050390347, + "grad_norm": 0.0066423784010112286, + "learning_rate": 8.564982256919801e-05, + "loss": 0.004692386835813522, + "step": 101140 + }, + { + "epoch": 14.357700496806245, + "grad_norm": 0.3343643844127655, + "learning_rate": 
8.564840312278212e-05, + "loss": 0.01938415616750717, + "step": 101150 + }, + { + "epoch": 14.359119943222144, + "grad_norm": 0.09557344764471054, + "learning_rate": 8.564698367636622e-05, + "loss": 0.010768187046051026, + "step": 101160 + }, + { + "epoch": 14.360539389638042, + "grad_norm": 0.10270483791828156, + "learning_rate": 8.564556422995033e-05, + "loss": 0.028739473223686217, + "step": 101170 + }, + { + "epoch": 14.361958836053939, + "grad_norm": 2.6416287422180176, + "learning_rate": 8.564414478353443e-05, + "loss": 0.03519602417945862, + "step": 101180 + }, + { + "epoch": 14.363378282469837, + "grad_norm": 4.812742710113525, + "learning_rate": 8.564272533711852e-05, + "loss": 0.042847877740859984, + "step": 101190 + }, + { + "epoch": 14.364797728885735, + "grad_norm": 0.6029514670372009, + "learning_rate": 8.564130589070262e-05, + "loss": 0.05594496130943298, + "step": 101200 + }, + { + "epoch": 14.366217175301632, + "grad_norm": 5.7831196784973145, + "learning_rate": 8.563988644428673e-05, + "loss": 0.03804347813129425, + "step": 101210 + }, + { + "epoch": 14.36763662171753, + "grad_norm": 0.9964185357093811, + "learning_rate": 8.563846699787083e-05, + "loss": 0.007220058888196945, + "step": 101220 + }, + { + "epoch": 14.369056068133428, + "grad_norm": 0.31065285205841064, + "learning_rate": 8.563704755145494e-05, + "loss": 0.03414974212646484, + "step": 101230 + }, + { + "epoch": 14.370475514549327, + "grad_norm": 0.042580220848321915, + "learning_rate": 8.563562810503904e-05, + "loss": 0.028570058941841125, + "step": 101240 + }, + { + "epoch": 14.371894960965223, + "grad_norm": 0.6354950666427612, + "learning_rate": 8.563420865862314e-05, + "loss": 0.02657265365123749, + "step": 101250 + }, + { + "epoch": 14.373314407381121, + "grad_norm": 0.13414810597896576, + "learning_rate": 8.563278921220725e-05, + "loss": 0.0045540835708379745, + "step": 101260 + }, + { + "epoch": 14.37473385379702, + "grad_norm": 2.9977593421936035, + "learning_rate": 
8.563136976579134e-05, + "loss": 0.028857874870300292, + "step": 101270 + }, + { + "epoch": 14.376153300212916, + "grad_norm": 0.01566135697066784, + "learning_rate": 8.562995031937545e-05, + "loss": 0.005179446935653686, + "step": 101280 + }, + { + "epoch": 14.377572746628815, + "grad_norm": 0.35850459337234497, + "learning_rate": 8.562853087295954e-05, + "loss": 0.03362755179405212, + "step": 101290 + }, + { + "epoch": 14.378992193044713, + "grad_norm": 0.2453339397907257, + "learning_rate": 8.562711142654365e-05, + "loss": 0.008520130068063736, + "step": 101300 + }, + { + "epoch": 14.380411639460611, + "grad_norm": 0.022847548127174377, + "learning_rate": 8.562569198012775e-05, + "loss": 0.03245726227760315, + "step": 101310 + }, + { + "epoch": 14.381831085876508, + "grad_norm": 0.7326244711875916, + "learning_rate": 8.562427253371186e-05, + "loss": 0.015918411314487457, + "step": 101320 + }, + { + "epoch": 14.383250532292406, + "grad_norm": 2.888026714324951, + "learning_rate": 8.562285308729597e-05, + "loss": 0.022091734409332275, + "step": 101330 + }, + { + "epoch": 14.384669978708304, + "grad_norm": 0.3161379098892212, + "learning_rate": 8.562143364088005e-05, + "loss": 0.019298197329044343, + "step": 101340 + }, + { + "epoch": 14.3860894251242, + "grad_norm": 0.21936295926570892, + "learning_rate": 8.562001419446416e-05, + "loss": 0.011969022452831268, + "step": 101350 + }, + { + "epoch": 14.3875088715401, + "grad_norm": 4.343912601470947, + "learning_rate": 8.561859474804826e-05, + "loss": 0.018465761840343476, + "step": 101360 + }, + { + "epoch": 14.388928317955997, + "grad_norm": 1.221403956413269, + "learning_rate": 8.561717530163237e-05, + "loss": 0.005646177381277084, + "step": 101370 + }, + { + "epoch": 14.390347764371896, + "grad_norm": 0.12888500094413757, + "learning_rate": 8.561575585521647e-05, + "loss": 0.06626540422439575, + "step": 101380 + }, + { + "epoch": 14.391767210787792, + "grad_norm": 0.5282251834869385, + "learning_rate": 
8.561433640880058e-05, + "loss": 0.01441122591495514, + "step": 101390 + }, + { + "epoch": 14.39318665720369, + "grad_norm": 3.577329397201538, + "learning_rate": 8.561291696238466e-05, + "loss": 0.0062760643661022185, + "step": 101400 + }, + { + "epoch": 14.394606103619589, + "grad_norm": 1.0349695682525635, + "learning_rate": 8.561149751596878e-05, + "loss": 0.01954812407493591, + "step": 101410 + }, + { + "epoch": 14.396025550035485, + "grad_norm": 3.2017345428466797, + "learning_rate": 8.561007806955289e-05, + "loss": 0.014151862263679505, + "step": 101420 + }, + { + "epoch": 14.397444996451384, + "grad_norm": 1.0517817735671997, + "learning_rate": 8.560865862313698e-05, + "loss": 0.03300471901893616, + "step": 101430 + }, + { + "epoch": 14.398864442867282, + "grad_norm": 0.044288262724876404, + "learning_rate": 8.56072391767211e-05, + "loss": 0.004322785884141922, + "step": 101440 + }, + { + "epoch": 14.40028388928318, + "grad_norm": 1.9268205165863037, + "learning_rate": 8.560581973030518e-05, + "loss": 0.024862904846668244, + "step": 101450 + }, + { + "epoch": 14.401703335699077, + "grad_norm": 0.012334626168012619, + "learning_rate": 8.560440028388929e-05, + "loss": 0.019610564410686492, + "step": 101460 + }, + { + "epoch": 14.403122782114975, + "grad_norm": null, + "learning_rate": 8.560298083747339e-05, + "loss": 0.09853307008743287, + "step": 101470 + }, + { + "epoch": 14.404542228530874, + "grad_norm": 12.455414772033691, + "learning_rate": 8.560170333569908e-05, + "loss": 0.048549103736877444, + "step": 101480 + }, + { + "epoch": 14.40596167494677, + "grad_norm": 0.7663528919219971, + "learning_rate": 8.560028388928318e-05, + "loss": 0.06977788805961609, + "step": 101490 + }, + { + "epoch": 14.407381121362668, + "grad_norm": 0.030371706932783127, + "learning_rate": 8.559886444286729e-05, + "loss": 0.01331292986869812, + "step": 101500 + }, + { + "epoch": 14.407381121362668, + "eval_accuracy": 0.9783175430787817, + "eval_loss": 0.09227219969034195, 
+ "eval_runtime": 30.3574, + "eval_samples_per_second": 518.061, + "eval_steps_per_second": 16.207, + "step": 101500 + }, + { + "epoch": 14.408800567778567, + "grad_norm": 0.9815260767936707, + "learning_rate": 8.559744499645139e-05, + "loss": 0.04523753821849823, + "step": 101510 + }, + { + "epoch": 14.410220014194465, + "grad_norm": 4.4220428466796875, + "learning_rate": 8.559602555003549e-05, + "loss": 0.025368493795394898, + "step": 101520 + }, + { + "epoch": 14.411639460610361, + "grad_norm": 0.2869272530078888, + "learning_rate": 8.559460610361959e-05, + "loss": 0.006363069266080856, + "step": 101530 + }, + { + "epoch": 14.41305890702626, + "grad_norm": 0.7667129039764404, + "learning_rate": 8.55931866572037e-05, + "loss": 0.011126606166362763, + "step": 101540 + }, + { + "epoch": 14.414478353442158, + "grad_norm": 0.27252647280693054, + "learning_rate": 8.55917672107878e-05, + "loss": 0.04273441433906555, + "step": 101550 + }, + { + "epoch": 14.415897799858055, + "grad_norm": 11.699451446533203, + "learning_rate": 8.55903477643719e-05, + "loss": 0.016938979923725127, + "step": 101560 + }, + { + "epoch": 14.417317246273953, + "grad_norm": 3.4000279903411865, + "learning_rate": 8.5588928317956e-05, + "loss": 0.0042859077453613285, + "step": 101570 + }, + { + "epoch": 14.418736692689851, + "grad_norm": 0.18611261248588562, + "learning_rate": 8.55875088715401e-05, + "loss": 0.02584313750267029, + "step": 101580 + }, + { + "epoch": 14.42015613910575, + "grad_norm": 0.011047269217669964, + "learning_rate": 8.558608942512421e-05, + "loss": 0.017075327038764954, + "step": 101590 + }, + { + "epoch": 14.421575585521646, + "grad_norm": 7.181526184082031, + "learning_rate": 8.558466997870831e-05, + "loss": 0.010425643622875213, + "step": 101600 + }, + { + "epoch": 14.422995031937544, + "grad_norm": 0.1424635797739029, + "learning_rate": 8.558325053229242e-05, + "loss": 0.01081070452928543, + "step": 101610 + }, + { + "epoch": 14.424414478353443, + "grad_norm": 
0.01988801546394825, + "learning_rate": 8.55818310858765e-05, + "loss": 0.004163437709212303, + "step": 101620 + }, + { + "epoch": 14.42583392476934, + "grad_norm": 0.0159855168312788, + "learning_rate": 8.558041163946061e-05, + "loss": 0.05156056880950928, + "step": 101630 + }, + { + "epoch": 14.427253371185238, + "grad_norm": 2.1559231281280518, + "learning_rate": 8.557899219304471e-05, + "loss": 0.0501996636390686, + "step": 101640 + }, + { + "epoch": 14.428672817601136, + "grad_norm": 7.339823246002197, + "learning_rate": 8.557757274662882e-05, + "loss": 0.02520217001438141, + "step": 101650 + }, + { + "epoch": 14.430092264017034, + "grad_norm": 4.2832746505737305, + "learning_rate": 8.557615330021292e-05, + "loss": 0.011065931618213653, + "step": 101660 + }, + { + "epoch": 14.43151171043293, + "grad_norm": 8.455238342285156, + "learning_rate": 8.557473385379702e-05, + "loss": 0.04223898947238922, + "step": 101670 + }, + { + "epoch": 14.432931156848829, + "grad_norm": 0.3069610893726349, + "learning_rate": 8.557331440738113e-05, + "loss": 0.01566736549139023, + "step": 101680 + }, + { + "epoch": 14.434350603264727, + "grad_norm": 6.469844341278076, + "learning_rate": 8.557189496096522e-05, + "loss": 0.026878052949905397, + "step": 101690 + }, + { + "epoch": 14.435770049680624, + "grad_norm": 6.7413554191589355, + "learning_rate": 8.557047551454934e-05, + "loss": 0.050300925970077515, + "step": 101700 + }, + { + "epoch": 14.437189496096522, + "grad_norm": 1.7050913572311401, + "learning_rate": 8.556905606813343e-05, + "loss": 0.01144518330693245, + "step": 101710 + }, + { + "epoch": 14.43860894251242, + "grad_norm": 8.869119644165039, + "learning_rate": 8.556763662171754e-05, + "loss": 0.021273121237754822, + "step": 101720 + }, + { + "epoch": 14.440028388928319, + "grad_norm": 0.823733389377594, + "learning_rate": 8.556621717530163e-05, + "loss": 0.015097922086715699, + "step": 101730 + }, + { + "epoch": 14.441447835344215, + "grad_norm": 2.2480180263519287, + 
"learning_rate": 8.556479772888574e-05, + "loss": 0.058521485328674315, + "step": 101740 + }, + { + "epoch": 14.442867281760114, + "grad_norm": 3.448361873626709, + "learning_rate": 8.556337828246984e-05, + "loss": 0.012908448278903962, + "step": 101750 + }, + { + "epoch": 14.444286728176012, + "grad_norm": 7.998870849609375, + "learning_rate": 8.556195883605395e-05, + "loss": 0.013298434019088746, + "step": 101760 + }, + { + "epoch": 14.445706174591908, + "grad_norm": 6.137280464172363, + "learning_rate": 8.556053938963804e-05, + "loss": 0.016340239346027373, + "step": 101770 + }, + { + "epoch": 14.447125621007807, + "grad_norm": 5.501545429229736, + "learning_rate": 8.555911994322214e-05, + "loss": 0.03287135660648346, + "step": 101780 + }, + { + "epoch": 14.448545067423705, + "grad_norm": 1.2825032472610474, + "learning_rate": 8.555770049680625e-05, + "loss": 0.043364471197128295, + "step": 101790 + }, + { + "epoch": 14.449964513839603, + "grad_norm": 10.023609161376953, + "learning_rate": 8.555628105039035e-05, + "loss": 0.028557685017585755, + "step": 101800 + }, + { + "epoch": 14.4513839602555, + "grad_norm": 1.2176203727722168, + "learning_rate": 8.555486160397446e-05, + "loss": 0.043269476294517516, + "step": 101810 + }, + { + "epoch": 14.452803406671398, + "grad_norm": 0.30955979228019714, + "learning_rate": 8.555344215755856e-05, + "loss": 0.01596580147743225, + "step": 101820 + }, + { + "epoch": 14.454222853087296, + "grad_norm": 0.026654871180653572, + "learning_rate": 8.555202271114266e-05, + "loss": 0.0021469760686159134, + "step": 101830 + }, + { + "epoch": 14.455642299503193, + "grad_norm": 3.4512853622436523, + "learning_rate": 8.555060326472675e-05, + "loss": 0.04226863384246826, + "step": 101840 + }, + { + "epoch": 14.457061745919091, + "grad_norm": 0.1448906511068344, + "learning_rate": 8.554918381831086e-05, + "loss": 0.05672473907470703, + "step": 101850 + }, + { + "epoch": 14.45848119233499, + "grad_norm": 1.955114722251892, + 
"learning_rate": 8.554776437189496e-05, + "loss": 0.012461932003498077, + "step": 101860 + }, + { + "epoch": 14.459900638750888, + "grad_norm": 0.15195594727993011, + "learning_rate": 8.554634492547907e-05, + "loss": 0.014708581566810607, + "step": 101870 + }, + { + "epoch": 14.461320085166784, + "grad_norm": 0.371501624584198, + "learning_rate": 8.554492547906317e-05, + "loss": 0.04243779182434082, + "step": 101880 + }, + { + "epoch": 14.462739531582683, + "grad_norm": 14.460862159729004, + "learning_rate": 8.554350603264727e-05, + "loss": 0.03571424782276154, + "step": 101890 + }, + { + "epoch": 14.464158977998581, + "grad_norm": 9.164223670959473, + "learning_rate": 8.554208658623138e-05, + "loss": 0.030497419834136962, + "step": 101900 + }, + { + "epoch": 14.465578424414478, + "grad_norm": 4.030511856079102, + "learning_rate": 8.554066713981548e-05, + "loss": 0.037016880512237546, + "step": 101910 + }, + { + "epoch": 14.466997870830376, + "grad_norm": 4.120282173156738, + "learning_rate": 8.553924769339959e-05, + "loss": 0.04612137973308563, + "step": 101920 + }, + { + "epoch": 14.468417317246274, + "grad_norm": 0.1960458755493164, + "learning_rate": 8.553782824698367e-05, + "loss": 0.011224465072154998, + "step": 101930 + }, + { + "epoch": 14.469836763662173, + "grad_norm": 1.459161639213562, + "learning_rate": 8.553640880056778e-05, + "loss": 0.02568800449371338, + "step": 101940 + }, + { + "epoch": 14.471256210078069, + "grad_norm": 0.141305074095726, + "learning_rate": 8.553498935415188e-05, + "loss": 0.018661434948444366, + "step": 101950 + }, + { + "epoch": 14.472675656493967, + "grad_norm": 1.7660448551177979, + "learning_rate": 8.553356990773599e-05, + "loss": 0.011808550357818604, + "step": 101960 + }, + { + "epoch": 14.474095102909866, + "grad_norm": 0.19055935740470886, + "learning_rate": 8.553215046132009e-05, + "loss": 0.013704870641231538, + "step": 101970 + }, + { + "epoch": 14.475514549325762, + "grad_norm": 0.24393945932388306, + 
"learning_rate": 8.553073101490418e-05, + "loss": 0.03193598985671997, + "step": 101980 + }, + { + "epoch": 14.47693399574166, + "grad_norm": 3.6609933376312256, + "learning_rate": 8.55293115684883e-05, + "loss": 0.0095102459192276, + "step": 101990 + }, + { + "epoch": 14.478353442157559, + "grad_norm": 8.48030948638916, + "learning_rate": 8.552789212207239e-05, + "loss": 0.027347564697265625, + "step": 102000 + }, + { + "epoch": 14.478353442157559, + "eval_accuracy": 0.9863292427036306, + "eval_loss": 0.04714876785874367, + "eval_runtime": 31.0801, + "eval_samples_per_second": 506.015, + "eval_steps_per_second": 15.83, + "step": 102000 + }, + { + "epoch": 14.479772888573457, + "grad_norm": 0.44472378492355347, + "learning_rate": 8.55264726756565e-05, + "loss": 0.00895627737045288, + "step": 102010 + }, + { + "epoch": 14.481192334989354, + "grad_norm": 0.016872040927410126, + "learning_rate": 8.55250532292406e-05, + "loss": 0.026319512724876405, + "step": 102020 + }, + { + "epoch": 14.482611781405252, + "grad_norm": 0.049058735370635986, + "learning_rate": 8.55236337828247e-05, + "loss": 0.020491470396518708, + "step": 102030 + }, + { + "epoch": 14.48403122782115, + "grad_norm": 0.06828688085079193, + "learning_rate": 8.55222143364088e-05, + "loss": 0.029941469430923462, + "step": 102040 + }, + { + "epoch": 14.485450674237047, + "grad_norm": 0.29337137937545776, + "learning_rate": 8.55207948899929e-05, + "loss": 0.010589533299207688, + "step": 102050 + }, + { + "epoch": 14.486870120652945, + "grad_norm": 1.7915858030319214, + "learning_rate": 8.5519375443577e-05, + "loss": 0.0408454954624176, + "step": 102060 + }, + { + "epoch": 14.488289567068843, + "grad_norm": 0.12905509769916534, + "learning_rate": 8.551795599716111e-05, + "loss": 0.040449097752571106, + "step": 102070 + }, + { + "epoch": 14.489709013484742, + "grad_norm": 1.493906021118164, + "learning_rate": 8.551653655074521e-05, + "loss": 0.01022346168756485, + "step": 102080 + }, + { + "epoch": 
14.491128459900638, + "grad_norm": 1.35050368309021, + "learning_rate": 8.551511710432931e-05, + "loss": 0.00817287564277649, + "step": 102090 + }, + { + "epoch": 14.492547906316537, + "grad_norm": 0.04836224392056465, + "learning_rate": 8.551369765791342e-05, + "loss": 0.017035089433193207, + "step": 102100 + }, + { + "epoch": 14.493967352732435, + "grad_norm": 0.1411861628293991, + "learning_rate": 8.551227821149752e-05, + "loss": 0.04491946399211884, + "step": 102110 + }, + { + "epoch": 14.495386799148331, + "grad_norm": 0.44113588333129883, + "learning_rate": 8.551085876508163e-05, + "loss": 0.007547316700220108, + "step": 102120 + }, + { + "epoch": 14.49680624556423, + "grad_norm": 10.944716453552246, + "learning_rate": 8.550943931866573e-05, + "loss": 0.02461606413125992, + "step": 102130 + }, + { + "epoch": 14.498225691980128, + "grad_norm": 1.2483770847320557, + "learning_rate": 8.550801987224982e-05, + "loss": 0.012789353728294373, + "step": 102140 + }, + { + "epoch": 14.499645138396026, + "grad_norm": 6.052944660186768, + "learning_rate": 8.550660042583392e-05, + "loss": 0.010456231236457825, + "step": 102150 + }, + { + "epoch": 14.501064584811923, + "grad_norm": 1.0382543802261353, + "learning_rate": 8.550518097941803e-05, + "loss": 0.026821547746658327, + "step": 102160 + }, + { + "epoch": 14.502484031227821, + "grad_norm": 0.07013286650180817, + "learning_rate": 8.550376153300213e-05, + "loss": 0.027486056089401245, + "step": 102170 + }, + { + "epoch": 14.50390347764372, + "grad_norm": 0.03486839681863785, + "learning_rate": 8.550234208658624e-05, + "loss": 0.10720919370651245, + "step": 102180 + }, + { + "epoch": 14.505322924059616, + "grad_norm": 0.2477417290210724, + "learning_rate": 8.550092264017034e-05, + "loss": 0.014333716034889222, + "step": 102190 + }, + { + "epoch": 14.506742370475514, + "grad_norm": 0.23805853724479675, + "learning_rate": 8.549950319375443e-05, + "loss": 0.018902239203453065, + "step": 102200 + }, + { + "epoch": 
14.508161816891413, + "grad_norm": 0.04487096145749092, + "learning_rate": 8.549808374733855e-05, + "loss": 0.013421098887920379, + "step": 102210 + }, + { + "epoch": 14.509581263307311, + "grad_norm": 0.6457217931747437, + "learning_rate": 8.549666430092264e-05, + "loss": 0.06798742413520813, + "step": 102220 + }, + { + "epoch": 14.511000709723207, + "grad_norm": 0.8722721934318542, + "learning_rate": 8.549524485450675e-05, + "loss": 0.019697029888629914, + "step": 102230 + }, + { + "epoch": 14.512420156139106, + "grad_norm": 0.5466771721839905, + "learning_rate": 8.549382540809084e-05, + "loss": 0.016688692569732665, + "step": 102240 + }, + { + "epoch": 14.513839602555004, + "grad_norm": 0.023246901109814644, + "learning_rate": 8.549240596167495e-05, + "loss": 0.005826687440276146, + "step": 102250 + }, + { + "epoch": 14.5152590489709, + "grad_norm": 0.37465423345565796, + "learning_rate": 8.549098651525905e-05, + "loss": 0.017506250739097597, + "step": 102260 + }, + { + "epoch": 14.516678495386799, + "grad_norm": 0.033044762909412384, + "learning_rate": 8.548956706884316e-05, + "loss": 0.05119068622589111, + "step": 102270 + }, + { + "epoch": 14.518097941802697, + "grad_norm": 8.139904975891113, + "learning_rate": 8.548814762242727e-05, + "loss": 0.053800547122955324, + "step": 102280 + }, + { + "epoch": 14.519517388218595, + "grad_norm": 5.402328014373779, + "learning_rate": 8.548672817601135e-05, + "loss": 0.009679973125457764, + "step": 102290 + }, + { + "epoch": 14.520936834634492, + "grad_norm": 0.05474318191409111, + "learning_rate": 8.548530872959546e-05, + "loss": 0.005274232476949692, + "step": 102300 + }, + { + "epoch": 14.52235628105039, + "grad_norm": 0.6504866480827332, + "learning_rate": 8.548388928317956e-05, + "loss": 0.02782217264175415, + "step": 102310 + }, + { + "epoch": 14.523775727466289, + "grad_norm": 0.7330292463302612, + "learning_rate": 8.548246983676367e-05, + "loss": 0.08225621581077576, + "step": 102320 + }, + { + "epoch": 
14.525195173882185, + "grad_norm": 0.9900355339050293, + "learning_rate": 8.548105039034777e-05, + "loss": 0.006571047753095627, + "step": 102330 + }, + { + "epoch": 14.526614620298083, + "grad_norm": 0.028682060539722443, + "learning_rate": 8.547963094393187e-05, + "loss": 0.0058587446808815, + "step": 102340 + }, + { + "epoch": 14.528034066713982, + "grad_norm": 0.07497026026248932, + "learning_rate": 8.547821149751596e-05, + "loss": 0.004919090494513512, + "step": 102350 + }, + { + "epoch": 14.52945351312988, + "grad_norm": 1.8536876440048218, + "learning_rate": 8.547679205110007e-05, + "loss": 0.011548362672328949, + "step": 102360 + }, + { + "epoch": 14.530872959545777, + "grad_norm": 0.7886669039726257, + "learning_rate": 8.547537260468418e-05, + "loss": 0.021945886313915253, + "step": 102370 + }, + { + "epoch": 14.532292405961675, + "grad_norm": 0.4612744450569153, + "learning_rate": 8.547395315826828e-05, + "loss": 0.051427000761032106, + "step": 102380 + }, + { + "epoch": 14.533711852377573, + "grad_norm": 0.23230144381523132, + "learning_rate": 8.547253371185238e-05, + "loss": 0.03924002945423126, + "step": 102390 + }, + { + "epoch": 14.53513129879347, + "grad_norm": 3.556591033935547, + "learning_rate": 8.547111426543648e-05, + "loss": 0.04937024414539337, + "step": 102400 + }, + { + "epoch": 14.536550745209368, + "grad_norm": 0.04170704260468483, + "learning_rate": 8.546969481902059e-05, + "loss": 0.05127858519554138, + "step": 102410 + }, + { + "epoch": 14.537970191625266, + "grad_norm": 7.728824615478516, + "learning_rate": 8.546827537260469e-05, + "loss": 0.05602442026138306, + "step": 102420 + }, + { + "epoch": 14.539389638041165, + "grad_norm": 5.621062755584717, + "learning_rate": 8.54668559261888e-05, + "loss": 0.017265281081199645, + "step": 102430 + }, + { + "epoch": 14.540809084457061, + "grad_norm": 0.029028164222836494, + "learning_rate": 8.54654364797729e-05, + "loss": 0.007049372792243958, + "step": 102440 + }, + { + "epoch": 
14.54222853087296, + "grad_norm": 0.06667573004961014, + "learning_rate": 8.546401703335699e-05, + "loss": 0.005283458903431892, + "step": 102450 + }, + { + "epoch": 14.543647977288858, + "grad_norm": 0.7624027729034424, + "learning_rate": 8.54625975869411e-05, + "loss": 0.017357051372528076, + "step": 102460 + }, + { + "epoch": 14.545067423704754, + "grad_norm": 0.05842301622033119, + "learning_rate": 8.54611781405252e-05, + "loss": 0.010766969621181488, + "step": 102470 + }, + { + "epoch": 14.546486870120653, + "grad_norm": 0.3239707350730896, + "learning_rate": 8.545975869410931e-05, + "loss": 0.010560451447963715, + "step": 102480 + }, + { + "epoch": 14.547906316536551, + "grad_norm": 2.594454526901245, + "learning_rate": 8.545833924769341e-05, + "loss": 0.03495635986328125, + "step": 102490 + }, + { + "epoch": 14.54932576295245, + "grad_norm": 0.0778871476650238, + "learning_rate": 8.54569198012775e-05, + "loss": 0.016888731718063356, + "step": 102500 + }, + { + "epoch": 14.54932576295245, + "eval_accuracy": 0.9813696191263432, + "eval_loss": 0.07888679951429367, + "eval_runtime": 30.2995, + "eval_samples_per_second": 519.051, + "eval_steps_per_second": 16.238, + "step": 102500 + }, + { + "epoch": 14.550745209368346, + "grad_norm": 0.047318752855062485, + "learning_rate": 8.54555003548616e-05, + "loss": 0.028882688283920287, + "step": 102510 + }, + { + "epoch": 14.552164655784244, + "grad_norm": 1.6493586301803589, + "learning_rate": 8.545408090844571e-05, + "loss": 0.04082152545452118, + "step": 102520 + }, + { + "epoch": 14.553584102200142, + "grad_norm": 1.9483829736709595, + "learning_rate": 8.545266146202981e-05, + "loss": 0.02645583152770996, + "step": 102530 + }, + { + "epoch": 14.555003548616039, + "grad_norm": 18.570716857910156, + "learning_rate": 8.545124201561392e-05, + "loss": 0.0315351814031601, + "step": 102540 + }, + { + "epoch": 14.556422995031937, + "grad_norm": 0.7574842572212219, + "learning_rate": 8.544982256919802e-05, + "loss": 
0.012972161173820496, + "step": 102550 + }, + { + "epoch": 14.557842441447836, + "grad_norm": 0.3595932722091675, + "learning_rate": 8.544840312278212e-05, + "loss": 0.0756956934928894, + "step": 102560 + }, + { + "epoch": 14.559261887863734, + "grad_norm": 4.002671718597412, + "learning_rate": 8.544698367636623e-05, + "loss": 0.04419144093990326, + "step": 102570 + }, + { + "epoch": 14.56068133427963, + "grad_norm": 0.07301048189401627, + "learning_rate": 8.544556422995032e-05, + "loss": 0.02443731129169464, + "step": 102580 + }, + { + "epoch": 14.562100780695529, + "grad_norm": 0.05954832211136818, + "learning_rate": 8.544414478353444e-05, + "loss": 0.02469266653060913, + "step": 102590 + }, + { + "epoch": 14.563520227111427, + "grad_norm": 0.6604858636856079, + "learning_rate": 8.544272533711852e-05, + "loss": 0.0301352858543396, + "step": 102600 + }, + { + "epoch": 14.564939673527324, + "grad_norm": 0.09414374083280563, + "learning_rate": 8.544130589070263e-05, + "loss": 0.010161099582910537, + "step": 102610 + }, + { + "epoch": 14.566359119943222, + "grad_norm": 0.8994581699371338, + "learning_rate": 8.543988644428673e-05, + "loss": 0.0168968603014946, + "step": 102620 + }, + { + "epoch": 14.56777856635912, + "grad_norm": 0.01584504544734955, + "learning_rate": 8.543846699787084e-05, + "loss": 0.04087940454483032, + "step": 102630 + }, + { + "epoch": 14.569198012775018, + "grad_norm": 1.0661892890930176, + "learning_rate": 8.543704755145494e-05, + "loss": 0.013919854164123535, + "step": 102640 + }, + { + "epoch": 14.570617459190915, + "grad_norm": 10.946099281311035, + "learning_rate": 8.543562810503903e-05, + "loss": 0.023270314931869505, + "step": 102650 + }, + { + "epoch": 14.572036905606813, + "grad_norm": 1.4906280040740967, + "learning_rate": 8.543420865862314e-05, + "loss": 0.03611093461513519, + "step": 102660 + }, + { + "epoch": 14.573456352022712, + "grad_norm": 0.058776769787073135, + "learning_rate": 8.543278921220724e-05, + "loss": 
0.011062215268611907, + "step": 102670 + }, + { + "epoch": 14.574875798438608, + "grad_norm": 1.3882954120635986, + "learning_rate": 8.543136976579135e-05, + "loss": 0.02433699816465378, + "step": 102680 + }, + { + "epoch": 14.576295244854506, + "grad_norm": 0.5981866121292114, + "learning_rate": 8.542995031937545e-05, + "loss": 0.017913120985031127, + "step": 102690 + }, + { + "epoch": 14.577714691270405, + "grad_norm": 0.05415025353431702, + "learning_rate": 8.542853087295955e-05, + "loss": 0.0577204167842865, + "step": 102700 + }, + { + "epoch": 14.579134137686303, + "grad_norm": 0.62937992811203, + "learning_rate": 8.542711142654364e-05, + "loss": 0.00876297652721405, + "step": 102710 + }, + { + "epoch": 14.5805535841022, + "grad_norm": 0.33898359537124634, + "learning_rate": 8.542569198012776e-05, + "loss": 0.05263091921806336, + "step": 102720 + }, + { + "epoch": 14.581973030518098, + "grad_norm": 0.11864381283521652, + "learning_rate": 8.542427253371185e-05, + "loss": 0.042475050687789916, + "step": 102730 + }, + { + "epoch": 14.583392476933996, + "grad_norm": 11.901185035705566, + "learning_rate": 8.542285308729596e-05, + "loss": 0.0330029308795929, + "step": 102740 + }, + { + "epoch": 14.584811923349893, + "grad_norm": 8.142967224121094, + "learning_rate": 8.542143364088006e-05, + "loss": 0.026455461978912354, + "step": 102750 + }, + { + "epoch": 14.586231369765791, + "grad_norm": 8.642476081848145, + "learning_rate": 8.542001419446416e-05, + "loss": 0.02869180738925934, + "step": 102760 + }, + { + "epoch": 14.58765081618169, + "grad_norm": 7.578298091888428, + "learning_rate": 8.541859474804827e-05, + "loss": 0.012754182517528533, + "step": 102770 + }, + { + "epoch": 14.589070262597588, + "grad_norm": 0.013344738632440567, + "learning_rate": 8.541717530163237e-05, + "loss": 0.047271886467933656, + "step": 102780 + }, + { + "epoch": 14.590489709013484, + "grad_norm": 0.1891198754310608, + "learning_rate": 8.541575585521648e-05, + "loss": 
0.004413479566574096, + "step": 102790 + }, + { + "epoch": 14.591909155429382, + "grad_norm": 0.11714489012956619, + "learning_rate": 8.541433640880058e-05, + "loss": 0.012677499651908874, + "step": 102800 + }, + { + "epoch": 14.59332860184528, + "grad_norm": 0.3130483031272888, + "learning_rate": 8.541291696238467e-05, + "loss": 0.005448834598064422, + "step": 102810 + }, + { + "epoch": 14.594748048261177, + "grad_norm": 3.508953094482422, + "learning_rate": 8.541149751596877e-05, + "loss": 0.03599079251289368, + "step": 102820 + }, + { + "epoch": 14.596167494677076, + "grad_norm": 0.023073973134160042, + "learning_rate": 8.541007806955288e-05, + "loss": 0.024753133952617645, + "step": 102830 + }, + { + "epoch": 14.597586941092974, + "grad_norm": 0.10521383583545685, + "learning_rate": 8.540865862313698e-05, + "loss": 0.017889854311943055, + "step": 102840 + }, + { + "epoch": 14.599006387508872, + "grad_norm": 14.080901145935059, + "learning_rate": 8.540723917672109e-05, + "loss": 0.020883312821388243, + "step": 102850 + }, + { + "epoch": 14.600425833924769, + "grad_norm": 13.305614471435547, + "learning_rate": 8.540581973030519e-05, + "loss": 0.03402230143547058, + "step": 102860 + }, + { + "epoch": 14.601845280340667, + "grad_norm": 12.037869453430176, + "learning_rate": 8.540440028388928e-05, + "loss": 0.05208061933517456, + "step": 102870 + }, + { + "epoch": 14.603264726756565, + "grad_norm": 0.06191776320338249, + "learning_rate": 8.54029808374734e-05, + "loss": 0.03488814234733582, + "step": 102880 + }, + { + "epoch": 14.604684173172462, + "grad_norm": 3.047244071960449, + "learning_rate": 8.540156139105749e-05, + "loss": 0.031791788339614865, + "step": 102890 + }, + { + "epoch": 14.60610361958836, + "grad_norm": 0.5617519021034241, + "learning_rate": 8.54001419446416e-05, + "loss": 0.021275760233402254, + "step": 102900 + }, + { + "epoch": 14.607523066004259, + "grad_norm": 7.143520355224609, + "learning_rate": 8.539872249822569e-05, + "loss": 
0.02951667606830597, + "step": 102910 + }, + { + "epoch": 14.608942512420157, + "grad_norm": 0.12056829035282135, + "learning_rate": 8.53973030518098e-05, + "loss": 0.006852982938289643, + "step": 102920 + }, + { + "epoch": 14.610361958836053, + "grad_norm": 2.9511711597442627, + "learning_rate": 8.53958836053939e-05, + "loss": 0.03665172159671783, + "step": 102930 + }, + { + "epoch": 14.611781405251952, + "grad_norm": 0.8545325398445129, + "learning_rate": 8.5394464158978e-05, + "loss": 0.028074532747268677, + "step": 102940 + }, + { + "epoch": 14.61320085166785, + "grad_norm": 0.16128212213516235, + "learning_rate": 8.53930447125621e-05, + "loss": 0.05112728476524353, + "step": 102950 + }, + { + "epoch": 14.614620298083747, + "grad_norm": 0.2537463307380676, + "learning_rate": 8.53916252661462e-05, + "loss": 0.030259785056114197, + "step": 102960 + }, + { + "epoch": 14.616039744499645, + "grad_norm": 0.07488785684108734, + "learning_rate": 8.539020581973031e-05, + "loss": 0.015859323740005492, + "step": 102970 + }, + { + "epoch": 14.617459190915543, + "grad_norm": 6.450938701629639, + "learning_rate": 8.538878637331441e-05, + "loss": 0.03825899958610535, + "step": 102980 + }, + { + "epoch": 14.618878637331441, + "grad_norm": 0.20200957357883453, + "learning_rate": 8.538736692689852e-05, + "loss": 0.010987403988838195, + "step": 102990 + }, + { + "epoch": 14.620298083747338, + "grad_norm": 2.432180166244507, + "learning_rate": 8.538594748048262e-05, + "loss": 0.020841056108474733, + "step": 103000 + }, + { + "epoch": 14.620298083747338, + "eval_accuracy": 0.9786990525847269, + "eval_loss": 0.07823944091796875, + "eval_runtime": 30.9251, + "eval_samples_per_second": 508.551, + "eval_steps_per_second": 15.909, + "step": 103000 + }, + { + "epoch": 14.621717530163236, + "grad_norm": 19.911706924438477, + "learning_rate": 8.538452803406672e-05, + "loss": 0.016798266768455507, + "step": 103010 + }, + { + "epoch": 14.623136976579135, + "grad_norm": 4.428212642669678, + 
"learning_rate": 8.538310858765081e-05, + "loss": 0.009555123746395111, + "step": 103020 + }, + { + "epoch": 14.624556422995031, + "grad_norm": 0.3057141900062561, + "learning_rate": 8.538168914123492e-05, + "loss": 0.03787533640861511, + "step": 103030 + }, + { + "epoch": 14.62597586941093, + "grad_norm": 0.4630641043186188, + "learning_rate": 8.538026969481902e-05, + "loss": 0.006291747093200684, + "step": 103040 + }, + { + "epoch": 14.627395315826828, + "grad_norm": 2.2272002696990967, + "learning_rate": 8.537885024840313e-05, + "loss": 0.011624373495578766, + "step": 103050 + }, + { + "epoch": 14.628814762242726, + "grad_norm": 10.10580825805664, + "learning_rate": 8.537743080198723e-05, + "loss": 0.049487939476966857, + "step": 103060 + }, + { + "epoch": 14.630234208658623, + "grad_norm": 3.810765027999878, + "learning_rate": 8.537601135557133e-05, + "loss": 0.053276628255844116, + "step": 103070 + }, + { + "epoch": 14.63165365507452, + "grad_norm": 0.27761077880859375, + "learning_rate": 8.537459190915544e-05, + "loss": 0.05323188304901123, + "step": 103080 + }, + { + "epoch": 14.63307310149042, + "grad_norm": 0.04070013016462326, + "learning_rate": 8.537317246273953e-05, + "loss": 0.0037203233689069746, + "step": 103090 + }, + { + "epoch": 14.634492547906316, + "grad_norm": 0.011536057107150555, + "learning_rate": 8.537175301632365e-05, + "loss": 0.019993194937705995, + "step": 103100 + }, + { + "epoch": 14.635911994322214, + "grad_norm": 0.10641834884881973, + "learning_rate": 8.537033356990774e-05, + "loss": 0.00649840384721756, + "step": 103110 + }, + { + "epoch": 14.637331440738112, + "grad_norm": 0.5350741744041443, + "learning_rate": 8.536891412349184e-05, + "loss": 0.009857784211635589, + "step": 103120 + }, + { + "epoch": 14.63875088715401, + "grad_norm": 0.06728602200746536, + "learning_rate": 8.536749467707594e-05, + "loss": 0.04699492752552033, + "step": 103130 + }, + { + "epoch": 14.640170333569907, + "grad_norm": 4.469810485839844, + 
"learning_rate": 8.536607523066005e-05, + "loss": 0.007526058703660965, + "step": 103140 + }, + { + "epoch": 14.641589779985805, + "grad_norm": 0.19321392476558685, + "learning_rate": 8.536465578424415e-05, + "loss": 0.012016779184341431, + "step": 103150 + }, + { + "epoch": 14.643009226401704, + "grad_norm": 0.7691546678543091, + "learning_rate": 8.536323633782826e-05, + "loss": 0.021825101971626282, + "step": 103160 + }, + { + "epoch": 14.6444286728176, + "grad_norm": 1.6768932342529297, + "learning_rate": 8.536181689141235e-05, + "loss": 0.014258480072021485, + "step": 103170 + }, + { + "epoch": 14.645848119233499, + "grad_norm": 0.3741374909877777, + "learning_rate": 8.536039744499645e-05, + "loss": 0.022825604677200316, + "step": 103180 + }, + { + "epoch": 14.647267565649397, + "grad_norm": 0.8053756356239319, + "learning_rate": 8.535897799858056e-05, + "loss": 0.013836804032325744, + "step": 103190 + }, + { + "epoch": 14.648687012065295, + "grad_norm": 0.02035684697329998, + "learning_rate": 8.535755855216466e-05, + "loss": 0.017511588335037232, + "step": 103200 + }, + { + "epoch": 14.650106458481192, + "grad_norm": 3.142700433731079, + "learning_rate": 8.535613910574877e-05, + "loss": 0.010556647181510925, + "step": 103210 + }, + { + "epoch": 14.65152590489709, + "grad_norm": 0.06115235015749931, + "learning_rate": 8.535471965933285e-05, + "loss": 0.013547767698764802, + "step": 103220 + }, + { + "epoch": 14.652945351312988, + "grad_norm": 0.029936334118247032, + "learning_rate": 8.535330021291697e-05, + "loss": 0.012430180609226228, + "step": 103230 + }, + { + "epoch": 14.654364797728885, + "grad_norm": 8.30500316619873, + "learning_rate": 8.535188076650106e-05, + "loss": 0.06894768476486206, + "step": 103240 + }, + { + "epoch": 14.655784244144783, + "grad_norm": 0.31151121854782104, + "learning_rate": 8.535046132008517e-05, + "loss": 0.02243182957172394, + "step": 103250 + }, + { + "epoch": 14.657203690560682, + "grad_norm": 3.7427921295166016, + 
"learning_rate": 8.534904187366927e-05, + "loss": 0.05491254329681396, + "step": 103260 + }, + { + "epoch": 14.65862313697658, + "grad_norm": 6.065624237060547, + "learning_rate": 8.534762242725337e-05, + "loss": 0.02661764919757843, + "step": 103270 + }, + { + "epoch": 14.660042583392476, + "grad_norm": 0.13813666999340057, + "learning_rate": 8.534620298083748e-05, + "loss": 0.007654508948326111, + "step": 103280 + }, + { + "epoch": 14.661462029808375, + "grad_norm": 0.08765114843845367, + "learning_rate": 8.534478353442158e-05, + "loss": 0.017231097817420958, + "step": 103290 + }, + { + "epoch": 14.662881476224273, + "grad_norm": 0.5863285064697266, + "learning_rate": 8.534336408800569e-05, + "loss": 0.030512651801109313, + "step": 103300 + }, + { + "epoch": 14.66430092264017, + "grad_norm": 0.07344143837690353, + "learning_rate": 8.534194464158979e-05, + "loss": 0.009811799228191375, + "step": 103310 + }, + { + "epoch": 14.665720369056068, + "grad_norm": 6.590825080871582, + "learning_rate": 8.534052519517388e-05, + "loss": 0.008233676850795745, + "step": 103320 + }, + { + "epoch": 14.667139815471966, + "grad_norm": 6.864149570465088, + "learning_rate": 8.533910574875798e-05, + "loss": 0.010529959201812744, + "step": 103330 + }, + { + "epoch": 14.668559261887864, + "grad_norm": 0.1694544553756714, + "learning_rate": 8.533768630234209e-05, + "loss": 0.022904330492019655, + "step": 103340 + }, + { + "epoch": 14.669978708303761, + "grad_norm": 0.5476866960525513, + "learning_rate": 8.533626685592619e-05, + "loss": 0.030253446102142333, + "step": 103350 + }, + { + "epoch": 14.67139815471966, + "grad_norm": 0.27971404790878296, + "learning_rate": 8.53348474095103e-05, + "loss": 0.030365151166915894, + "step": 103360 + }, + { + "epoch": 14.672817601135558, + "grad_norm": 0.12163878977298737, + "learning_rate": 8.53334279630944e-05, + "loss": 0.018135757744312288, + "step": 103370 + }, + { + "epoch": 14.674237047551454, + "grad_norm": 1.1759569644927979, + 
"learning_rate": 8.53320085166785e-05, + "loss": 0.016406714916229248, + "step": 103380 + }, + { + "epoch": 14.675656493967352, + "grad_norm": 9.473148345947266, + "learning_rate": 8.53305890702626e-05, + "loss": 0.08271357417106628, + "step": 103390 + }, + { + "epoch": 14.67707594038325, + "grad_norm": 12.95916748046875, + "learning_rate": 8.53291696238467e-05, + "loss": 0.05103471279144287, + "step": 103400 + }, + { + "epoch": 14.678495386799149, + "grad_norm": 0.7540145516395569, + "learning_rate": 8.532775017743081e-05, + "loss": 0.06290702819824219, + "step": 103410 + }, + { + "epoch": 14.679914833215046, + "grad_norm": 0.44665971398353577, + "learning_rate": 8.53263307310149e-05, + "loss": 0.015360213816165924, + "step": 103420 + }, + { + "epoch": 14.681334279630944, + "grad_norm": 0.027483150362968445, + "learning_rate": 8.532491128459901e-05, + "loss": 0.013640022277832032, + "step": 103430 + }, + { + "epoch": 14.682753726046842, + "grad_norm": 0.5103288292884827, + "learning_rate": 8.53234918381831e-05, + "loss": 0.003732307255268097, + "step": 103440 + }, + { + "epoch": 14.684173172462739, + "grad_norm": 0.006598861422389746, + "learning_rate": 8.532207239176722e-05, + "loss": 0.001814265176653862, + "step": 103450 + }, + { + "epoch": 14.685592618878637, + "grad_norm": 0.10724376887083054, + "learning_rate": 8.532065294535131e-05, + "loss": 0.004641066119074821, + "step": 103460 + }, + { + "epoch": 14.687012065294535, + "grad_norm": 0.1681101769208908, + "learning_rate": 8.531923349893542e-05, + "loss": 0.021464285254478455, + "step": 103470 + }, + { + "epoch": 14.688431511710434, + "grad_norm": 0.023226885125041008, + "learning_rate": 8.531781405251952e-05, + "loss": 0.012756478786468507, + "step": 103480 + }, + { + "epoch": 14.68985095812633, + "grad_norm": 1.445003867149353, + "learning_rate": 8.531639460610362e-05, + "loss": 0.017902496457099914, + "step": 103490 + }, + { + "epoch": 14.691270404542228, + "grad_norm": 0.3953350782394409, + 
"learning_rate": 8.531497515968773e-05, + "loss": 0.012959374487400055, + "step": 103500 + }, + { + "epoch": 14.691270404542228, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.041505590081214905, + "eval_runtime": 31.1689, + "eval_samples_per_second": 504.573, + "eval_steps_per_second": 15.785, + "step": 103500 + }, + { + "epoch": 14.692689850958127, + "grad_norm": 0.17475774884223938, + "learning_rate": 8.531355571327183e-05, + "loss": 0.023621699213981627, + "step": 103510 + }, + { + "epoch": 14.694109297374023, + "grad_norm": 3.899446487426758, + "learning_rate": 8.531213626685594e-05, + "loss": 0.03020642101764679, + "step": 103520 + }, + { + "epoch": 14.695528743789922, + "grad_norm": 3.107422113418579, + "learning_rate": 8.531071682044002e-05, + "loss": 0.03576629757881165, + "step": 103530 + }, + { + "epoch": 14.69694819020582, + "grad_norm": 0.12557634711265564, + "learning_rate": 8.530929737402413e-05, + "loss": 0.026172888278961182, + "step": 103540 + }, + { + "epoch": 14.698367636621718, + "grad_norm": 0.07338567078113556, + "learning_rate": 8.530787792760823e-05, + "loss": 0.0036056622862815856, + "step": 103550 + }, + { + "epoch": 14.699787083037615, + "grad_norm": 1.2753487825393677, + "learning_rate": 8.530660042583393e-05, + "loss": 0.030322042107582093, + "step": 103560 + }, + { + "epoch": 14.701206529453513, + "grad_norm": 0.45679599046707153, + "learning_rate": 8.530518097941803e-05, + "loss": 0.013920623064041137, + "step": 103570 + }, + { + "epoch": 14.702625975869411, + "grad_norm": 0.35328221321105957, + "learning_rate": 8.530376153300214e-05, + "loss": 0.03288788497447968, + "step": 103580 + }, + { + "epoch": 14.704045422285308, + "grad_norm": 0.04782756417989731, + "learning_rate": 8.530234208658624e-05, + "loss": 0.0034589260816574098, + "step": 103590 + }, + { + "epoch": 14.705464868701206, + "grad_norm": 1.0690654516220093, + "learning_rate": 8.530092264017033e-05, + "loss": 0.005941495299339294, + "step": 103600 + }, + { + 
"epoch": 14.706884315117104, + "grad_norm": 6.147830486297607, + "learning_rate": 8.529950319375444e-05, + "loss": 0.01815780848264694, + "step": 103610 + }, + { + "epoch": 14.708303761533003, + "grad_norm": 0.023603355512022972, + "learning_rate": 8.529808374733854e-05, + "loss": 0.010245384275913238, + "step": 103620 + }, + { + "epoch": 14.7097232079489, + "grad_norm": 2.9636645317077637, + "learning_rate": 8.529666430092265e-05, + "loss": 0.016753530502319335, + "step": 103630 + }, + { + "epoch": 14.711142654364798, + "grad_norm": 0.7192009091377258, + "learning_rate": 8.529524485450675e-05, + "loss": 0.030516594648361206, + "step": 103640 + }, + { + "epoch": 14.712562100780696, + "grad_norm": 0.4792954623699188, + "learning_rate": 8.529382540809085e-05, + "loss": 0.022221264243125916, + "step": 103650 + }, + { + "epoch": 14.713981547196592, + "grad_norm": 5.789441108703613, + "learning_rate": 8.529240596167494e-05, + "loss": 0.012740533053874969, + "step": 103660 + }, + { + "epoch": 14.71540099361249, + "grad_norm": 4.176222801208496, + "learning_rate": 8.529098651525905e-05, + "loss": 0.009820845723152161, + "step": 103670 + }, + { + "epoch": 14.716820440028389, + "grad_norm": 0.06941059231758118, + "learning_rate": 8.528956706884315e-05, + "loss": 0.013245144486427307, + "step": 103680 + }, + { + "epoch": 14.718239886444287, + "grad_norm": 7.595728397369385, + "learning_rate": 8.528814762242726e-05, + "loss": 0.04703215062618256, + "step": 103690 + }, + { + "epoch": 14.719659332860184, + "grad_norm": 0.09465227276086807, + "learning_rate": 8.528672817601136e-05, + "loss": 0.029085099697113037, + "step": 103700 + }, + { + "epoch": 14.721078779276082, + "grad_norm": 0.2000981569290161, + "learning_rate": 8.528530872959546e-05, + "loss": 0.008482570946216583, + "step": 103710 + }, + { + "epoch": 14.72249822569198, + "grad_norm": 9.368997573852539, + "learning_rate": 8.528388928317957e-05, + "loss": 0.01685175597667694, + "step": 103720 + }, + { + "epoch": 
14.723917672107877, + "grad_norm": 2.1341898441314697, + "learning_rate": 8.528246983676367e-05, + "loss": 0.010621855407953263, + "step": 103730 + }, + { + "epoch": 14.725337118523775, + "grad_norm": 4.980072021484375, + "learning_rate": 8.528105039034778e-05, + "loss": 0.04767651557922363, + "step": 103740 + }, + { + "epoch": 14.726756564939674, + "grad_norm": 8.494141578674316, + "learning_rate": 8.527963094393186e-05, + "loss": 0.007886487245559692, + "step": 103750 + }, + { + "epoch": 14.728176011355572, + "grad_norm": 0.6012231707572937, + "learning_rate": 8.527821149751597e-05, + "loss": 0.028650522232055664, + "step": 103760 + }, + { + "epoch": 14.729595457771469, + "grad_norm": 0.052750129252672195, + "learning_rate": 8.527679205110007e-05, + "loss": 0.011216586828231812, + "step": 103770 + }, + { + "epoch": 14.731014904187367, + "grad_norm": 0.11503159999847412, + "learning_rate": 8.527537260468418e-05, + "loss": 0.03377627432346344, + "step": 103780 + }, + { + "epoch": 14.732434350603265, + "grad_norm": 4.543156147003174, + "learning_rate": 8.527395315826828e-05, + "loss": 0.011570049822330475, + "step": 103790 + }, + { + "epoch": 14.733853797019162, + "grad_norm": 4.640334606170654, + "learning_rate": 8.527253371185239e-05, + "loss": 0.01897972822189331, + "step": 103800 + }, + { + "epoch": 14.73527324343506, + "grad_norm": 0.9010857939720154, + "learning_rate": 8.527111426543649e-05, + "loss": 0.009830554574728012, + "step": 103810 + }, + { + "epoch": 14.736692689850958, + "grad_norm": 16.031429290771484, + "learning_rate": 8.526969481902058e-05, + "loss": 0.023026317358016968, + "step": 103820 + }, + { + "epoch": 14.738112136266857, + "grad_norm": 0.9265277981758118, + "learning_rate": 8.52682753726047e-05, + "loss": 0.007235559821128845, + "step": 103830 + }, + { + "epoch": 14.739531582682753, + "grad_norm": 0.044633179903030396, + "learning_rate": 8.526685592618879e-05, + "loss": 0.07692786455154418, + "step": 103840 + }, + { + "epoch": 
14.740951029098651, + "grad_norm": 0.18914464116096497, + "learning_rate": 8.52654364797729e-05, + "loss": 0.042169079184532166, + "step": 103850 + }, + { + "epoch": 14.74237047551455, + "grad_norm": 8.333174705505371, + "learning_rate": 8.526401703335699e-05, + "loss": 0.032961970567703246, + "step": 103860 + }, + { + "epoch": 14.743789921930446, + "grad_norm": 1.251344084739685, + "learning_rate": 8.52625975869411e-05, + "loss": 0.016368305683135985, + "step": 103870 + }, + { + "epoch": 14.745209368346345, + "grad_norm": 3.432236433029175, + "learning_rate": 8.52611781405252e-05, + "loss": 0.049349617958068845, + "step": 103880 + }, + { + "epoch": 14.746628814762243, + "grad_norm": 0.08397255092859268, + "learning_rate": 8.52597586941093e-05, + "loss": 0.010075490176677703, + "step": 103890 + }, + { + "epoch": 14.748048261178141, + "grad_norm": 1.3642832040786743, + "learning_rate": 8.52583392476934e-05, + "loss": 0.011244092881679536, + "step": 103900 + }, + { + "epoch": 14.749467707594038, + "grad_norm": 2.000920295715332, + "learning_rate": 8.52569198012775e-05, + "loss": 0.012861920893192292, + "step": 103910 + }, + { + "epoch": 14.750887154009936, + "grad_norm": 0.006451313849538565, + "learning_rate": 8.525550035486161e-05, + "loss": 0.021595826745033263, + "step": 103920 + }, + { + "epoch": 14.752306600425834, + "grad_norm": 0.347922682762146, + "learning_rate": 8.525408090844571e-05, + "loss": 0.020030558109283447, + "step": 103930 + }, + { + "epoch": 14.75372604684173, + "grad_norm": 0.16533195972442627, + "learning_rate": 8.525266146202982e-05, + "loss": 0.012387216091156006, + "step": 103940 + }, + { + "epoch": 14.75514549325763, + "grad_norm": 0.17642009258270264, + "learning_rate": 8.525124201561392e-05, + "loss": 0.004976727813482284, + "step": 103950 + }, + { + "epoch": 14.756564939673527, + "grad_norm": 0.16800779104232788, + "learning_rate": 8.524982256919801e-05, + "loss": 0.005400242283940315, + "step": 103960 + }, + { + "epoch": 
14.757984386089426, + "grad_norm": 0.4111100435256958, + "learning_rate": 8.524840312278211e-05, + "loss": 0.044538623094558714, + "step": 103970 + }, + { + "epoch": 14.759403832505322, + "grad_norm": 7.620819091796875, + "learning_rate": 8.524698367636622e-05, + "loss": 0.03084087371826172, + "step": 103980 + }, + { + "epoch": 14.76082327892122, + "grad_norm": 0.06103351712226868, + "learning_rate": 8.524556422995032e-05, + "loss": 0.034201741218566895, + "step": 103990 + }, + { + "epoch": 14.762242725337119, + "grad_norm": 0.07585401087999344, + "learning_rate": 8.524414478353443e-05, + "loss": 0.01607244312763214, + "step": 104000 + }, + { + "epoch": 14.762242725337119, + "eval_accuracy": 0.9835950912443568, + "eval_loss": 0.05828787013888359, + "eval_runtime": 30.4152, + "eval_samples_per_second": 517.077, + "eval_steps_per_second": 16.176, + "step": 104000 + }, + { + "epoch": 14.763662171753015, + "grad_norm": 1.165725827217102, + "learning_rate": 8.524272533711853e-05, + "loss": 0.02569354772567749, + "step": 104010 + }, + { + "epoch": 14.765081618168914, + "grad_norm": 8.496212005615234, + "learning_rate": 8.524130589070263e-05, + "loss": 0.022003328800201415, + "step": 104020 + }, + { + "epoch": 14.766501064584812, + "grad_norm": 2.057748317718506, + "learning_rate": 8.523988644428674e-05, + "loss": 0.01967623233795166, + "step": 104030 + }, + { + "epoch": 14.76792051100071, + "grad_norm": 8.422942161560059, + "learning_rate": 8.523846699787083e-05, + "loss": 0.06901317238807678, + "step": 104040 + }, + { + "epoch": 14.769339957416607, + "grad_norm": 0.8119853734970093, + "learning_rate": 8.523704755145494e-05, + "loss": 0.024189670383930207, + "step": 104050 + }, + { + "epoch": 14.770759403832505, + "grad_norm": 0.35583141446113586, + "learning_rate": 8.523562810503903e-05, + "loss": 0.01172533631324768, + "step": 104060 + }, + { + "epoch": 14.772178850248403, + "grad_norm": 0.700731635093689, + "learning_rate": 8.523420865862314e-05, + "loss": 
0.01899768114089966, + "step": 104070 + }, + { + "epoch": 14.7735982966643, + "grad_norm": 0.21481211483478546, + "learning_rate": 8.523278921220724e-05, + "loss": 0.07336496114730835, + "step": 104080 + }, + { + "epoch": 14.775017743080198, + "grad_norm": 0.10782082378864288, + "learning_rate": 8.523136976579135e-05, + "loss": 0.07388808131217957, + "step": 104090 + }, + { + "epoch": 14.776437189496097, + "grad_norm": 12.216814994812012, + "learning_rate": 8.522995031937545e-05, + "loss": 0.0686156153678894, + "step": 104100 + }, + { + "epoch": 14.777856635911995, + "grad_norm": 0.35206928849220276, + "learning_rate": 8.522853087295954e-05, + "loss": 0.021819183230400087, + "step": 104110 + }, + { + "epoch": 14.779276082327891, + "grad_norm": 2.7776386737823486, + "learning_rate": 8.522711142654365e-05, + "loss": 0.06748580932617188, + "step": 104120 + }, + { + "epoch": 14.78069552874379, + "grad_norm": 2.451956033706665, + "learning_rate": 8.522569198012775e-05, + "loss": 0.012218570709228516, + "step": 104130 + }, + { + "epoch": 14.782114975159688, + "grad_norm": 1.476254940032959, + "learning_rate": 8.522427253371186e-05, + "loss": 0.01046212911605835, + "step": 104140 + }, + { + "epoch": 14.783534421575585, + "grad_norm": 0.10849824547767639, + "learning_rate": 8.522285308729596e-05, + "loss": 0.017551289498806, + "step": 104150 + }, + { + "epoch": 14.784953867991483, + "grad_norm": 9.550311088562012, + "learning_rate": 8.522143364088007e-05, + "loss": 0.02532818615436554, + "step": 104160 + }, + { + "epoch": 14.786373314407381, + "grad_norm": 0.09233774244785309, + "learning_rate": 8.522001419446415e-05, + "loss": 0.02874256670475006, + "step": 104170 + }, + { + "epoch": 14.78779276082328, + "grad_norm": 0.6827500462532043, + "learning_rate": 8.521859474804826e-05, + "loss": 0.012810790538787841, + "step": 104180 + }, + { + "epoch": 14.789212207239176, + "grad_norm": 7.668457984924316, + "learning_rate": 8.521717530163236e-05, + "loss": 0.0762434184551239, + 
"step": 104190 + }, + { + "epoch": 14.790631653655074, + "grad_norm": 0.3374784588813782, + "learning_rate": 8.521575585521647e-05, + "loss": 0.006024296581745148, + "step": 104200 + }, + { + "epoch": 14.792051100070973, + "grad_norm": 5.660341262817383, + "learning_rate": 8.521433640880057e-05, + "loss": 0.04634143710136414, + "step": 104210 + }, + { + "epoch": 14.79347054648687, + "grad_norm": 3.8380041122436523, + "learning_rate": 8.521291696238467e-05, + "loss": 0.05055670738220215, + "step": 104220 + }, + { + "epoch": 14.794889992902768, + "grad_norm": 6.826344966888428, + "learning_rate": 8.521149751596878e-05, + "loss": 0.020168723165988924, + "step": 104230 + }, + { + "epoch": 14.796309439318666, + "grad_norm": 0.6084926128387451, + "learning_rate": 8.521007806955288e-05, + "loss": 0.005605394020676613, + "step": 104240 + }, + { + "epoch": 14.797728885734564, + "grad_norm": 0.4939149022102356, + "learning_rate": 8.520865862313699e-05, + "loss": 0.04106263220310211, + "step": 104250 + }, + { + "epoch": 14.79914833215046, + "grad_norm": 0.3379480242729187, + "learning_rate": 8.520723917672108e-05, + "loss": 0.038764914870262145, + "step": 104260 + }, + { + "epoch": 14.800567778566359, + "grad_norm": 0.012404486536979675, + "learning_rate": 8.520581973030518e-05, + "loss": 0.01627178639173508, + "step": 104270 + }, + { + "epoch": 14.801987224982257, + "grad_norm": 0.01644447073340416, + "learning_rate": 8.520440028388928e-05, + "loss": 0.06491001844406127, + "step": 104280 + }, + { + "epoch": 14.803406671398154, + "grad_norm": 0.019314207136631012, + "learning_rate": 8.520298083747339e-05, + "loss": 0.025806555151939393, + "step": 104290 + }, + { + "epoch": 14.804826117814052, + "grad_norm": 5.47023868560791, + "learning_rate": 8.520156139105749e-05, + "loss": 0.044958552718162535, + "step": 104300 + }, + { + "epoch": 14.80624556422995, + "grad_norm": 3.8105883598327637, + "learning_rate": 8.52001419446416e-05, + "loss": 0.026090803742408752, + "step": 104310 
+ }, + { + "epoch": 14.807665010645849, + "grad_norm": 9.174649238586426, + "learning_rate": 8.51987224982257e-05, + "loss": 0.03889913856983185, + "step": 104320 + }, + { + "epoch": 14.809084457061745, + "grad_norm": 7.475338459014893, + "learning_rate": 8.51973030518098e-05, + "loss": 0.035494810342788695, + "step": 104330 + }, + { + "epoch": 14.810503903477644, + "grad_norm": 4.019817352294922, + "learning_rate": 8.51958836053939e-05, + "loss": 0.009954603016376495, + "step": 104340 + }, + { + "epoch": 14.811923349893542, + "grad_norm": 0.2154158651828766, + "learning_rate": 8.5194464158978e-05, + "loss": 0.028025662899017333, + "step": 104350 + }, + { + "epoch": 14.813342796309438, + "grad_norm": 0.098808154463768, + "learning_rate": 8.519304471256211e-05, + "loss": 0.013450905680656433, + "step": 104360 + }, + { + "epoch": 14.814762242725337, + "grad_norm": 1.249442458152771, + "learning_rate": 8.51916252661462e-05, + "loss": 0.014495487511157989, + "step": 104370 + }, + { + "epoch": 14.816181689141235, + "grad_norm": 7.714137554168701, + "learning_rate": 8.519020581973031e-05, + "loss": 0.03295493721961975, + "step": 104380 + }, + { + "epoch": 14.817601135557133, + "grad_norm": 1.7971025705337524, + "learning_rate": 8.51887863733144e-05, + "loss": 0.03349592685699463, + "step": 104390 + }, + { + "epoch": 14.81902058197303, + "grad_norm": 3.5626816749572754, + "learning_rate": 8.518736692689852e-05, + "loss": 0.010706099867820739, + "step": 104400 + }, + { + "epoch": 14.820440028388928, + "grad_norm": 0.09448494762182236, + "learning_rate": 8.518594748048261e-05, + "loss": 0.021884602308273316, + "step": 104410 + }, + { + "epoch": 14.821859474804826, + "grad_norm": 0.14466921985149384, + "learning_rate": 8.518452803406671e-05, + "loss": 0.00417226180434227, + "step": 104420 + }, + { + "epoch": 14.823278921220723, + "grad_norm": 0.17145518958568573, + "learning_rate": 8.518310858765082e-05, + "loss": 0.006554578989744186, + "step": 104430 + }, + { + "epoch": 
14.824698367636621, + "grad_norm": 0.002625130582600832, + "learning_rate": 8.518168914123492e-05, + "loss": 0.009584589302539826, + "step": 104440 + }, + { + "epoch": 14.82611781405252, + "grad_norm": 0.03746289014816284, + "learning_rate": 8.518026969481903e-05, + "loss": 0.037434864044189456, + "step": 104450 + }, + { + "epoch": 14.827537260468418, + "grad_norm": 11.766426086425781, + "learning_rate": 8.517885024840313e-05, + "loss": 0.031741362810134885, + "step": 104460 + }, + { + "epoch": 14.828956706884314, + "grad_norm": 0.08067937195301056, + "learning_rate": 8.517743080198722e-05, + "loss": 0.015521128475666047, + "step": 104470 + }, + { + "epoch": 14.830376153300213, + "grad_norm": 0.03898005187511444, + "learning_rate": 8.517601135557132e-05, + "loss": 0.0194305419921875, + "step": 104480 + }, + { + "epoch": 14.831795599716111, + "grad_norm": 0.11849980056285858, + "learning_rate": 8.517459190915543e-05, + "loss": 0.005333187431097031, + "step": 104490 + }, + { + "epoch": 14.833215046132008, + "grad_norm": 0.06989427655935287, + "learning_rate": 8.517317246273953e-05, + "loss": 0.026983022689819336, + "step": 104500 + }, + { + "epoch": 14.833215046132008, + "eval_accuracy": 0.9844216951739048, + "eval_loss": 0.056505098938941956, + "eval_runtime": 31.0822, + "eval_samples_per_second": 505.98, + "eval_steps_per_second": 15.829, + "step": 104500 + }, + { + "epoch": 14.834634492547906, + "grad_norm": 2.388498544692993, + "learning_rate": 8.517175301632364e-05, + "loss": 0.03193688988685608, + "step": 104510 + }, + { + "epoch": 14.836053938963804, + "grad_norm": 0.0659419372677803, + "learning_rate": 8.517033356990775e-05, + "loss": 0.0162614181637764, + "step": 104520 + }, + { + "epoch": 14.837473385379703, + "grad_norm": 4.265310287475586, + "learning_rate": 8.516891412349184e-05, + "loss": 0.027747917175292968, + "step": 104530 + }, + { + "epoch": 14.838892831795599, + "grad_norm": 5.0632195472717285, + "learning_rate": 8.516749467707595e-05, + "loss": 
0.0163781076669693, + "step": 104540 + }, + { + "epoch": 14.840312278211497, + "grad_norm": 0.06357064843177795, + "learning_rate": 8.516607523066004e-05, + "loss": 0.0017443560063838959, + "step": 104550 + }, + { + "epoch": 14.841731724627396, + "grad_norm": 0.15850457549095154, + "learning_rate": 8.516465578424415e-05, + "loss": 0.025368887186050414, + "step": 104560 + }, + { + "epoch": 14.843151171043292, + "grad_norm": 0.29624274373054504, + "learning_rate": 8.516323633782825e-05, + "loss": 0.014364491403102874, + "step": 104570 + }, + { + "epoch": 14.84457061745919, + "grad_norm": 6.2883524894714355, + "learning_rate": 8.516181689141235e-05, + "loss": 0.020932359993457793, + "step": 104580 + }, + { + "epoch": 14.845990063875089, + "grad_norm": 1.254381775856018, + "learning_rate": 8.516039744499645e-05, + "loss": 0.011461752653121948, + "step": 104590 + }, + { + "epoch": 14.847409510290987, + "grad_norm": 6.394966125488281, + "learning_rate": 8.515897799858056e-05, + "loss": 0.03518040776252747, + "step": 104600 + }, + { + "epoch": 14.848828956706884, + "grad_norm": 17.94300651550293, + "learning_rate": 8.515755855216467e-05, + "loss": 0.051870590448379515, + "step": 104610 + }, + { + "epoch": 14.850248403122782, + "grad_norm": 1.1413216590881348, + "learning_rate": 8.515613910574877e-05, + "loss": 0.08706502914428711, + "step": 104620 + }, + { + "epoch": 14.85166784953868, + "grad_norm": 0.050446055829524994, + "learning_rate": 8.515471965933286e-05, + "loss": 0.08013435006141663, + "step": 104630 + }, + { + "epoch": 14.853087295954577, + "grad_norm": 1.871097445487976, + "learning_rate": 8.515330021291696e-05, + "loss": 0.02684931755065918, + "step": 104640 + }, + { + "epoch": 14.854506742370475, + "grad_norm": 0.2473142445087433, + "learning_rate": 8.515188076650107e-05, + "loss": 0.012901613116264343, + "step": 104650 + }, + { + "epoch": 14.855926188786373, + "grad_norm": 10.848540306091309, + "learning_rate": 8.515046132008517e-05, + "loss": 
0.017069482803344728, + "step": 104660 + }, + { + "epoch": 14.857345635202272, + "grad_norm": 1.1023898124694824, + "learning_rate": 8.514904187366928e-05, + "loss": 0.02099357098340988, + "step": 104670 + }, + { + "epoch": 14.858765081618168, + "grad_norm": 7.283102512359619, + "learning_rate": 8.514762242725336e-05, + "loss": 0.026400291919708253, + "step": 104680 + }, + { + "epoch": 14.860184528034067, + "grad_norm": 0.23161716759204865, + "learning_rate": 8.514620298083747e-05, + "loss": 0.007943221926689148, + "step": 104690 + }, + { + "epoch": 14.861603974449965, + "grad_norm": 0.08439341187477112, + "learning_rate": 8.514478353442159e-05, + "loss": 0.016894285380840302, + "step": 104700 + }, + { + "epoch": 14.863023420865863, + "grad_norm": 0.10445648431777954, + "learning_rate": 8.514336408800568e-05, + "loss": 0.02399568408727646, + "step": 104710 + }, + { + "epoch": 14.86444286728176, + "grad_norm": 2.852975845336914, + "learning_rate": 8.51419446415898e-05, + "loss": 0.028228983283042908, + "step": 104720 + }, + { + "epoch": 14.865862313697658, + "grad_norm": 0.03454967215657234, + "learning_rate": 8.514052519517388e-05, + "loss": 0.009380853176116944, + "step": 104730 + }, + { + "epoch": 14.867281760113556, + "grad_norm": 2.4463937282562256, + "learning_rate": 8.513910574875799e-05, + "loss": 0.02598130702972412, + "step": 104740 + }, + { + "epoch": 14.868701206529453, + "grad_norm": 4.604282379150391, + "learning_rate": 8.513768630234209e-05, + "loss": 0.005938100814819336, + "step": 104750 + }, + { + "epoch": 14.870120652945351, + "grad_norm": 0.04022074490785599, + "learning_rate": 8.51362668559262e-05, + "loss": 0.031351137161254886, + "step": 104760 + }, + { + "epoch": 14.87154009936125, + "grad_norm": 15.1404390335083, + "learning_rate": 8.51348474095103e-05, + "loss": 0.04066168069839478, + "step": 104770 + }, + { + "epoch": 14.872959545777148, + "grad_norm": 15.48460578918457, + "learning_rate": 8.513342796309439e-05, + "loss": 
0.017590782046318053, + "step": 104780 + }, + { + "epoch": 14.874378992193044, + "grad_norm": 0.7865493893623352, + "learning_rate": 8.51320085166785e-05, + "loss": 0.017658084630966187, + "step": 104790 + }, + { + "epoch": 14.875798438608943, + "grad_norm": 0.7198042273521423, + "learning_rate": 8.51305890702626e-05, + "loss": 0.017467735707759856, + "step": 104800 + }, + { + "epoch": 14.87721788502484, + "grad_norm": 5.571759223937988, + "learning_rate": 8.512916962384671e-05, + "loss": 0.018814486265182496, + "step": 104810 + }, + { + "epoch": 14.878637331440737, + "grad_norm": 9.609874725341797, + "learning_rate": 8.512775017743081e-05, + "loss": 0.010202006250619889, + "step": 104820 + }, + { + "epoch": 14.880056777856636, + "grad_norm": 12.367914199829102, + "learning_rate": 8.51263307310149e-05, + "loss": 0.011622205376625061, + "step": 104830 + }, + { + "epoch": 14.881476224272534, + "grad_norm": 2.200284004211426, + "learning_rate": 8.5124911284599e-05, + "loss": 0.012384110689163208, + "step": 104840 + }, + { + "epoch": 14.882895670688432, + "grad_norm": 0.00814757589250803, + "learning_rate": 8.512349183818311e-05, + "loss": 0.03865731358528137, + "step": 104850 + }, + { + "epoch": 14.884315117104329, + "grad_norm": 3.8396785259246826, + "learning_rate": 8.512207239176721e-05, + "loss": 0.018626007437705993, + "step": 104860 + }, + { + "epoch": 14.885734563520227, + "grad_norm": 0.9540119171142578, + "learning_rate": 8.512065294535132e-05, + "loss": 0.010299560427665711, + "step": 104870 + }, + { + "epoch": 14.887154009936125, + "grad_norm": 0.29044824838638306, + "learning_rate": 8.511923349893542e-05, + "loss": 0.0174009770154953, + "step": 104880 + }, + { + "epoch": 14.888573456352022, + "grad_norm": 0.9899982213973999, + "learning_rate": 8.511781405251952e-05, + "loss": 0.003324838727712631, + "step": 104890 + }, + { + "epoch": 14.88999290276792, + "grad_norm": 0.12451029568910599, + "learning_rate": 8.511639460610363e-05, + "loss": 
0.01388794630765915, + "step": 104900 + }, + { + "epoch": 14.891412349183819, + "grad_norm": 0.10587727278470993, + "learning_rate": 8.511497515968773e-05, + "loss": 0.02231362909078598, + "step": 104910 + }, + { + "epoch": 14.892831795599717, + "grad_norm": 0.5449631810188293, + "learning_rate": 8.511355571327184e-05, + "loss": 0.008616983890533447, + "step": 104920 + }, + { + "epoch": 14.894251242015613, + "grad_norm": 4.594874858856201, + "learning_rate": 8.511213626685593e-05, + "loss": 0.039649614691734315, + "step": 104930 + }, + { + "epoch": 14.895670688431512, + "grad_norm": 0.43790388107299805, + "learning_rate": 8.511071682044003e-05, + "loss": 0.033021801710128786, + "step": 104940 + }, + { + "epoch": 14.89709013484741, + "grad_norm": 9.739229202270508, + "learning_rate": 8.510929737402413e-05, + "loss": 0.028501474857330324, + "step": 104950 + }, + { + "epoch": 14.898509581263307, + "grad_norm": 0.18172407150268555, + "learning_rate": 8.510787792760824e-05, + "loss": 0.05176345109939575, + "step": 104960 + }, + { + "epoch": 14.899929027679205, + "grad_norm": 2.6929240226745605, + "learning_rate": 8.510645848119234e-05, + "loss": 0.005494722351431847, + "step": 104970 + }, + { + "epoch": 14.901348474095103, + "grad_norm": 0.0787600576877594, + "learning_rate": 8.510503903477645e-05, + "loss": 0.007409898936748505, + "step": 104980 + }, + { + "epoch": 14.902767920511002, + "grad_norm": 0.07710441201925278, + "learning_rate": 8.510361958836055e-05, + "loss": 0.0217500239610672, + "step": 104990 + }, + { + "epoch": 14.904187366926898, + "grad_norm": 1.6725579500198364, + "learning_rate": 8.510220014194464e-05, + "loss": 0.007238331437110901, + "step": 105000 + }, + { + "epoch": 14.904187366926898, + "eval_accuracy": 0.986011318115343, + "eval_loss": 0.04534267634153366, + "eval_runtime": 33.5028, + "eval_samples_per_second": 469.423, + "eval_steps_per_second": 14.685, + "step": 105000 + }, + { + "epoch": 14.905606813342796, + "grad_norm": 
0.47815820574760437, + "learning_rate": 8.510078069552875e-05, + "loss": 0.01293196976184845, + "step": 105010 + }, + { + "epoch": 14.907026259758695, + "grad_norm": 0.06179656460881233, + "learning_rate": 8.509936124911285e-05, + "loss": 0.01665736585855484, + "step": 105020 + }, + { + "epoch": 14.908445706174591, + "grad_norm": 0.7428871393203735, + "learning_rate": 8.509794180269696e-05, + "loss": 0.014929351210594178, + "step": 105030 + }, + { + "epoch": 14.90986515259049, + "grad_norm": 0.4426209330558777, + "learning_rate": 8.509652235628105e-05, + "loss": 0.06493237614631653, + "step": 105040 + }, + { + "epoch": 14.911284599006388, + "grad_norm": 0.8456557393074036, + "learning_rate": 8.509510290986516e-05, + "loss": 0.0359197735786438, + "step": 105050 + }, + { + "epoch": 14.912704045422286, + "grad_norm": 2.24787974357605, + "learning_rate": 8.509368346344925e-05, + "loss": 0.008549968898296356, + "step": 105060 + }, + { + "epoch": 14.914123491838183, + "grad_norm": 0.15779711306095123, + "learning_rate": 8.509226401703336e-05, + "loss": 0.017723798751831055, + "step": 105070 + }, + { + "epoch": 14.915542938254081, + "grad_norm": 9.117578506469727, + "learning_rate": 8.509084457061746e-05, + "loss": 0.029659080505371093, + "step": 105080 + }, + { + "epoch": 14.91696238466998, + "grad_norm": 13.629226684570312, + "learning_rate": 8.508942512420156e-05, + "loss": 0.027601355314254762, + "step": 105090 + }, + { + "epoch": 14.918381831085876, + "grad_norm": 0.10228786617517471, + "learning_rate": 8.508800567778567e-05, + "loss": 0.05371737480163574, + "step": 105100 + }, + { + "epoch": 14.919801277501774, + "grad_norm": 0.003000596771016717, + "learning_rate": 8.508658623136977e-05, + "loss": 0.003889523446559906, + "step": 105110 + }, + { + "epoch": 14.921220723917672, + "grad_norm": 0.06478263437747955, + "learning_rate": 8.508530872959545e-05, + "loss": 0.08041570186614991, + "step": 105120 + }, + { + "epoch": 14.92264017033357, + "grad_norm": 
0.06725726276636124, + "learning_rate": 8.508388928317956e-05, + "loss": 0.03126749694347382, + "step": 105130 + }, + { + "epoch": 14.924059616749467, + "grad_norm": 0.15821486711502075, + "learning_rate": 8.508246983676366e-05, + "loss": 0.006352770328521729, + "step": 105140 + }, + { + "epoch": 14.925479063165366, + "grad_norm": 4.99887752532959, + "learning_rate": 8.508105039034777e-05, + "loss": 0.00685601457953453, + "step": 105150 + }, + { + "epoch": 14.926898509581264, + "grad_norm": 0.07320870459079742, + "learning_rate": 8.507963094393187e-05, + "loss": 0.04207580983638763, + "step": 105160 + }, + { + "epoch": 14.92831795599716, + "grad_norm": 3.1809349060058594, + "learning_rate": 8.507821149751597e-05, + "loss": 0.021016967296600342, + "step": 105170 + }, + { + "epoch": 14.929737402413059, + "grad_norm": 0.059679560363292694, + "learning_rate": 8.507679205110008e-05, + "loss": 0.037418097257614136, + "step": 105180 + }, + { + "epoch": 14.931156848828957, + "grad_norm": 0.04984293133020401, + "learning_rate": 8.507537260468418e-05, + "loss": 0.010998521745204926, + "step": 105190 + }, + { + "epoch": 14.932576295244855, + "grad_norm": 0.038183312863111496, + "learning_rate": 8.507395315826829e-05, + "loss": 0.00536215603351593, + "step": 105200 + }, + { + "epoch": 14.933995741660752, + "grad_norm": 0.20261302590370178, + "learning_rate": 8.507253371185238e-05, + "loss": 0.013916152715682983, + "step": 105210 + }, + { + "epoch": 14.93541518807665, + "grad_norm": 21.86165428161621, + "learning_rate": 8.507111426543648e-05, + "loss": 0.04513751268386841, + "step": 105220 + }, + { + "epoch": 14.936834634492548, + "grad_norm": 0.756188690662384, + "learning_rate": 8.506969481902058e-05, + "loss": 0.019983695447444917, + "step": 105230 + }, + { + "epoch": 14.938254080908445, + "grad_norm": 0.05859207734465599, + "learning_rate": 8.506827537260469e-05, + "loss": 0.019867606461048126, + "step": 105240 + }, + { + "epoch": 14.939673527324343, + "grad_norm": 
0.04819387570023537, + "learning_rate": 8.506685592618879e-05, + "loss": 0.01914552301168442, + "step": 105250 + }, + { + "epoch": 14.941092973740242, + "grad_norm": 0.14617569744586945, + "learning_rate": 8.50654364797729e-05, + "loss": 0.026956775784492494, + "step": 105260 + }, + { + "epoch": 14.94251242015614, + "grad_norm": 0.10384272783994675, + "learning_rate": 8.5064017033357e-05, + "loss": 0.01044590026140213, + "step": 105270 + }, + { + "epoch": 14.943931866572036, + "grad_norm": 0.032726746052503586, + "learning_rate": 8.506259758694109e-05, + "loss": 0.009355773031711579, + "step": 105280 + }, + { + "epoch": 14.945351312987935, + "grad_norm": 0.22472988069057465, + "learning_rate": 8.50611781405252e-05, + "loss": 0.016062158346176147, + "step": 105290 + }, + { + "epoch": 14.946770759403833, + "grad_norm": 0.8198087811470032, + "learning_rate": 8.50597586941093e-05, + "loss": 0.0025589760392904282, + "step": 105300 + }, + { + "epoch": 14.94819020581973, + "grad_norm": 3.6993165016174316, + "learning_rate": 8.505833924769341e-05, + "loss": 0.005632463097572327, + "step": 105310 + }, + { + "epoch": 14.949609652235628, + "grad_norm": 1.08851957321167, + "learning_rate": 8.50569198012775e-05, + "loss": 0.06681792140007019, + "step": 105320 + }, + { + "epoch": 14.951029098651526, + "grad_norm": 3.3110740184783936, + "learning_rate": 8.50555003548616e-05, + "loss": 0.007581328600645065, + "step": 105330 + }, + { + "epoch": 14.952448545067424, + "grad_norm": 0.3741854429244995, + "learning_rate": 8.50540809084457e-05, + "loss": 0.022600403428077696, + "step": 105340 + }, + { + "epoch": 14.953867991483321, + "grad_norm": 0.7727049589157104, + "learning_rate": 8.505266146202981e-05, + "loss": 0.010692685097455978, + "step": 105350 + }, + { + "epoch": 14.95528743789922, + "grad_norm": 0.1877894401550293, + "learning_rate": 8.505124201561393e-05, + "loss": 0.017357349395751953, + "step": 105360 + }, + { + "epoch": 14.956706884315118, + "grad_norm": 
9.413021087646484, + "learning_rate": 8.504982256919801e-05, + "loss": 0.05534917116165161, + "step": 105370 + }, + { + "epoch": 14.958126330731014, + "grad_norm": 5.803071975708008, + "learning_rate": 8.504840312278212e-05, + "loss": 0.019386471807956697, + "step": 105380 + }, + { + "epoch": 14.959545777146912, + "grad_norm": 0.6982179880142212, + "learning_rate": 8.504698367636622e-05, + "loss": 0.011141490936279298, + "step": 105390 + }, + { + "epoch": 14.96096522356281, + "grad_norm": 0.8867304921150208, + "learning_rate": 8.504556422995033e-05, + "loss": 0.019689857959747314, + "step": 105400 + }, + { + "epoch": 14.962384669978709, + "grad_norm": 13.065898895263672, + "learning_rate": 8.504414478353443e-05, + "loss": 0.05626608729362488, + "step": 105410 + }, + { + "epoch": 14.963804116394606, + "grad_norm": 4.895026206970215, + "learning_rate": 8.504272533711852e-05, + "loss": 0.026846492290496828, + "step": 105420 + }, + { + "epoch": 14.965223562810504, + "grad_norm": 5.024219989776611, + "learning_rate": 8.504130589070262e-05, + "loss": 0.01723182052373886, + "step": 105430 + }, + { + "epoch": 14.966643009226402, + "grad_norm": 6.151825904846191, + "learning_rate": 8.503988644428673e-05, + "loss": 0.028924581408500672, + "step": 105440 + }, + { + "epoch": 14.968062455642299, + "grad_norm": 0.17709432542324066, + "learning_rate": 8.503846699787084e-05, + "loss": 0.03688434362411499, + "step": 105450 + }, + { + "epoch": 14.969481902058197, + "grad_norm": 5.439521789550781, + "learning_rate": 8.503704755145494e-05, + "loss": 0.023295867443084716, + "step": 105460 + }, + { + "epoch": 14.970901348474095, + "grad_norm": 0.4720461964607239, + "learning_rate": 8.503562810503904e-05, + "loss": 0.018590964376926422, + "step": 105470 + }, + { + "epoch": 14.972320794889994, + "grad_norm": 8.585654258728027, + "learning_rate": 8.503420865862313e-05, + "loss": 0.05288488268852234, + "step": 105480 + }, + { + "epoch": 14.97374024130589, + "grad_norm": 4.889590263366699, + 
"learning_rate": 8.503278921220725e-05, + "loss": 0.01930091977119446, + "step": 105490 + }, + { + "epoch": 14.975159687721789, + "grad_norm": 0.030981246381998062, + "learning_rate": 8.503136976579134e-05, + "loss": 0.02204904705286026, + "step": 105500 + }, + { + "epoch": 14.975159687721789, + "eval_accuracy": 0.983849430914987, + "eval_loss": 0.0565863735973835, + "eval_runtime": 30.8719, + "eval_samples_per_second": 509.428, + "eval_steps_per_second": 15.937, + "step": 105500 + }, + { + "epoch": 14.976579134137687, + "grad_norm": 1.1705716848373413, + "learning_rate": 8.502995031937545e-05, + "loss": 0.008624742925167083, + "step": 105510 + }, + { + "epoch": 14.977998580553583, + "grad_norm": 0.7375067472457886, + "learning_rate": 8.502853087295955e-05, + "loss": 0.02464774250984192, + "step": 105520 + }, + { + "epoch": 14.979418026969482, + "grad_norm": 0.8426643013954163, + "learning_rate": 8.502711142654365e-05, + "loss": 0.060261762142181395, + "step": 105530 + }, + { + "epoch": 14.98083747338538, + "grad_norm": 4.958010673522949, + "learning_rate": 8.502569198012776e-05, + "loss": 0.008713224530220031, + "step": 105540 + }, + { + "epoch": 14.982256919801278, + "grad_norm": 0.7848474979400635, + "learning_rate": 8.502427253371186e-05, + "loss": 0.025871086120605468, + "step": 105550 + }, + { + "epoch": 14.983676366217175, + "grad_norm": 6.699347972869873, + "learning_rate": 8.502285308729597e-05, + "loss": 0.0266726553440094, + "step": 105560 + }, + { + "epoch": 14.985095812633073, + "grad_norm": 10.014344215393066, + "learning_rate": 8.502143364088007e-05, + "loss": 0.02668716311454773, + "step": 105570 + }, + { + "epoch": 14.986515259048971, + "grad_norm": 2.3931314945220947, + "learning_rate": 8.502001419446416e-05, + "loss": 0.02140100598335266, + "step": 105580 + }, + { + "epoch": 14.987934705464868, + "grad_norm": 0.6117798089981079, + "learning_rate": 8.501859474804826e-05, + "loss": 0.007049883902072907, + "step": 105590 + }, + { + "epoch": 
14.989354151880766, + "grad_norm": 12.927014350891113, + "learning_rate": 8.501717530163237e-05, + "loss": 0.048466211557388304, + "step": 105600 + }, + { + "epoch": 14.990773598296665, + "grad_norm": 7.8452630043029785, + "learning_rate": 8.501575585521647e-05, + "loss": 0.014284107089042663, + "step": 105610 + }, + { + "epoch": 14.992193044712563, + "grad_norm": 0.233725905418396, + "learning_rate": 8.501433640880058e-05, + "loss": 0.04717913269996643, + "step": 105620 + }, + { + "epoch": 14.99361249112846, + "grad_norm": 0.1834932565689087, + "learning_rate": 8.501291696238468e-05, + "loss": 0.03222338557243347, + "step": 105630 + }, + { + "epoch": 14.995031937544358, + "grad_norm": 6.620406150817871, + "learning_rate": 8.501149751596877e-05, + "loss": 0.04589863419532776, + "step": 105640 + }, + { + "epoch": 14.996451383960256, + "grad_norm": 0.19498220086097717, + "learning_rate": 8.501007806955288e-05, + "loss": 0.0016890153288841247, + "step": 105650 + }, + { + "epoch": 14.997870830376153, + "grad_norm": 0.051954738795757294, + "learning_rate": 8.500865862313698e-05, + "loss": 0.01796208769083023, + "step": 105660 + }, + { + "epoch": 14.99929027679205, + "grad_norm": 0.3377091586589813, + "learning_rate": 8.50072391767211e-05, + "loss": 0.027283614873886107, + "step": 105670 + }, + { + "epoch": 15.00070972320795, + "grad_norm": 9.270378112792969, + "learning_rate": 8.500581973030518e-05, + "loss": 0.030120304226875304, + "step": 105680 + }, + { + "epoch": 15.002129169623847, + "grad_norm": 5.4814229011535645, + "learning_rate": 8.500440028388929e-05, + "loss": 0.025646063685417175, + "step": 105690 + }, + { + "epoch": 15.003548616039744, + "grad_norm": 0.20738860964775085, + "learning_rate": 8.500298083747339e-05, + "loss": 0.03865569829940796, + "step": 105700 + }, + { + "epoch": 15.004968062455642, + "grad_norm": 0.10520824044942856, + "learning_rate": 8.50015613910575e-05, + "loss": 0.010026749223470688, + "step": 105710 + }, + { + "epoch": 
15.00638750887154, + "grad_norm": 10.182268142700195, + "learning_rate": 8.50001419446416e-05, + "loss": 0.023018835484981535, + "step": 105720 + }, + { + "epoch": 15.007806955287437, + "grad_norm": 0.039473164826631546, + "learning_rate": 8.499872249822569e-05, + "loss": 0.023001056909561158, + "step": 105730 + }, + { + "epoch": 15.009226401703335, + "grad_norm": 0.15117046236991882, + "learning_rate": 8.49973030518098e-05, + "loss": 0.0272275447845459, + "step": 105740 + }, + { + "epoch": 15.010645848119234, + "grad_norm": 2.397531747817993, + "learning_rate": 8.49958836053939e-05, + "loss": 0.03603595197200775, + "step": 105750 + }, + { + "epoch": 15.012065294535132, + "grad_norm": 0.23408842086791992, + "learning_rate": 8.499446415897801e-05, + "loss": 0.004264497011899948, + "step": 105760 + }, + { + "epoch": 15.013484740951029, + "grad_norm": 13.141559600830078, + "learning_rate": 8.499304471256211e-05, + "loss": 0.04609453082084656, + "step": 105770 + }, + { + "epoch": 15.014904187366927, + "grad_norm": 7.471634864807129, + "learning_rate": 8.49916252661462e-05, + "loss": 0.03790695369243622, + "step": 105780 + }, + { + "epoch": 15.016323633782825, + "grad_norm": 8.660481452941895, + "learning_rate": 8.49902058197303e-05, + "loss": 0.014696374535560608, + "step": 105790 + }, + { + "epoch": 15.017743080198722, + "grad_norm": 3.2237870693206787, + "learning_rate": 8.498878637331441e-05, + "loss": 0.03969519138336182, + "step": 105800 + }, + { + "epoch": 15.01916252661462, + "grad_norm": 7.434295177459717, + "learning_rate": 8.498736692689851e-05, + "loss": 0.06669918894767761, + "step": 105810 + }, + { + "epoch": 15.020581973030518, + "grad_norm": 0.9601563215255737, + "learning_rate": 8.498594748048262e-05, + "loss": 0.018045979738235473, + "step": 105820 + }, + { + "epoch": 15.022001419446417, + "grad_norm": 0.553522527217865, + "learning_rate": 8.498452803406672e-05, + "loss": 0.004522566124796868, + "step": 105830 + }, + { + "epoch": 15.023420865862313, + 
"grad_norm": 0.238037571310997, + "learning_rate": 8.498310858765082e-05, + "loss": 0.006286618113517761, + "step": 105840 + }, + { + "epoch": 15.024840312278211, + "grad_norm": 0.02164604514837265, + "learning_rate": 8.498168914123493e-05, + "loss": 0.06347473859786987, + "step": 105850 + }, + { + "epoch": 15.02625975869411, + "grad_norm": 6.417102336883545, + "learning_rate": 8.498026969481902e-05, + "loss": 0.03233658075332642, + "step": 105860 + }, + { + "epoch": 15.027679205110006, + "grad_norm": 0.07468276470899582, + "learning_rate": 8.497885024840314e-05, + "loss": 0.01961416006088257, + "step": 105870 + }, + { + "epoch": 15.029098651525905, + "grad_norm": 0.08328916877508163, + "learning_rate": 8.497743080198723e-05, + "loss": 0.02036486268043518, + "step": 105880 + }, + { + "epoch": 15.030518097941803, + "grad_norm": 0.13861192762851715, + "learning_rate": 8.497601135557133e-05, + "loss": 0.016381070017814636, + "step": 105890 + }, + { + "epoch": 15.031937544357701, + "grad_norm": 0.012162907980382442, + "learning_rate": 8.497459190915543e-05, + "loss": 0.025388899445533752, + "step": 105900 + }, + { + "epoch": 15.033356990773598, + "grad_norm": 0.12118402123451233, + "learning_rate": 8.497317246273954e-05, + "loss": 0.014161121845245362, + "step": 105910 + }, + { + "epoch": 15.034776437189496, + "grad_norm": 0.8070512413978577, + "learning_rate": 8.497175301632364e-05, + "loss": 0.005332186818122864, + "step": 105920 + }, + { + "epoch": 15.036195883605394, + "grad_norm": 1.0391985177993774, + "learning_rate": 8.497033356990775e-05, + "loss": 0.015077903866767883, + "step": 105930 + }, + { + "epoch": 15.037615330021291, + "grad_norm": 0.5412289500236511, + "learning_rate": 8.496891412349184e-05, + "loss": 0.003560176119208336, + "step": 105940 + }, + { + "epoch": 15.03903477643719, + "grad_norm": 0.17096680402755737, + "learning_rate": 8.496749467707594e-05, + "loss": 0.004259506985545158, + "step": 105950 + }, + { + "epoch": 15.040454222853088, + 
"grad_norm": 0.10446897894144058, + "learning_rate": 8.496607523066005e-05, + "loss": 0.013970400393009185, + "step": 105960 + }, + { + "epoch": 15.041873669268986, + "grad_norm": 6.217586994171143, + "learning_rate": 8.496465578424415e-05, + "loss": 0.012521201372146606, + "step": 105970 + }, + { + "epoch": 15.043293115684882, + "grad_norm": 5.13002347946167, + "learning_rate": 8.496323633782826e-05, + "loss": 0.009574723988771438, + "step": 105980 + }, + { + "epoch": 15.04471256210078, + "grad_norm": 0.07002128660678864, + "learning_rate": 8.496181689141234e-05, + "loss": 0.0025296185165643694, + "step": 105990 + }, + { + "epoch": 15.046132008516679, + "grad_norm": 0.8163052201271057, + "learning_rate": 8.496039744499646e-05, + "loss": 0.008115919679403305, + "step": 106000 + }, + { + "epoch": 15.046132008516679, + "eval_accuracy": 0.9864564125389458, + "eval_loss": 0.047789935022592545, + "eval_runtime": 32.3739, + "eval_samples_per_second": 485.792, + "eval_steps_per_second": 15.197, + "step": 106000 + }, + { + "epoch": 15.047551454932576, + "grad_norm": 1.0925426483154297, + "learning_rate": 8.495897799858055e-05, + "loss": 0.05336245894432068, + "step": 106010 + }, + { + "epoch": 15.048970901348474, + "grad_norm": 0.4548809230327606, + "learning_rate": 8.495755855216466e-05, + "loss": 0.033079445362091064, + "step": 106020 + }, + { + "epoch": 15.050390347764372, + "grad_norm": 0.33710336685180664, + "learning_rate": 8.495613910574876e-05, + "loss": 0.021994808316230775, + "step": 106030 + }, + { + "epoch": 15.05180979418027, + "grad_norm": 1.0806621313095093, + "learning_rate": 8.495471965933286e-05, + "loss": 0.0022130701690912247, + "step": 106040 + }, + { + "epoch": 15.053229240596167, + "grad_norm": 0.9031606316566467, + "learning_rate": 8.495330021291697e-05, + "loss": 0.003949935734272003, + "step": 106050 + }, + { + "epoch": 15.054648687012065, + "grad_norm": 0.3298092782497406, + "learning_rate": 8.495188076650107e-05, + "loss": 0.04351118803024292, + 
"step": 106060 + }, + { + "epoch": 15.056068133427964, + "grad_norm": 1.8871431350708008, + "learning_rate": 8.495046132008518e-05, + "loss": 0.022950419783592226, + "step": 106070 + }, + { + "epoch": 15.05748757984386, + "grad_norm": 3.592064142227173, + "learning_rate": 8.494904187366928e-05, + "loss": 0.03793198466300964, + "step": 106080 + }, + { + "epoch": 15.058907026259758, + "grad_norm": 0.11663145571947098, + "learning_rate": 8.494762242725337e-05, + "loss": 0.015685516595840453, + "step": 106090 + }, + { + "epoch": 15.060326472675657, + "grad_norm": 0.003959988709539175, + "learning_rate": 8.494620298083747e-05, + "loss": 0.0015076756477355956, + "step": 106100 + }, + { + "epoch": 15.061745919091555, + "grad_norm": 2.6824986934661865, + "learning_rate": 8.494478353442158e-05, + "loss": 0.033915793895721434, + "step": 106110 + }, + { + "epoch": 15.063165365507452, + "grad_norm": 4.639670372009277, + "learning_rate": 8.494336408800568e-05, + "loss": 0.013209784030914306, + "step": 106120 + }, + { + "epoch": 15.06458481192335, + "grad_norm": 0.9888678789138794, + "learning_rate": 8.494194464158979e-05, + "loss": 0.04018623828887939, + "step": 106130 + }, + { + "epoch": 15.066004258339248, + "grad_norm": 3.075080156326294, + "learning_rate": 8.494052519517389e-05, + "loss": 0.03815135657787323, + "step": 106140 + }, + { + "epoch": 15.067423704755145, + "grad_norm": 1.7605953216552734, + "learning_rate": 8.493910574875798e-05, + "loss": 0.007596167922019959, + "step": 106150 + }, + { + "epoch": 15.068843151171043, + "grad_norm": 0.012811378575861454, + "learning_rate": 8.49376863023421e-05, + "loss": 0.006362202018499375, + "step": 106160 + }, + { + "epoch": 15.070262597586941, + "grad_norm": 1.7996217012405396, + "learning_rate": 8.493626685592619e-05, + "loss": 0.009276589751243592, + "step": 106170 + }, + { + "epoch": 15.07168204400284, + "grad_norm": 6.258857727050781, + "learning_rate": 8.49348474095103e-05, + "loss": 0.025115305185317995, + "step": 
106180 + }, + { + "epoch": 15.073101490418736, + "grad_norm": 4.9005632400512695, + "learning_rate": 8.493342796309439e-05, + "loss": 0.009275999665260316, + "step": 106190 + }, + { + "epoch": 15.074520936834634, + "grad_norm": 0.04640462249517441, + "learning_rate": 8.49320085166785e-05, + "loss": 0.028778478503227234, + "step": 106200 + }, + { + "epoch": 15.075940383250533, + "grad_norm": 0.02235148847103119, + "learning_rate": 8.49305890702626e-05, + "loss": 0.06231725811958313, + "step": 106210 + }, + { + "epoch": 15.07735982966643, + "grad_norm": 0.03783341869711876, + "learning_rate": 8.49291696238467e-05, + "loss": 0.010832040011882782, + "step": 106220 + }, + { + "epoch": 15.078779276082328, + "grad_norm": 9.759112358093262, + "learning_rate": 8.49277501774308e-05, + "loss": 0.02757435142993927, + "step": 106230 + }, + { + "epoch": 15.080198722498226, + "grad_norm": 13.926852226257324, + "learning_rate": 8.492633073101491e-05, + "loss": 0.019434280693531036, + "step": 106240 + }, + { + "epoch": 15.081618168914124, + "grad_norm": 0.11499132961034775, + "learning_rate": 8.492491128459901e-05, + "loss": 0.03727405369281769, + "step": 106250 + }, + { + "epoch": 15.08303761533002, + "grad_norm": 8.562307357788086, + "learning_rate": 8.492349183818311e-05, + "loss": 0.03538262248039246, + "step": 106260 + }, + { + "epoch": 15.084457061745919, + "grad_norm": 19.21854019165039, + "learning_rate": 8.492207239176722e-05, + "loss": 0.019014018774032592, + "step": 106270 + }, + { + "epoch": 15.085876508161817, + "grad_norm": 0.5749403238296509, + "learning_rate": 8.492065294535132e-05, + "loss": 0.03317195773124695, + "step": 106280 + }, + { + "epoch": 15.087295954577714, + "grad_norm": 9.464776039123535, + "learning_rate": 8.491923349893543e-05, + "loss": 0.02200724482536316, + "step": 106290 + }, + { + "epoch": 15.088715400993612, + "grad_norm": 0.37284818291664124, + "learning_rate": 8.491781405251951e-05, + "loss": 0.03638032078742981, + "step": 106300 + }, + { + 
"epoch": 15.09013484740951, + "grad_norm": 6.311309814453125, + "learning_rate": 8.491639460610362e-05, + "loss": 0.03605286777019501, + "step": 106310 + }, + { + "epoch": 15.091554293825409, + "grad_norm": 0.11901773512363434, + "learning_rate": 8.491497515968772e-05, + "loss": 0.03141979277133942, + "step": 106320 + }, + { + "epoch": 15.092973740241305, + "grad_norm": 0.030311891809105873, + "learning_rate": 8.491355571327183e-05, + "loss": 0.016466012597084044, + "step": 106330 + }, + { + "epoch": 15.094393186657204, + "grad_norm": 2.06520414352417, + "learning_rate": 8.491213626685593e-05, + "loss": 0.011501440405845642, + "step": 106340 + }, + { + "epoch": 15.095812633073102, + "grad_norm": 0.344914972782135, + "learning_rate": 8.491071682044003e-05, + "loss": 0.01476086676120758, + "step": 106350 + }, + { + "epoch": 15.097232079488998, + "grad_norm": 7.9556498527526855, + "learning_rate": 8.490929737402414e-05, + "loss": 0.006111126765608788, + "step": 106360 + }, + { + "epoch": 15.098651525904897, + "grad_norm": 0.5263380408287048, + "learning_rate": 8.490787792760823e-05, + "loss": 0.006287018954753876, + "step": 106370 + }, + { + "epoch": 15.100070972320795, + "grad_norm": 0.007816202007234097, + "learning_rate": 8.490645848119235e-05, + "loss": 0.035767942667007446, + "step": 106380 + }, + { + "epoch": 15.101490418736693, + "grad_norm": 0.1581820547580719, + "learning_rate": 8.490503903477644e-05, + "loss": 0.006314507126808167, + "step": 106390 + }, + { + "epoch": 15.10290986515259, + "grad_norm": 0.29545098543167114, + "learning_rate": 8.490361958836054e-05, + "loss": 0.02759753167629242, + "step": 106400 + }, + { + "epoch": 15.104329311568488, + "grad_norm": 0.3469163775444031, + "learning_rate": 8.490220014194464e-05, + "loss": 0.0040346551686525345, + "step": 106410 + }, + { + "epoch": 15.105748757984387, + "grad_norm": 0.24172019958496094, + "learning_rate": 8.490078069552875e-05, + "loss": 0.02106695920228958, + "step": 106420 + }, + { + "epoch": 
15.107168204400283, + "grad_norm": 0.20899048447608948, + "learning_rate": 8.489936124911285e-05, + "loss": 0.013521634042263031, + "step": 106430 + }, + { + "epoch": 15.108587650816181, + "grad_norm": 2.07224440574646, + "learning_rate": 8.489794180269696e-05, + "loss": 0.02519119381904602, + "step": 106440 + }, + { + "epoch": 15.11000709723208, + "grad_norm": 1.6022323369979858, + "learning_rate": 8.489652235628105e-05, + "loss": 0.011256136745214463, + "step": 106450 + }, + { + "epoch": 15.111426543647978, + "grad_norm": 0.06830890476703644, + "learning_rate": 8.489510290986515e-05, + "loss": 0.042464840412139895, + "step": 106460 + }, + { + "epoch": 15.112845990063875, + "grad_norm": 0.025041621178388596, + "learning_rate": 8.489368346344926e-05, + "loss": 0.0023004353046417237, + "step": 106470 + }, + { + "epoch": 15.114265436479773, + "grad_norm": 3.737102508544922, + "learning_rate": 8.489226401703336e-05, + "loss": 0.03057832717895508, + "step": 106480 + }, + { + "epoch": 15.115684882895671, + "grad_norm": 0.3811050057411194, + "learning_rate": 8.489084457061747e-05, + "loss": 0.010930734872817992, + "step": 106490 + }, + { + "epoch": 15.117104329311568, + "grad_norm": 1.066455602645874, + "learning_rate": 8.488942512420155e-05, + "loss": 0.0034047245979309084, + "step": 106500 + }, + { + "epoch": 15.117104329311568, + "eval_accuracy": 0.9809245247027405, + "eval_loss": 0.06730737537145615, + "eval_runtime": 33.0311, + "eval_samples_per_second": 476.126, + "eval_steps_per_second": 14.895, + "step": 106500 + }, + { + "epoch": 15.118523775727466, + "grad_norm": 14.054121971130371, + "learning_rate": 8.488800567778567e-05, + "loss": 0.021509627997875213, + "step": 106510 + }, + { + "epoch": 15.119943222143364, + "grad_norm": 2.574340343475342, + "learning_rate": 8.488658623136976e-05, + "loss": 0.015637876093387605, + "step": 106520 + }, + { + "epoch": 15.121362668559263, + "grad_norm": 0.16894987225532532, + "learning_rate": 8.488516678495387e-05, + "loss": 
0.012121076881885528, + "step": 106530 + }, + { + "epoch": 15.12278211497516, + "grad_norm": 0.2743363678455353, + "learning_rate": 8.488374733853797e-05, + "loss": 0.009187763184309005, + "step": 106540 + }, + { + "epoch": 15.124201561391057, + "grad_norm": 1.0792840719223022, + "learning_rate": 8.488232789212207e-05, + "loss": 0.020037105679512023, + "step": 106550 + }, + { + "epoch": 15.125621007806956, + "grad_norm": 0.768378496170044, + "learning_rate": 8.488090844570618e-05, + "loss": 0.061950075626373294, + "step": 106560 + }, + { + "epoch": 15.127040454222852, + "grad_norm": 3.3310279846191406, + "learning_rate": 8.487948899929028e-05, + "loss": 0.011520791053771972, + "step": 106570 + }, + { + "epoch": 15.12845990063875, + "grad_norm": 1.0102425813674927, + "learning_rate": 8.487806955287439e-05, + "loss": 0.04175326824188232, + "step": 106580 + }, + { + "epoch": 15.129879347054649, + "grad_norm": 0.11006898432970047, + "learning_rate": 8.487665010645849e-05, + "loss": 0.014114585518836976, + "step": 106590 + }, + { + "epoch": 15.131298793470547, + "grad_norm": 0.9182747602462769, + "learning_rate": 8.48752306600426e-05, + "loss": 0.037896940112113954, + "step": 106600 + }, + { + "epoch": 15.132718239886444, + "grad_norm": 0.8234381079673767, + "learning_rate": 8.487381121362668e-05, + "loss": 0.03469969928264618, + "step": 106610 + }, + { + "epoch": 15.134137686302342, + "grad_norm": 5.307381629943848, + "learning_rate": 8.487239176721079e-05, + "loss": 0.008131003379821778, + "step": 106620 + }, + { + "epoch": 15.13555713271824, + "grad_norm": 0.09964298456907272, + "learning_rate": 8.487097232079489e-05, + "loss": 0.03536056876182556, + "step": 106630 + }, + { + "epoch": 15.136976579134137, + "grad_norm": 0.10982150584459305, + "learning_rate": 8.4869552874379e-05, + "loss": 0.008714452385902405, + "step": 106640 + }, + { + "epoch": 15.138396025550035, + "grad_norm": 0.2031431943178177, + "learning_rate": 8.48681334279631e-05, + "loss": 
0.05170516967773438, + "step": 106650 + }, + { + "epoch": 15.139815471965933, + "grad_norm": 0.2435196489095688, + "learning_rate": 8.48667139815472e-05, + "loss": 0.016551582515239714, + "step": 106660 + }, + { + "epoch": 15.141234918381832, + "grad_norm": 0.15026158094406128, + "learning_rate": 8.48652945351313e-05, + "loss": 0.021115848422050477, + "step": 106670 + }, + { + "epoch": 15.142654364797728, + "grad_norm": 0.7808573842048645, + "learning_rate": 8.48638750887154e-05, + "loss": 0.006684541702270508, + "step": 106680 + }, + { + "epoch": 15.144073811213627, + "grad_norm": 1.9700183868408203, + "learning_rate": 8.486245564229951e-05, + "loss": 0.014693331718444825, + "step": 106690 + }, + { + "epoch": 15.145493257629525, + "grad_norm": 4.157971382141113, + "learning_rate": 8.486103619588361e-05, + "loss": 0.023039808869361876, + "step": 106700 + }, + { + "epoch": 15.146912704045421, + "grad_norm": 0.5956658720970154, + "learning_rate": 8.485961674946771e-05, + "loss": 0.012369179725646972, + "step": 106710 + }, + { + "epoch": 15.14833215046132, + "grad_norm": 0.15433919429779053, + "learning_rate": 8.48581973030518e-05, + "loss": 0.027466484904289247, + "step": 106720 + }, + { + "epoch": 15.149751596877218, + "grad_norm": 2.2408993244171143, + "learning_rate": 8.485677785663592e-05, + "loss": 0.03421107828617096, + "step": 106730 + }, + { + "epoch": 15.151171043293116, + "grad_norm": 0.34294793009757996, + "learning_rate": 8.485535841022001e-05, + "loss": 0.011824176460504533, + "step": 106740 + }, + { + "epoch": 15.152590489709013, + "grad_norm": 7.680258750915527, + "learning_rate": 8.485393896380412e-05, + "loss": 0.02799707055091858, + "step": 106750 + }, + { + "epoch": 15.154009936124911, + "grad_norm": 10.662795066833496, + "learning_rate": 8.485251951738822e-05, + "loss": 0.01989738643169403, + "step": 106760 + }, + { + "epoch": 15.15542938254081, + "grad_norm": 0.026028187945485115, + "learning_rate": 8.485110007097232e-05, + "loss": 
0.0778480350971222, + "step": 106770 + }, + { + "epoch": 15.156848828956706, + "grad_norm": 0.5896021723747253, + "learning_rate": 8.484968062455643e-05, + "loss": 0.0032101761549711227, + "step": 106780 + }, + { + "epoch": 15.158268275372604, + "grad_norm": 0.2833305895328522, + "learning_rate": 8.484826117814053e-05, + "loss": 0.06184155941009521, + "step": 106790 + }, + { + "epoch": 15.159687721788503, + "grad_norm": 16.18917465209961, + "learning_rate": 8.484684173172464e-05, + "loss": 0.01600014418363571, + "step": 106800 + }, + { + "epoch": 15.161107168204401, + "grad_norm": 0.010805347934365273, + "learning_rate": 8.484542228530872e-05, + "loss": 0.009944060444831848, + "step": 106810 + }, + { + "epoch": 15.162526614620297, + "grad_norm": 5.7854108810424805, + "learning_rate": 8.484400283889283e-05, + "loss": 0.01406932771205902, + "step": 106820 + }, + { + "epoch": 15.163946061036196, + "grad_norm": 5.306714057922363, + "learning_rate": 8.484258339247693e-05, + "loss": 0.009397006034851075, + "step": 106830 + }, + { + "epoch": 15.165365507452094, + "grad_norm": 4.2969651222229, + "learning_rate": 8.484116394606104e-05, + "loss": 0.018796910345554353, + "step": 106840 + }, + { + "epoch": 15.16678495386799, + "grad_norm": 5.016585350036621, + "learning_rate": 8.483974449964515e-05, + "loss": 0.009421862661838531, + "step": 106850 + }, + { + "epoch": 15.168204400283889, + "grad_norm": 0.25573283433914185, + "learning_rate": 8.483832505322924e-05, + "loss": 0.0481646716594696, + "step": 106860 + }, + { + "epoch": 15.169623846699787, + "grad_norm": 0.49223005771636963, + "learning_rate": 8.483690560681335e-05, + "loss": 0.05817559361457825, + "step": 106870 + }, + { + "epoch": 15.171043293115686, + "grad_norm": 0.015384448692202568, + "learning_rate": 8.483548616039744e-05, + "loss": 0.02148095369338989, + "step": 106880 + }, + { + "epoch": 15.172462739531582, + "grad_norm": 0.05889136344194412, + "learning_rate": 8.483406671398156e-05, + "loss": 
0.003951547667384148, + "step": 106890 + }, + { + "epoch": 15.17388218594748, + "grad_norm": 0.9542629718780518, + "learning_rate": 8.483264726756565e-05, + "loss": 0.03866635262966156, + "step": 106900 + }, + { + "epoch": 15.175301632363379, + "grad_norm": 0.1657920777797699, + "learning_rate": 8.483122782114975e-05, + "loss": 0.033595597743988036, + "step": 106910 + }, + { + "epoch": 15.176721078779275, + "grad_norm": 14.490575790405273, + "learning_rate": 8.482980837473385e-05, + "loss": 0.020913679897785187, + "step": 106920 + }, + { + "epoch": 15.178140525195174, + "grad_norm": 0.5272268652915955, + "learning_rate": 8.482838892831796e-05, + "loss": 0.008652272075414658, + "step": 106930 + }, + { + "epoch": 15.179559971611072, + "grad_norm": 0.7444571256637573, + "learning_rate": 8.482696948190207e-05, + "loss": 0.007965491712093353, + "step": 106940 + }, + { + "epoch": 15.18097941802697, + "grad_norm": 0.338965505361557, + "learning_rate": 8.482555003548617e-05, + "loss": 0.02447792887687683, + "step": 106950 + }, + { + "epoch": 15.182398864442867, + "grad_norm": 9.44382095336914, + "learning_rate": 8.482413058907028e-05, + "loss": 0.015413524210453033, + "step": 106960 + }, + { + "epoch": 15.183818310858765, + "grad_norm": 0.007815083488821983, + "learning_rate": 8.482271114265436e-05, + "loss": 0.12182474136352539, + "step": 106970 + }, + { + "epoch": 15.185237757274663, + "grad_norm": 4.471283435821533, + "learning_rate": 8.482129169623847e-05, + "loss": 0.029788649082183837, + "step": 106980 + }, + { + "epoch": 15.18665720369056, + "grad_norm": 19.09647560119629, + "learning_rate": 8.481987224982257e-05, + "loss": 0.03674539923667908, + "step": 106990 + }, + { + "epoch": 15.188076650106458, + "grad_norm": 0.18893034756183624, + "learning_rate": 8.481845280340668e-05, + "loss": 0.014733706414699555, + "step": 107000 + }, + { + "epoch": 15.188076650106458, + "eval_accuracy": 0.9782539581611241, + "eval_loss": 0.08096309751272202, + "eval_runtime": 31.867, + 
"eval_samples_per_second": 493.52, + "eval_steps_per_second": 15.439, + "step": 107000 + }, + { + "epoch": 15.189496096522356, + "grad_norm": 1.9686450958251953, + "learning_rate": 8.481703335699078e-05, + "loss": 0.038543257117271426, + "step": 107010 + }, + { + "epoch": 15.190915542938255, + "grad_norm": 2.9802894592285156, + "learning_rate": 8.481561391057488e-05, + "loss": 0.014302444458007813, + "step": 107020 + }, + { + "epoch": 15.192334989354151, + "grad_norm": 0.6058565974235535, + "learning_rate": 8.481419446415899e-05, + "loss": 0.05712045431137085, + "step": 107030 + }, + { + "epoch": 15.19375443577005, + "grad_norm": 0.004902772139757872, + "learning_rate": 8.481277501774308e-05, + "loss": 0.012648455798625946, + "step": 107040 + }, + { + "epoch": 15.195173882185948, + "grad_norm": 7.411450386047363, + "learning_rate": 8.48113555713272e-05, + "loss": 0.0784771203994751, + "step": 107050 + }, + { + "epoch": 15.196593328601844, + "grad_norm": 0.8683082461357117, + "learning_rate": 8.480993612491129e-05, + "loss": 0.026170209050178528, + "step": 107060 + }, + { + "epoch": 15.198012775017743, + "grad_norm": 0.04517837241292, + "learning_rate": 8.480851667849539e-05, + "loss": 0.0017347116023302077, + "step": 107070 + }, + { + "epoch": 15.199432221433641, + "grad_norm": 13.40206241607666, + "learning_rate": 8.480709723207949e-05, + "loss": 0.045030486583709714, + "step": 107080 + }, + { + "epoch": 15.20085166784954, + "grad_norm": 1.9814329147338867, + "learning_rate": 8.48056777856636e-05, + "loss": 0.03218642473220825, + "step": 107090 + }, + { + "epoch": 15.202271114265436, + "grad_norm": 1.8605858087539673, + "learning_rate": 8.48042583392477e-05, + "loss": 0.02088761329650879, + "step": 107100 + }, + { + "epoch": 15.203690560681334, + "grad_norm": 0.3231185972690582, + "learning_rate": 8.48028388928318e-05, + "loss": 0.03878544867038727, + "step": 107110 + }, + { + "epoch": 15.205110007097232, + "grad_norm": 1.338391661643982, + "learning_rate": 
8.48014194464159e-05, + "loss": 0.019712454080581664, + "step": 107120 + }, + { + "epoch": 15.206529453513129, + "grad_norm": 0.7573134899139404, + "learning_rate": 8.48e-05, + "loss": 0.030023956298828126, + "step": 107130 + }, + { + "epoch": 15.207948899929027, + "grad_norm": 0.08245470374822617, + "learning_rate": 8.479858055358411e-05, + "loss": 0.008657753467559814, + "step": 107140 + }, + { + "epoch": 15.209368346344926, + "grad_norm": 4.181887149810791, + "learning_rate": 8.479716110716821e-05, + "loss": 0.017116375267505646, + "step": 107150 + }, + { + "epoch": 15.210787792760824, + "grad_norm": 0.013549786061048508, + "learning_rate": 8.479574166075232e-05, + "loss": 0.01756223291158676, + "step": 107160 + }, + { + "epoch": 15.21220723917672, + "grad_norm": 5.840518951416016, + "learning_rate": 8.47943222143364e-05, + "loss": 0.0685306191444397, + "step": 107170 + }, + { + "epoch": 15.213626685592619, + "grad_norm": 0.18977539241313934, + "learning_rate": 8.479290276792052e-05, + "loss": 0.033226925134658816, + "step": 107180 + }, + { + "epoch": 15.215046132008517, + "grad_norm": 0.1447950005531311, + "learning_rate": 8.479148332150461e-05, + "loss": 0.026240897178649903, + "step": 107190 + }, + { + "epoch": 15.216465578424414, + "grad_norm": 0.12102290987968445, + "learning_rate": 8.479006387508872e-05, + "loss": 0.0039006978273391723, + "step": 107200 + }, + { + "epoch": 15.217885024840312, + "grad_norm": 0.09665011614561081, + "learning_rate": 8.478864442867282e-05, + "loss": 0.009919333457946777, + "step": 107210 + }, + { + "epoch": 15.21930447125621, + "grad_norm": 0.2344123125076294, + "learning_rate": 8.478722498225692e-05, + "loss": 0.042432117462158206, + "step": 107220 + }, + { + "epoch": 15.220723917672109, + "grad_norm": 1.6214635372161865, + "learning_rate": 8.478580553584103e-05, + "loss": 0.010458789765834808, + "step": 107230 + }, + { + "epoch": 15.222143364088005, + "grad_norm": 0.058471862226724625, + "learning_rate": 
8.478438608942513e-05, + "loss": 0.0985255241394043, + "step": 107240 + }, + { + "epoch": 15.223562810503903, + "grad_norm": 5.8149495124816895, + "learning_rate": 8.478296664300924e-05, + "loss": 0.025858157873153688, + "step": 107250 + }, + { + "epoch": 15.224982256919802, + "grad_norm": 0.12123987078666687, + "learning_rate": 8.478154719659333e-05, + "loss": 0.021722464263439177, + "step": 107260 + }, + { + "epoch": 15.2264017033357, + "grad_norm": 0.7817923426628113, + "learning_rate": 8.478012775017743e-05, + "loss": 0.01945972293615341, + "step": 107270 + }, + { + "epoch": 15.227821149751597, + "grad_norm": 0.03892279788851738, + "learning_rate": 8.477870830376153e-05, + "loss": 0.009514982998371124, + "step": 107280 + }, + { + "epoch": 15.229240596167495, + "grad_norm": 1.8403258323669434, + "learning_rate": 8.477728885734564e-05, + "loss": 0.04027565121650696, + "step": 107290 + }, + { + "epoch": 15.230660042583393, + "grad_norm": 0.049720872193574905, + "learning_rate": 8.477586941092974e-05, + "loss": 0.012929567694664001, + "step": 107300 + }, + { + "epoch": 15.23207948899929, + "grad_norm": 1.8823243379592896, + "learning_rate": 8.477444996451385e-05, + "loss": 0.02587153613567352, + "step": 107310 + }, + { + "epoch": 15.233498935415188, + "grad_norm": 0.21937565505504608, + "learning_rate": 8.477303051809795e-05, + "loss": 0.026926514506340028, + "step": 107320 + }, + { + "epoch": 15.234918381831086, + "grad_norm": 0.034718845039606094, + "learning_rate": 8.477161107168204e-05, + "loss": 0.0010982461273670197, + "step": 107330 + }, + { + "epoch": 15.236337828246985, + "grad_norm": 0.1314888447523117, + "learning_rate": 8.477019162526615e-05, + "loss": 0.032864326238632204, + "step": 107340 + }, + { + "epoch": 15.237757274662881, + "grad_norm": 0.024118224158883095, + "learning_rate": 8.476877217885025e-05, + "loss": 0.004268708452582359, + "step": 107350 + }, + { + "epoch": 15.23917672107878, + "grad_norm": 0.0030872609931975603, + "learning_rate": 
8.476735273243436e-05, + "loss": 0.014760005474090575, + "step": 107360 + }, + { + "epoch": 15.240596167494678, + "grad_norm": 0.00926938932389021, + "learning_rate": 8.476593328601846e-05, + "loss": 0.044267669320106506, + "step": 107370 + }, + { + "epoch": 15.242015613910574, + "grad_norm": 0.8168309330940247, + "learning_rate": 8.476451383960256e-05, + "loss": 0.007230883836746216, + "step": 107380 + }, + { + "epoch": 15.243435060326473, + "grad_norm": 0.06650938093662262, + "learning_rate": 8.476309439318665e-05, + "loss": 0.0143302783370018, + "step": 107390 + }, + { + "epoch": 15.24485450674237, + "grad_norm": 1.2258223295211792, + "learning_rate": 8.476167494677077e-05, + "loss": 0.0024061430245637893, + "step": 107400 + }, + { + "epoch": 15.24627395315827, + "grad_norm": 0.09853166341781616, + "learning_rate": 8.476025550035486e-05, + "loss": 0.0160846084356308, + "step": 107410 + }, + { + "epoch": 15.247693399574166, + "grad_norm": 5.717872619628906, + "learning_rate": 8.475883605393897e-05, + "loss": 0.027300027012825013, + "step": 107420 + }, + { + "epoch": 15.249112845990064, + "grad_norm": 6.418786525726318, + "learning_rate": 8.475741660752307e-05, + "loss": 0.017071712017059325, + "step": 107430 + }, + { + "epoch": 15.250532292405962, + "grad_norm": 0.07067640870809555, + "learning_rate": 8.475599716110717e-05, + "loss": 0.024442729353904725, + "step": 107440 + }, + { + "epoch": 15.251951738821859, + "grad_norm": 1.3273271322250366, + "learning_rate": 8.475457771469128e-05, + "loss": 0.00802953988313675, + "step": 107450 + }, + { + "epoch": 15.253371185237757, + "grad_norm": 6.9608964920043945, + "learning_rate": 8.475315826827538e-05, + "loss": 0.018375779688358306, + "step": 107460 + }, + { + "epoch": 15.254790631653655, + "grad_norm": 0.00919936690479517, + "learning_rate": 8.475173882185949e-05, + "loss": 0.018325018882751464, + "step": 107470 + }, + { + "epoch": 15.256210078069554, + "grad_norm": 1.523152232170105, + "learning_rate": 
8.475031937544357e-05, + "loss": 0.005215193331241608, + "step": 107480 + }, + { + "epoch": 15.25762952448545, + "grad_norm": 0.3632388710975647, + "learning_rate": 8.474889992902768e-05, + "loss": 0.017545858025550844, + "step": 107490 + }, + { + "epoch": 15.259048970901349, + "grad_norm": 1.890334963798523, + "learning_rate": 8.474748048261178e-05, + "loss": 0.03647333979606628, + "step": 107500 + }, + { + "epoch": 15.259048970901349, + "eval_accuracy": 0.9864564125389458, + "eval_loss": 0.04831605404615402, + "eval_runtime": 32.764, + "eval_samples_per_second": 480.009, + "eval_steps_per_second": 15.017, + "step": 107500 + }, + { + "epoch": 15.260468417317247, + "grad_norm": 0.5332909822463989, + "learning_rate": 8.474606103619589e-05, + "loss": 0.01102583259344101, + "step": 107510 + }, + { + "epoch": 15.261887863733143, + "grad_norm": 2.9368104934692383, + "learning_rate": 8.474464158977999e-05, + "loss": 0.07062627673149109, + "step": 107520 + }, + { + "epoch": 15.263307310149042, + "grad_norm": 0.2998633086681366, + "learning_rate": 8.474322214336409e-05, + "loss": 0.011581972986459733, + "step": 107530 + }, + { + "epoch": 15.26472675656494, + "grad_norm": 0.20585720241069794, + "learning_rate": 8.47418026969482e-05, + "loss": 0.023847994208335877, + "step": 107540 + }, + { + "epoch": 15.266146202980838, + "grad_norm": 0.44639578461647034, + "learning_rate": 8.47403832505323e-05, + "loss": 0.02701796293258667, + "step": 107550 + }, + { + "epoch": 15.267565649396735, + "grad_norm": 13.586299896240234, + "learning_rate": 8.47389638041164e-05, + "loss": 0.030145710706710814, + "step": 107560 + }, + { + "epoch": 15.268985095812633, + "grad_norm": 0.13911595940589905, + "learning_rate": 8.47375443577005e-05, + "loss": 0.03382292091846466, + "step": 107570 + }, + { + "epoch": 15.270404542228531, + "grad_norm": 0.6651485562324524, + "learning_rate": 8.47361249112846e-05, + "loss": 0.018535080552101135, + "step": 107580 + }, + { + "epoch": 15.271823988644428, + 
"grad_norm": 10.191615104675293, + "learning_rate": 8.47347054648687e-05, + "loss": 0.038520559668540955, + "step": 107590 + }, + { + "epoch": 15.273243435060326, + "grad_norm": 11.469804763793945, + "learning_rate": 8.473328601845281e-05, + "loss": 0.02716066837310791, + "step": 107600 + }, + { + "epoch": 15.274662881476225, + "grad_norm": 3.524324417114258, + "learning_rate": 8.47318665720369e-05, + "loss": 0.009171813726425171, + "step": 107610 + }, + { + "epoch": 15.276082327892123, + "grad_norm": 0.7207483053207397, + "learning_rate": 8.473044712562102e-05, + "loss": 0.017212912440299988, + "step": 107620 + }, + { + "epoch": 15.27750177430802, + "grad_norm": 9.047149658203125, + "learning_rate": 8.472902767920511e-05, + "loss": 0.03292661309242249, + "step": 107630 + }, + { + "epoch": 15.278921220723918, + "grad_norm": 13.497153282165527, + "learning_rate": 8.472760823278921e-05, + "loss": 0.0390909492969513, + "step": 107640 + }, + { + "epoch": 15.280340667139816, + "grad_norm": 0.6657223701477051, + "learning_rate": 8.472618878637332e-05, + "loss": 0.012965390086174011, + "step": 107650 + }, + { + "epoch": 15.281760113555713, + "grad_norm": 4.909111022949219, + "learning_rate": 8.472476933995742e-05, + "loss": 0.02200748324394226, + "step": 107660 + }, + { + "epoch": 15.283179559971611, + "grad_norm": 6.05718994140625, + "learning_rate": 8.472334989354153e-05, + "loss": 0.009501121938228607, + "step": 107670 + }, + { + "epoch": 15.28459900638751, + "grad_norm": 0.006838800385594368, + "learning_rate": 8.472193044712563e-05, + "loss": 0.035414910316467284, + "step": 107680 + }, + { + "epoch": 15.286018452803408, + "grad_norm": 7.050548553466797, + "learning_rate": 8.472051100070973e-05, + "loss": 0.0719907522201538, + "step": 107690 + }, + { + "epoch": 15.287437899219304, + "grad_norm": 0.3600425124168396, + "learning_rate": 8.471909155429382e-05, + "loss": 0.006389583647251129, + "step": 107700 + }, + { + "epoch": 15.288857345635202, + "grad_norm": 
0.07480444014072418, + "learning_rate": 8.471767210787793e-05, + "loss": 0.023404929041862487, + "step": 107710 + }, + { + "epoch": 15.2902767920511, + "grad_norm": 0.056176621466875076, + "learning_rate": 8.471625266146203e-05, + "loss": 0.005539464205503464, + "step": 107720 + }, + { + "epoch": 15.291696238466997, + "grad_norm": 0.8116849064826965, + "learning_rate": 8.471483321504614e-05, + "loss": 0.01017369031906128, + "step": 107730 + }, + { + "epoch": 15.293115684882896, + "grad_norm": 1.566999912261963, + "learning_rate": 8.471341376863024e-05, + "loss": 0.008306996524333954, + "step": 107740 + }, + { + "epoch": 15.294535131298794, + "grad_norm": 0.6852664351463318, + "learning_rate": 8.471199432221434e-05, + "loss": 0.02082604467868805, + "step": 107750 + }, + { + "epoch": 15.295954577714692, + "grad_norm": 1.0683574676513672, + "learning_rate": 8.471057487579845e-05, + "loss": 0.0872978389263153, + "step": 107760 + }, + { + "epoch": 15.297374024130589, + "grad_norm": 0.01093759760260582, + "learning_rate": 8.470915542938254e-05, + "loss": 0.014742153882980346, + "step": 107770 + }, + { + "epoch": 15.298793470546487, + "grad_norm": 5.044257640838623, + "learning_rate": 8.470773598296666e-05, + "loss": 0.012482000142335891, + "step": 107780 + }, + { + "epoch": 15.300212916962385, + "grad_norm": 0.09109003096818924, + "learning_rate": 8.470631653655074e-05, + "loss": 0.006877711415290833, + "step": 107790 + }, + { + "epoch": 15.301632363378282, + "grad_norm": 13.097904205322266, + "learning_rate": 8.470489709013485e-05, + "loss": 0.018438270688056944, + "step": 107800 + }, + { + "epoch": 15.30305180979418, + "grad_norm": 0.045856866985559464, + "learning_rate": 8.470347764371895e-05, + "loss": 0.006077097356319427, + "step": 107810 + }, + { + "epoch": 15.304471256210078, + "grad_norm": 0.6873058080673218, + "learning_rate": 8.470205819730306e-05, + "loss": 0.024769291281700134, + "step": 107820 + }, + { + "epoch": 15.305890702625977, + "grad_norm": 
9.974763870239258, + "learning_rate": 8.470063875088716e-05, + "loss": 0.02486882209777832, + "step": 107830 + }, + { + "epoch": 15.307310149041873, + "grad_norm": 10.30651569366455, + "learning_rate": 8.469921930447125e-05, + "loss": 0.029645463824272154, + "step": 107840 + }, + { + "epoch": 15.308729595457772, + "grad_norm": 8.100309371948242, + "learning_rate": 8.469779985805536e-05, + "loss": 0.009516823291778564, + "step": 107850 + }, + { + "epoch": 15.31014904187367, + "grad_norm": 0.0811367779970169, + "learning_rate": 8.469638041163946e-05, + "loss": 0.058248645067214964, + "step": 107860 + }, + { + "epoch": 15.311568488289566, + "grad_norm": 4.043773651123047, + "learning_rate": 8.469496096522357e-05, + "loss": 0.027607759833335875, + "step": 107870 + }, + { + "epoch": 15.312987934705465, + "grad_norm": 0.33214518427848816, + "learning_rate": 8.469354151880767e-05, + "loss": 0.03843773007392883, + "step": 107880 + }, + { + "epoch": 15.314407381121363, + "grad_norm": 6.0449604988098145, + "learning_rate": 8.469212207239177e-05, + "loss": 0.013122335076332092, + "step": 107890 + }, + { + "epoch": 15.315826827537261, + "grad_norm": 1.2720344066619873, + "learning_rate": 8.469070262597587e-05, + "loss": 0.00861743837594986, + "step": 107900 + }, + { + "epoch": 15.317246273953158, + "grad_norm": 2.66919207572937, + "learning_rate": 8.468928317955998e-05, + "loss": 0.05666149854660034, + "step": 107910 + }, + { + "epoch": 15.318665720369056, + "grad_norm": 5.7319159507751465, + "learning_rate": 8.468786373314407e-05, + "loss": 0.023085048794746398, + "step": 107920 + }, + { + "epoch": 15.320085166784954, + "grad_norm": 0.14215344190597534, + "learning_rate": 8.468644428672818e-05, + "loss": 0.03705045878887177, + "step": 107930 + }, + { + "epoch": 15.321504613200851, + "grad_norm": 0.012645716778934002, + "learning_rate": 8.468502484031228e-05, + "loss": 0.14912995100021362, + "step": 107940 + }, + { + "epoch": 15.32292405961675, + "grad_norm": 
0.10450851917266846, + "learning_rate": 8.468360539389638e-05, + "loss": 0.008530506491661071, + "step": 107950 + }, + { + "epoch": 15.324343506032648, + "grad_norm": 9.909424781799316, + "learning_rate": 8.468218594748049e-05, + "loss": 0.03583186268806458, + "step": 107960 + }, + { + "epoch": 15.325762952448546, + "grad_norm": 4.21767520904541, + "learning_rate": 8.468076650106459e-05, + "loss": 0.00969957560300827, + "step": 107970 + }, + { + "epoch": 15.327182398864442, + "grad_norm": 0.27315735816955566, + "learning_rate": 8.46793470546487e-05, + "loss": 0.015069955587387085, + "step": 107980 + }, + { + "epoch": 15.32860184528034, + "grad_norm": 4.084084987640381, + "learning_rate": 8.46779276082328e-05, + "loss": 0.014058254659175873, + "step": 107990 + }, + { + "epoch": 15.330021291696239, + "grad_norm": 0.011087893508374691, + "learning_rate": 8.467650816181689e-05, + "loss": 0.0190117746591568, + "step": 108000 + }, + { + "epoch": 15.330021291696239, + "eval_accuracy": 0.9873466013861512, + "eval_loss": 0.03986372798681259, + "eval_runtime": 32.6811, + "eval_samples_per_second": 481.226, + "eval_steps_per_second": 15.055, + "step": 108000 + }, + { + "epoch": 15.331440738112136, + "grad_norm": 3.722987174987793, + "learning_rate": 8.467508871540099e-05, + "loss": 0.03890916705131531, + "step": 108010 + }, + { + "epoch": 15.332860184528034, + "grad_norm": 0.38124150037765503, + "learning_rate": 8.46736692689851e-05, + "loss": 0.01923559159040451, + "step": 108020 + }, + { + "epoch": 15.334279630943932, + "grad_norm": 8.261305809020996, + "learning_rate": 8.46722498225692e-05, + "loss": 0.025743892788887023, + "step": 108030 + }, + { + "epoch": 15.33569907735983, + "grad_norm": 0.09685249626636505, + "learning_rate": 8.467083037615331e-05, + "loss": 0.04506351947784424, + "step": 108040 + }, + { + "epoch": 15.337118523775727, + "grad_norm": 0.41357406973838806, + "learning_rate": 8.466941092973741e-05, + "loss": 0.007587555050849915, + "step": 108050 + }, + { 
+ "epoch": 15.338537970191625, + "grad_norm": 0.4583379030227661, + "learning_rate": 8.46679914833215e-05, + "loss": 0.010946182906627655, + "step": 108060 + }, + { + "epoch": 15.339957416607524, + "grad_norm": 3.243618965148926, + "learning_rate": 8.466657203690562e-05, + "loss": 0.017219507694244386, + "step": 108070 + }, + { + "epoch": 15.34137686302342, + "grad_norm": 1.3947383165359497, + "learning_rate": 8.466515259048971e-05, + "loss": 0.007583361119031906, + "step": 108080 + }, + { + "epoch": 15.342796309439318, + "grad_norm": 0.377756267786026, + "learning_rate": 8.466373314407382e-05, + "loss": 0.014791847765445709, + "step": 108090 + }, + { + "epoch": 15.344215755855217, + "grad_norm": 0.04573707655072212, + "learning_rate": 8.466231369765791e-05, + "loss": 0.01001567840576172, + "step": 108100 + }, + { + "epoch": 15.345635202271115, + "grad_norm": 0.047843072563409805, + "learning_rate": 8.466089425124202e-05, + "loss": 0.0023218248039484023, + "step": 108110 + }, + { + "epoch": 15.347054648687012, + "grad_norm": 0.05948958918452263, + "learning_rate": 8.465947480482612e-05, + "loss": 0.02049640268087387, + "step": 108120 + }, + { + "epoch": 15.34847409510291, + "grad_norm": 0.03125576674938202, + "learning_rate": 8.465805535841023e-05, + "loss": 0.006368052214384079, + "step": 108130 + }, + { + "epoch": 15.349893541518808, + "grad_norm": 4.6414313316345215, + "learning_rate": 8.465663591199434e-05, + "loss": 0.04324187040328979, + "step": 108140 + }, + { + "epoch": 15.351312987934705, + "grad_norm": 11.299116134643555, + "learning_rate": 8.465521646557842e-05, + "loss": 0.02383486181497574, + "step": 108150 + }, + { + "epoch": 15.352732434350603, + "grad_norm": 3.3975841999053955, + "learning_rate": 8.465379701916253e-05, + "loss": 0.024656203389167786, + "step": 108160 + }, + { + "epoch": 15.354151880766501, + "grad_norm": 0.009448827244341373, + "learning_rate": 8.465237757274663e-05, + "loss": 0.00397581048309803, + "step": 108170 + }, + { + 
"epoch": 15.3555713271824, + "grad_norm": 2.3756935596466064, + "learning_rate": 8.465095812633074e-05, + "loss": 0.014156597852706908, + "step": 108180 + }, + { + "epoch": 15.356990773598296, + "grad_norm": 5.241497993469238, + "learning_rate": 8.464953867991484e-05, + "loss": 0.039077538251876834, + "step": 108190 + }, + { + "epoch": 15.358410220014195, + "grad_norm": 0.07957743853330612, + "learning_rate": 8.464811923349894e-05, + "loss": 0.08788187503814697, + "step": 108200 + }, + { + "epoch": 15.359829666430093, + "grad_norm": 0.07085578143596649, + "learning_rate": 8.464669978708303e-05, + "loss": 0.03421534895896912, + "step": 108210 + }, + { + "epoch": 15.36124911284599, + "grad_norm": 0.024479210376739502, + "learning_rate": 8.464528034066714e-05, + "loss": 0.006818431615829468, + "step": 108220 + }, + { + "epoch": 15.362668559261888, + "grad_norm": 0.16495859622955322, + "learning_rate": 8.464386089425124e-05, + "loss": 0.04878163933753967, + "step": 108230 + }, + { + "epoch": 15.364088005677786, + "grad_norm": 0.0742361769080162, + "learning_rate": 8.464244144783535e-05, + "loss": 0.02441754937171936, + "step": 108240 + }, + { + "epoch": 15.365507452093684, + "grad_norm": 1.2929010391235352, + "learning_rate": 8.464102200141945e-05, + "loss": 0.010754087567329406, + "step": 108250 + }, + { + "epoch": 15.36692689850958, + "grad_norm": 4.1936492919921875, + "learning_rate": 8.463960255500355e-05, + "loss": 0.0323737770318985, + "step": 108260 + }, + { + "epoch": 15.36834634492548, + "grad_norm": 0.1088828593492508, + "learning_rate": 8.463818310858766e-05, + "loss": 0.0027613572776317595, + "step": 108270 + }, + { + "epoch": 15.369765791341377, + "grad_norm": 0.1193610355257988, + "learning_rate": 8.463676366217176e-05, + "loss": 0.04451070725917816, + "step": 108280 + }, + { + "epoch": 15.371185237757274, + "grad_norm": 0.012951977550983429, + "learning_rate": 8.463534421575587e-05, + "loss": 0.04991414546966553, + "step": 108290 + }, + { + "epoch": 
15.372604684173172, + "grad_norm": 0.5242592096328735, + "learning_rate": 8.463392476933995e-05, + "loss": 0.017808882892131804, + "step": 108300 + }, + { + "epoch": 15.37402413058907, + "grad_norm": 0.4178023338317871, + "learning_rate": 8.463250532292406e-05, + "loss": 0.054301905632019046, + "step": 108310 + }, + { + "epoch": 15.375443577004969, + "grad_norm": 0.2042308747768402, + "learning_rate": 8.463108587650816e-05, + "loss": 0.030471009016036988, + "step": 108320 + }, + { + "epoch": 15.376863023420865, + "grad_norm": 3.57515025138855, + "learning_rate": 8.462966643009227e-05, + "loss": 0.06504546403884888, + "step": 108330 + }, + { + "epoch": 15.378282469836764, + "grad_norm": 1.0132148265838623, + "learning_rate": 8.462824698367638e-05, + "loss": 0.008075962215662003, + "step": 108340 + }, + { + "epoch": 15.379701916252662, + "grad_norm": 0.360158771276474, + "learning_rate": 8.462682753726048e-05, + "loss": 0.06669944524765015, + "step": 108350 + }, + { + "epoch": 15.381121362668559, + "grad_norm": 0.13241221010684967, + "learning_rate": 8.462540809084457e-05, + "loss": 0.021548727154731752, + "step": 108360 + }, + { + "epoch": 15.382540809084457, + "grad_norm": 0.6008273363113403, + "learning_rate": 8.462398864442867e-05, + "loss": 0.014168235659599304, + "step": 108370 + }, + { + "epoch": 15.383960255500355, + "grad_norm": 1.9323714971542358, + "learning_rate": 8.462256919801278e-05, + "loss": 0.043941497802734375, + "step": 108380 + }, + { + "epoch": 15.385379701916253, + "grad_norm": 0.8188585042953491, + "learning_rate": 8.462114975159688e-05, + "loss": 0.04607037603855133, + "step": 108390 + }, + { + "epoch": 15.38679914833215, + "grad_norm": 5.428987503051758, + "learning_rate": 8.461973030518099e-05, + "loss": 0.04405255615711212, + "step": 108400 + }, + { + "epoch": 15.388218594748048, + "grad_norm": 0.852557361125946, + "learning_rate": 8.461831085876508e-05, + "loss": 0.01311432421207428, + "step": 108410 + }, + { + "epoch": 
15.389638041163947, + "grad_norm": 0.11089548468589783, + "learning_rate": 8.461689141234919e-05, + "loss": 0.01127699315547943, + "step": 108420 + }, + { + "epoch": 15.391057487579843, + "grad_norm": 5.423348903656006, + "learning_rate": 8.46154719659333e-05, + "loss": 0.010904674232006074, + "step": 108430 + }, + { + "epoch": 15.392476933995741, + "grad_norm": 0.7922012209892273, + "learning_rate": 8.46140525195174e-05, + "loss": 0.005975948274135589, + "step": 108440 + }, + { + "epoch": 15.39389638041164, + "grad_norm": 0.041600536555051804, + "learning_rate": 8.46126330731015e-05, + "loss": 0.015418723225593567, + "step": 108450 + }, + { + "epoch": 15.395315826827538, + "grad_norm": 3.9857535362243652, + "learning_rate": 8.461121362668559e-05, + "loss": 0.035864454507827756, + "step": 108460 + }, + { + "epoch": 15.396735273243435, + "grad_norm": 1.131729245185852, + "learning_rate": 8.46097941802697e-05, + "loss": 0.017155754566192626, + "step": 108470 + }, + { + "epoch": 15.398154719659333, + "grad_norm": 1.1946663856506348, + "learning_rate": 8.46083747338538e-05, + "loss": 0.0766279399394989, + "step": 108480 + }, + { + "epoch": 15.399574166075231, + "grad_norm": 3.973695993423462, + "learning_rate": 8.460695528743791e-05, + "loss": 0.012332093715667725, + "step": 108490 + }, + { + "epoch": 15.400993612491128, + "grad_norm": 7.021789073944092, + "learning_rate": 8.4605535841022e-05, + "loss": 0.020251087844371796, + "step": 108500 + }, + { + "epoch": 15.400993612491128, + "eval_accuracy": 0.9862020728683156, + "eval_loss": 0.04970347508788109, + "eval_runtime": 31.7417, + "eval_samples_per_second": 495.469, + "eval_steps_per_second": 15.5, + "step": 108500 + }, + { + "epoch": 15.402413058907026, + "grad_norm": 0.031689513474702835, + "learning_rate": 8.46041163946061e-05, + "loss": 0.02430199831724167, + "step": 108510 + }, + { + "epoch": 15.403832505322924, + "grad_norm": 0.1859818547964096, + "learning_rate": 8.460269694819021e-05, + "loss": 
0.021585284173488616, + "step": 108520 + }, + { + "epoch": 15.405251951738823, + "grad_norm": 0.9980630278587341, + "learning_rate": 8.460127750177431e-05, + "loss": 0.0029014710336923598, + "step": 108530 + }, + { + "epoch": 15.40667139815472, + "grad_norm": 0.061858199536800385, + "learning_rate": 8.459985805535842e-05, + "loss": 0.012076663225889206, + "step": 108540 + }, + { + "epoch": 15.408090844570618, + "grad_norm": 7.55286979675293, + "learning_rate": 8.459843860894252e-05, + "loss": 0.016109907627105714, + "step": 108550 + }, + { + "epoch": 15.409510290986516, + "grad_norm": 0.013273878023028374, + "learning_rate": 8.459701916252662e-05, + "loss": 0.025737404823303223, + "step": 108560 + }, + { + "epoch": 15.410929737402412, + "grad_norm": 0.9089930057525635, + "learning_rate": 8.459559971611071e-05, + "loss": 0.019002526998519897, + "step": 108570 + }, + { + "epoch": 15.41234918381831, + "grad_norm": 0.5245806574821472, + "learning_rate": 8.459418026969483e-05, + "loss": 0.015612088143825531, + "step": 108580 + }, + { + "epoch": 15.413768630234209, + "grad_norm": 3.1340649127960205, + "learning_rate": 8.459276082327892e-05, + "loss": 0.015143591165542602, + "step": 108590 + }, + { + "epoch": 15.415188076650107, + "grad_norm": 0.5625861287117004, + "learning_rate": 8.459134137686303e-05, + "loss": 0.014427962899208068, + "step": 108600 + }, + { + "epoch": 15.416607523066004, + "grad_norm": 0.2291407436132431, + "learning_rate": 8.458992193044713e-05, + "loss": 0.03853601813316345, + "step": 108610 + }, + { + "epoch": 15.418026969481902, + "grad_norm": 0.009141476824879646, + "learning_rate": 8.458850248403123e-05, + "loss": 0.005736962333321572, + "step": 108620 + }, + { + "epoch": 15.4194464158978, + "grad_norm": 0.04585874825716019, + "learning_rate": 8.458708303761534e-05, + "loss": 0.030605682730674745, + "step": 108630 + }, + { + "epoch": 15.420865862313697, + "grad_norm": 0.16862089931964874, + "learning_rate": 8.458566359119944e-05, + "loss": 
0.03745960295200348, + "step": 108640 + }, + { + "epoch": 15.422285308729595, + "grad_norm": 0.052765995264053345, + "learning_rate": 8.458424414478355e-05, + "loss": 0.01019173339009285, + "step": 108650 + }, + { + "epoch": 15.423704755145494, + "grad_norm": 0.11879035085439682, + "learning_rate": 8.458282469836763e-05, + "loss": 0.002845485135912895, + "step": 108660 + }, + { + "epoch": 15.425124201561392, + "grad_norm": 8.916354179382324, + "learning_rate": 8.458140525195174e-05, + "loss": 0.023224112391471863, + "step": 108670 + }, + { + "epoch": 15.426543647977288, + "grad_norm": 1.4591403007507324, + "learning_rate": 8.457998580553584e-05, + "loss": 0.024321596324443816, + "step": 108680 + }, + { + "epoch": 15.427963094393187, + "grad_norm": 1.928444743156433, + "learning_rate": 8.457856635911995e-05, + "loss": 0.01079963445663452, + "step": 108690 + }, + { + "epoch": 15.429382540809085, + "grad_norm": 2.116582155227661, + "learning_rate": 8.457714691270405e-05, + "loss": 0.0035361595451831817, + "step": 108700 + }, + { + "epoch": 15.430801987224982, + "grad_norm": 0.03351214528083801, + "learning_rate": 8.457572746628816e-05, + "loss": 0.03481769263744354, + "step": 108710 + }, + { + "epoch": 15.43222143364088, + "grad_norm": 0.4059963822364807, + "learning_rate": 8.457430801987226e-05, + "loss": 0.0201575830578804, + "step": 108720 + }, + { + "epoch": 15.433640880056778, + "grad_norm": 0.11144604533910751, + "learning_rate": 8.457288857345635e-05, + "loss": 0.02286584973335266, + "step": 108730 + }, + { + "epoch": 15.435060326472676, + "grad_norm": 0.06553922593593597, + "learning_rate": 8.457146912704046e-05, + "loss": 0.0369859516620636, + "step": 108740 + }, + { + "epoch": 15.436479772888573, + "grad_norm": 14.971281051635742, + "learning_rate": 8.457004968062456e-05, + "loss": 0.06031980514526367, + "step": 108750 + }, + { + "epoch": 15.437899219304471, + "grad_norm": 9.861590385437012, + "learning_rate": 8.456863023420867e-05, + "loss": 
0.03545091152191162, + "step": 108760 + }, + { + "epoch": 15.43931866572037, + "grad_norm": 0.8584105372428894, + "learning_rate": 8.456721078779276e-05, + "loss": 0.02768990397453308, + "step": 108770 + }, + { + "epoch": 15.440738112136266, + "grad_norm": 0.7349197864532471, + "learning_rate": 8.456579134137687e-05, + "loss": 0.04703467786312103, + "step": 108780 + }, + { + "epoch": 15.442157558552164, + "grad_norm": 1.0171449184417725, + "learning_rate": 8.456437189496097e-05, + "loss": 0.008258529007434845, + "step": 108790 + }, + { + "epoch": 15.443577004968063, + "grad_norm": 0.018382063135504723, + "learning_rate": 8.456295244854508e-05, + "loss": 0.01452644169330597, + "step": 108800 + }, + { + "epoch": 15.444996451383961, + "grad_norm": 12.21877384185791, + "learning_rate": 8.456153300212917e-05, + "loss": 0.0642141580581665, + "step": 108810 + }, + { + "epoch": 15.446415897799858, + "grad_norm": 13.660399436950684, + "learning_rate": 8.456011355571327e-05, + "loss": 0.0224165216088295, + "step": 108820 + }, + { + "epoch": 15.447835344215756, + "grad_norm": 1.5048636198043823, + "learning_rate": 8.455869410929738e-05, + "loss": 0.049782159924507144, + "step": 108830 + }, + { + "epoch": 15.449254790631654, + "grad_norm": 0.023611735552549362, + "learning_rate": 8.455727466288148e-05, + "loss": 0.04725245237350464, + "step": 108840 + }, + { + "epoch": 15.45067423704755, + "grad_norm": 3.5750725269317627, + "learning_rate": 8.455585521646559e-05, + "loss": 0.00850258320569992, + "step": 108850 + }, + { + "epoch": 15.452093683463449, + "grad_norm": 0.11319718509912491, + "learning_rate": 8.455443577004969e-05, + "loss": 0.009740934520959855, + "step": 108860 + }, + { + "epoch": 15.453513129879347, + "grad_norm": 0.16189701855182648, + "learning_rate": 8.455301632363378e-05, + "loss": 0.01002160757780075, + "step": 108870 + }, + { + "epoch": 15.454932576295246, + "grad_norm": 0.027291517704725266, + "learning_rate": 8.455159687721788e-05, + "loss": 
0.020919431746006013, + "step": 108880 + }, + { + "epoch": 15.456352022711142, + "grad_norm": 0.10434415936470032, + "learning_rate": 8.455017743080199e-05, + "loss": 0.03278235793113708, + "step": 108890 + }, + { + "epoch": 15.45777146912704, + "grad_norm": 0.10862031579017639, + "learning_rate": 8.454875798438609e-05, + "loss": 0.013124911487102509, + "step": 108900 + }, + { + "epoch": 15.459190915542939, + "grad_norm": 0.21281233429908752, + "learning_rate": 8.45473385379702e-05, + "loss": 0.04097933173179626, + "step": 108910 + }, + { + "epoch": 15.460610361958835, + "grad_norm": 0.7787414193153381, + "learning_rate": 8.45459190915543e-05, + "loss": 0.02274901270866394, + "step": 108920 + }, + { + "epoch": 15.462029808374734, + "grad_norm": 2.227569818496704, + "learning_rate": 8.45444996451384e-05, + "loss": 0.012134695053100586, + "step": 108930 + }, + { + "epoch": 15.463449254790632, + "grad_norm": 0.4353095591068268, + "learning_rate": 8.454308019872251e-05, + "loss": 0.017594021558761597, + "step": 108940 + }, + { + "epoch": 15.46486870120653, + "grad_norm": 0.0655631497502327, + "learning_rate": 8.45416607523066e-05, + "loss": 0.015526409447193145, + "step": 108950 + }, + { + "epoch": 15.466288147622427, + "grad_norm": 0.017096268013119698, + "learning_rate": 8.454024130589072e-05, + "loss": 0.0432012140750885, + "step": 108960 + }, + { + "epoch": 15.467707594038325, + "grad_norm": 4.582834720611572, + "learning_rate": 8.45388218594748e-05, + "loss": 0.02698971927165985, + "step": 108970 + }, + { + "epoch": 15.469127040454223, + "grad_norm": 0.517230749130249, + "learning_rate": 8.453740241305891e-05, + "loss": 0.052729862928390506, + "step": 108980 + }, + { + "epoch": 15.47054648687012, + "grad_norm": 0.03594077005982399, + "learning_rate": 8.453598296664301e-05, + "loss": 0.014178904891014098, + "step": 108990 + }, + { + "epoch": 15.471965933286018, + "grad_norm": 3.0221614837646484, + "learning_rate": 8.453456352022712e-05, + "loss": 
0.02753186821937561, + "step": 109000 + }, + { + "epoch": 15.471965933286018, + "eval_accuracy": 0.9848032046798499, + "eval_loss": 0.05491510033607483, + "eval_runtime": 32.6115, + "eval_samples_per_second": 482.254, + "eval_steps_per_second": 15.087, + "step": 109000 + }, + { + "epoch": 15.473385379701917, + "grad_norm": 0.011246667243540287, + "learning_rate": 8.453314407381122e-05, + "loss": 0.011115138232707978, + "step": 109010 + }, + { + "epoch": 15.474804826117815, + "grad_norm": 0.08684574067592621, + "learning_rate": 8.453172462739531e-05, + "loss": 0.014247065782546997, + "step": 109020 + }, + { + "epoch": 15.476224272533711, + "grad_norm": 6.032614231109619, + "learning_rate": 8.453030518097942e-05, + "loss": 0.02520335018634796, + "step": 109030 + }, + { + "epoch": 15.47764371894961, + "grad_norm": 0.05028625950217247, + "learning_rate": 8.452888573456352e-05, + "loss": 0.018765005469322204, + "step": 109040 + }, + { + "epoch": 15.479063165365508, + "grad_norm": 0.1101774200797081, + "learning_rate": 8.452746628814763e-05, + "loss": 0.06100413203239441, + "step": 109050 + }, + { + "epoch": 15.480482611781405, + "grad_norm": 0.01917419768869877, + "learning_rate": 8.452604684173173e-05, + "loss": 0.023857808113098143, + "step": 109060 + }, + { + "epoch": 15.481902058197303, + "grad_norm": 0.9928895235061646, + "learning_rate": 8.452462739531584e-05, + "loss": 0.03471379578113556, + "step": 109070 + }, + { + "epoch": 15.483321504613201, + "grad_norm": 0.9378594160079956, + "learning_rate": 8.452320794889992e-05, + "loss": 0.04051099121570587, + "step": 109080 + }, + { + "epoch": 15.4847409510291, + "grad_norm": 0.3512844443321228, + "learning_rate": 8.452178850248404e-05, + "loss": 0.026929941773414613, + "step": 109090 + }, + { + "epoch": 15.486160397444996, + "grad_norm": 0.055643901228904724, + "learning_rate": 8.452036905606813e-05, + "loss": 0.04442196190357208, + "step": 109100 + }, + { + "epoch": 15.487579843860894, + "grad_norm": 
0.3258360028266907, + "learning_rate": 8.451894960965224e-05, + "loss": 0.006144022569060326, + "step": 109110 + }, + { + "epoch": 15.488999290276793, + "grad_norm": 0.4137914180755615, + "learning_rate": 8.451753016323634e-05, + "loss": 0.010329674184322356, + "step": 109120 + }, + { + "epoch": 15.490418736692689, + "grad_norm": 4.536850929260254, + "learning_rate": 8.451625266146204e-05, + "loss": 0.08604642748832703, + "step": 109130 + }, + { + "epoch": 15.491838183108587, + "grad_norm": 3.1179816722869873, + "learning_rate": 8.451483321504614e-05, + "loss": 0.010609415918588638, + "step": 109140 + }, + { + "epoch": 15.493257629524486, + "grad_norm": 3.4988842010498047, + "learning_rate": 8.451341376863023e-05, + "loss": 0.023094484210014345, + "step": 109150 + }, + { + "epoch": 15.494677075940384, + "grad_norm": 0.053110271692276, + "learning_rate": 8.451199432221433e-05, + "loss": 0.02963247299194336, + "step": 109160 + }, + { + "epoch": 15.49609652235628, + "grad_norm": 8.496163368225098, + "learning_rate": 8.451057487579844e-05, + "loss": 0.008166144788265228, + "step": 109170 + }, + { + "epoch": 15.497515968772179, + "grad_norm": 0.19316455721855164, + "learning_rate": 8.450915542938255e-05, + "loss": 0.008004320412874221, + "step": 109180 + }, + { + "epoch": 15.498935415188077, + "grad_norm": 0.49412721395492554, + "learning_rate": 8.450773598296665e-05, + "loss": 0.014553853869438171, + "step": 109190 + }, + { + "epoch": 15.500354861603974, + "grad_norm": 8.566283226013184, + "learning_rate": 8.450631653655075e-05, + "loss": 0.027910608053207397, + "step": 109200 + }, + { + "epoch": 15.501774308019872, + "grad_norm": 7.212894916534424, + "learning_rate": 8.450489709013485e-05, + "loss": 0.040905225276947024, + "step": 109210 + }, + { + "epoch": 15.50319375443577, + "grad_norm": 8.938632011413574, + "learning_rate": 8.450347764371896e-05, + "loss": 0.009672276675701141, + "step": 109220 + }, + { + "epoch": 15.504613200851669, + "grad_norm": 
2.8423256874084473, + "learning_rate": 8.450205819730305e-05, + "loss": 0.02199479639530182, + "step": 109230 + }, + { + "epoch": 15.506032647267565, + "grad_norm": 0.3441467881202698, + "learning_rate": 8.450063875088717e-05, + "loss": 0.006689305603504181, + "step": 109240 + }, + { + "epoch": 15.507452093683463, + "grad_norm": 0.24933524429798126, + "learning_rate": 8.449921930447125e-05, + "loss": 0.04364819526672363, + "step": 109250 + }, + { + "epoch": 15.508871540099362, + "grad_norm": 6.487727165222168, + "learning_rate": 8.449779985805536e-05, + "loss": 0.0283764511346817, + "step": 109260 + }, + { + "epoch": 15.510290986515258, + "grad_norm": 0.009871330112218857, + "learning_rate": 8.449638041163947e-05, + "loss": 0.04828583300113678, + "step": 109270 + }, + { + "epoch": 15.511710432931157, + "grad_norm": 9.64976978302002, + "learning_rate": 8.449496096522357e-05, + "loss": 0.04197643399238586, + "step": 109280 + }, + { + "epoch": 15.513129879347055, + "grad_norm": 1.2106422185897827, + "learning_rate": 8.449354151880768e-05, + "loss": 0.013595214486122132, + "step": 109290 + }, + { + "epoch": 15.514549325762953, + "grad_norm": 4.21859073638916, + "learning_rate": 8.449212207239176e-05, + "loss": 0.07362929582595826, + "step": 109300 + }, + { + "epoch": 15.51596877217885, + "grad_norm": 0.23973767459392548, + "learning_rate": 8.449070262597587e-05, + "loss": 0.014032267034053802, + "step": 109310 + }, + { + "epoch": 15.517388218594748, + "grad_norm": 0.07020927220582962, + "learning_rate": 8.448928317955997e-05, + "loss": 0.03569666743278503, + "step": 109320 + }, + { + "epoch": 15.518807665010646, + "grad_norm": 0.3164190649986267, + "learning_rate": 8.448786373314408e-05, + "loss": 0.014000938832759857, + "step": 109330 + }, + { + "epoch": 15.520227111426543, + "grad_norm": 7.998960018157959, + "learning_rate": 8.448644428672818e-05, + "loss": 0.015060046315193176, + "step": 109340 + }, + { + "epoch": 15.521646557842441, + "grad_norm": 
2.536862373352051, + "learning_rate": 8.448502484031228e-05, + "loss": 0.004766803607344628, + "step": 109350 + }, + { + "epoch": 15.52306600425834, + "grad_norm": 3.9543192386627197, + "learning_rate": 8.448360539389639e-05, + "loss": 0.00784388929605484, + "step": 109360 + }, + { + "epoch": 15.524485450674238, + "grad_norm": 1.2425618171691895, + "learning_rate": 8.448218594748049e-05, + "loss": 0.040623527765274045, + "step": 109370 + }, + { + "epoch": 15.525904897090134, + "grad_norm": 0.3892207741737366, + "learning_rate": 8.44807665010646e-05, + "loss": 0.021851207315921783, + "step": 109380 + }, + { + "epoch": 15.527324343506033, + "grad_norm": 12.130237579345703, + "learning_rate": 8.44793470546487e-05, + "loss": 0.05706889033317566, + "step": 109390 + }, + { + "epoch": 15.528743789921931, + "grad_norm": 4.82382345199585, + "learning_rate": 8.44779276082328e-05, + "loss": 0.04465123414993286, + "step": 109400 + }, + { + "epoch": 15.530163236337827, + "grad_norm": 8.075292587280273, + "learning_rate": 8.447650816181689e-05, + "loss": 0.012319304049015045, + "step": 109410 + }, + { + "epoch": 15.531582682753726, + "grad_norm": 1.7110710144042969, + "learning_rate": 8.4475088715401e-05, + "loss": 0.03167652189731598, + "step": 109420 + }, + { + "epoch": 15.533002129169624, + "grad_norm": 0.03574613481760025, + "learning_rate": 8.44736692689851e-05, + "loss": 0.004731189832091332, + "step": 109430 + }, + { + "epoch": 15.534421575585522, + "grad_norm": 4.071065902709961, + "learning_rate": 8.447224982256921e-05, + "loss": 0.03880989253520965, + "step": 109440 + }, + { + "epoch": 15.535841022001419, + "grad_norm": 0.3194302022457123, + "learning_rate": 8.44708303761533e-05, + "loss": 0.009817516803741455, + "step": 109450 + }, + { + "epoch": 15.537260468417317, + "grad_norm": 0.6993071436882019, + "learning_rate": 8.44694109297374e-05, + "loss": 0.023785218596458435, + "step": 109460 + }, + { + "epoch": 15.538679914833216, + "grad_norm": 0.22547049820423126, + 
"learning_rate": 8.446799148332151e-05, + "loss": 0.004621018841862679, + "step": 109470 + }, + { + "epoch": 15.540099361249112, + "grad_norm": 5.650206089019775, + "learning_rate": 8.446657203690561e-05, + "loss": 0.021305915713310242, + "step": 109480 + }, + { + "epoch": 15.54151880766501, + "grad_norm": 0.619448184967041, + "learning_rate": 8.446515259048972e-05, + "loss": 0.011055320501327515, + "step": 109490 + }, + { + "epoch": 15.542938254080909, + "grad_norm": 0.01749510131776333, + "learning_rate": 8.446373314407382e-05, + "loss": 0.028083550930023193, + "step": 109500 + }, + { + "epoch": 15.542938254080909, + "eval_accuracy": 0.9842945253385896, + "eval_loss": 0.05707499384880066, + "eval_runtime": 33.3307, + "eval_samples_per_second": 471.847, + "eval_steps_per_second": 14.761, + "step": 109500 + }, + { + "epoch": 15.544357700496807, + "grad_norm": 0.015825387090444565, + "learning_rate": 8.446231369765792e-05, + "loss": 0.013564802706241608, + "step": 109510 + }, + { + "epoch": 15.545777146912704, + "grad_norm": 0.11282722651958466, + "learning_rate": 8.446089425124201e-05, + "loss": 0.010410679876804352, + "step": 109520 + }, + { + "epoch": 15.547196593328602, + "grad_norm": 0.02192036435008049, + "learning_rate": 8.445947480482612e-05, + "loss": 0.03194921612739563, + "step": 109530 + }, + { + "epoch": 15.5486160397445, + "grad_norm": 0.06405551731586456, + "learning_rate": 8.445805535841022e-05, + "loss": 0.024281325936317443, + "step": 109540 + }, + { + "epoch": 15.550035486160397, + "grad_norm": 0.48121848702430725, + "learning_rate": 8.445663591199433e-05, + "loss": 0.01762586086988449, + "step": 109550 + }, + { + "epoch": 15.551454932576295, + "grad_norm": 0.06481608003377914, + "learning_rate": 8.445521646557843e-05, + "loss": 0.057471299171447755, + "step": 109560 + }, + { + "epoch": 15.552874378992193, + "grad_norm": 0.8036059141159058, + "learning_rate": 8.445379701916253e-05, + "loss": 0.007377585023641586, + "step": 109570 + }, + { + 
"epoch": 15.554293825408092, + "grad_norm": 6.7593207359313965, + "learning_rate": 8.445237757274664e-05, + "loss": 0.044133511185646054, + "step": 109580 + }, + { + "epoch": 15.555713271823988, + "grad_norm": 6.9386749267578125, + "learning_rate": 8.445095812633074e-05, + "loss": 0.023405832052230836, + "step": 109590 + }, + { + "epoch": 15.557132718239886, + "grad_norm": 0.21647658944129944, + "learning_rate": 8.444953867991485e-05, + "loss": 0.021339884400367735, + "step": 109600 + }, + { + "epoch": 15.558552164655785, + "grad_norm": 1.1516494750976562, + "learning_rate": 8.444811923349893e-05, + "loss": 0.011148992180824279, + "step": 109610 + }, + { + "epoch": 15.559971611071681, + "grad_norm": 1.6991609334945679, + "learning_rate": 8.444669978708304e-05, + "loss": 0.004113087803125382, + "step": 109620 + }, + { + "epoch": 15.56139105748758, + "grad_norm": 0.2822941839694977, + "learning_rate": 8.444528034066714e-05, + "loss": 0.0026031706482172014, + "step": 109630 + }, + { + "epoch": 15.562810503903478, + "grad_norm": 0.015090609900653362, + "learning_rate": 8.444386089425125e-05, + "loss": 0.039816674590110776, + "step": 109640 + }, + { + "epoch": 15.564229950319376, + "grad_norm": 0.018448730930685997, + "learning_rate": 8.444244144783535e-05, + "loss": 0.04701717495918274, + "step": 109650 + }, + { + "epoch": 15.565649396735273, + "grad_norm": 0.252054899930954, + "learning_rate": 8.444102200141944e-05, + "loss": 0.03242330551147461, + "step": 109660 + }, + { + "epoch": 15.567068843151171, + "grad_norm": 0.38011518120765686, + "learning_rate": 8.443960255500356e-05, + "loss": 0.017498777806758882, + "step": 109670 + }, + { + "epoch": 15.56848828956707, + "grad_norm": 7.523592948913574, + "learning_rate": 8.443818310858765e-05, + "loss": 0.009059159457683564, + "step": 109680 + }, + { + "epoch": 15.569907735982966, + "grad_norm": 0.7460406422615051, + "learning_rate": 8.443676366217176e-05, + "loss": 0.010137155652046204, + "step": 109690 + }, + { + 
"epoch": 15.571327182398864, + "grad_norm": 8.281078338623047, + "learning_rate": 8.443534421575586e-05, + "loss": 0.05115741491317749, + "step": 109700 + }, + { + "epoch": 15.572746628814762, + "grad_norm": 5.468574047088623, + "learning_rate": 8.443392476933996e-05, + "loss": 0.06222133040428161, + "step": 109710 + }, + { + "epoch": 15.57416607523066, + "grad_norm": 4.441808223724365, + "learning_rate": 8.443250532292406e-05, + "loss": 0.02200213372707367, + "step": 109720 + }, + { + "epoch": 15.575585521646557, + "grad_norm": 1.1405576467514038, + "learning_rate": 8.443108587650817e-05, + "loss": 0.018508346378803255, + "step": 109730 + }, + { + "epoch": 15.577004968062456, + "grad_norm": 9.524807929992676, + "learning_rate": 8.442966643009226e-05, + "loss": 0.08532507419586181, + "step": 109740 + }, + { + "epoch": 15.578424414478354, + "grad_norm": 3.8635847568511963, + "learning_rate": 8.442824698367638e-05, + "loss": 0.018411895632743834, + "step": 109750 + }, + { + "epoch": 15.57984386089425, + "grad_norm": 0.13418318331241608, + "learning_rate": 8.442682753726047e-05, + "loss": 0.007721404731273651, + "step": 109760 + }, + { + "epoch": 15.581263307310149, + "grad_norm": 0.04814405366778374, + "learning_rate": 8.442540809084457e-05, + "loss": 0.03254770040512085, + "step": 109770 + }, + { + "epoch": 15.582682753726047, + "grad_norm": 0.1322525292634964, + "learning_rate": 8.442398864442868e-05, + "loss": 0.04441567361354828, + "step": 109780 + }, + { + "epoch": 15.584102200141945, + "grad_norm": 0.26600533723831177, + "learning_rate": 8.442256919801278e-05, + "loss": 0.015367360413074493, + "step": 109790 + }, + { + "epoch": 15.585521646557842, + "grad_norm": 0.045342884957790375, + "learning_rate": 8.442114975159689e-05, + "loss": 0.043045997619628906, + "step": 109800 + }, + { + "epoch": 15.58694109297374, + "grad_norm": 0.2467890977859497, + "learning_rate": 8.441973030518099e-05, + "loss": 0.003971902281045913, + "step": 109810 + }, + { + "epoch": 
15.588360539389639, + "grad_norm": 0.08323675394058228, + "learning_rate": 8.441831085876508e-05, + "loss": 0.007375334948301315, + "step": 109820 + }, + { + "epoch": 15.589779985805535, + "grad_norm": 1.4727420806884766, + "learning_rate": 8.441689141234918e-05, + "loss": 0.04179688096046448, + "step": 109830 + }, + { + "epoch": 15.591199432221433, + "grad_norm": 13.468354225158691, + "learning_rate": 8.441547196593329e-05, + "loss": 0.09071345329284668, + "step": 109840 + }, + { + "epoch": 15.592618878637332, + "grad_norm": 15.406740188598633, + "learning_rate": 8.441405251951739e-05, + "loss": 0.04422733187675476, + "step": 109850 + }, + { + "epoch": 15.59403832505323, + "grad_norm": 0.10879992693662643, + "learning_rate": 8.44126330731015e-05, + "loss": 0.06023339033126831, + "step": 109860 + }, + { + "epoch": 15.595457771469126, + "grad_norm": 8.273409843444824, + "learning_rate": 8.44112136266856e-05, + "loss": 0.04545511603355408, + "step": 109870 + }, + { + "epoch": 15.596877217885025, + "grad_norm": 0.15698932111263275, + "learning_rate": 8.44097941802697e-05, + "loss": 0.021031519770622252, + "step": 109880 + }, + { + "epoch": 15.598296664300923, + "grad_norm": 1.2806004285812378, + "learning_rate": 8.44083747338538e-05, + "loss": 0.03998381495475769, + "step": 109890 + }, + { + "epoch": 15.59971611071682, + "grad_norm": 2.8216757774353027, + "learning_rate": 8.44069552874379e-05, + "loss": 0.04080126881599426, + "step": 109900 + }, + { + "epoch": 15.601135557132718, + "grad_norm": 0.4584507942199707, + "learning_rate": 8.440553584102201e-05, + "loss": 0.042827948927879333, + "step": 109910 + }, + { + "epoch": 15.602555003548616, + "grad_norm": 10.623929977416992, + "learning_rate": 8.44041163946061e-05, + "loss": 0.029111909866333007, + "step": 109920 + }, + { + "epoch": 15.603974449964515, + "grad_norm": 0.0433068685233593, + "learning_rate": 8.440269694819021e-05, + "loss": 0.027372494339942932, + "step": 109930 + }, + { + "epoch": 15.605393896380411, 
+ "grad_norm": 1.3678462505340576, + "learning_rate": 8.44012775017743e-05, + "loss": 0.03315771222114563, + "step": 109940 + }, + { + "epoch": 15.60681334279631, + "grad_norm": 0.028468577191233635, + "learning_rate": 8.439985805535842e-05, + "loss": 0.03360509574413299, + "step": 109950 + }, + { + "epoch": 15.608232789212208, + "grad_norm": 11.046137809753418, + "learning_rate": 8.439843860894251e-05, + "loss": 0.061426812410354616, + "step": 109960 + }, + { + "epoch": 15.609652235628104, + "grad_norm": 0.8080819249153137, + "learning_rate": 8.439701916252661e-05, + "loss": 0.025454476475715637, + "step": 109970 + }, + { + "epoch": 15.611071682044003, + "grad_norm": 3.136892557144165, + "learning_rate": 8.439559971611072e-05, + "loss": 0.06736152172088623, + "step": 109980 + }, + { + "epoch": 15.6124911284599, + "grad_norm": 0.2572226822376251, + "learning_rate": 8.439418026969482e-05, + "loss": 0.028138145804405212, + "step": 109990 + }, + { + "epoch": 15.6139105748758, + "grad_norm": 3.7471795082092285, + "learning_rate": 8.439276082327893e-05, + "loss": 0.020710088312625885, + "step": 110000 + }, + { + "epoch": 15.6139105748758, + "eval_accuracy": 0.981941883385261, + "eval_loss": 0.059561554342508316, + "eval_runtime": 32.3129, + "eval_samples_per_second": 486.71, + "eval_steps_per_second": 15.226, + "step": 110000 + }, + { + "epoch": 15.615330021291696, + "grad_norm": 1.7016760110855103, + "learning_rate": 8.439134137686303e-05, + "loss": 0.027257269620895384, + "step": 110010 + }, + { + "epoch": 15.616749467707594, + "grad_norm": 7.8672380447387695, + "learning_rate": 8.438992193044713e-05, + "loss": 0.03369447588920593, + "step": 110020 + }, + { + "epoch": 15.618168914123492, + "grad_norm": 16.262685775756836, + "learning_rate": 8.438850248403122e-05, + "loss": 0.06224566698074341, + "step": 110030 + }, + { + "epoch": 15.619588360539389, + "grad_norm": 0.527633786201477, + "learning_rate": 8.438708303761533e-05, + "loss": 0.041495251655578616, + "step": 
110040 + }, + { + "epoch": 15.621007806955287, + "grad_norm": 1.2803436517715454, + "learning_rate": 8.438566359119943e-05, + "loss": 0.004190302640199661, + "step": 110050 + }, + { + "epoch": 15.622427253371185, + "grad_norm": 0.2544032335281372, + "learning_rate": 8.438424414478354e-05, + "loss": 0.024184998869895936, + "step": 110060 + }, + { + "epoch": 15.623846699787084, + "grad_norm": 0.30951157212257385, + "learning_rate": 8.438282469836764e-05, + "loss": 0.0079819455742836, + "step": 110070 + }, + { + "epoch": 15.62526614620298, + "grad_norm": 0.08560092002153397, + "learning_rate": 8.438140525195174e-05, + "loss": 0.01343330293893814, + "step": 110080 + }, + { + "epoch": 15.626685592618879, + "grad_norm": 0.5142914056777954, + "learning_rate": 8.437998580553585e-05, + "loss": 0.009387575834989548, + "step": 110090 + }, + { + "epoch": 15.628105039034777, + "grad_norm": 0.1999034583568573, + "learning_rate": 8.437856635911995e-05, + "loss": 0.015430738031864167, + "step": 110100 + }, + { + "epoch": 15.629524485450673, + "grad_norm": 8.752143859863281, + "learning_rate": 8.437714691270406e-05, + "loss": 0.01040157824754715, + "step": 110110 + }, + { + "epoch": 15.630943931866572, + "grad_norm": 7.477208614349365, + "learning_rate": 8.437572746628815e-05, + "loss": 0.015335509181022644, + "step": 110120 + }, + { + "epoch": 15.63236337828247, + "grad_norm": 0.04349973425269127, + "learning_rate": 8.437430801987225e-05, + "loss": 0.014721313118934631, + "step": 110130 + }, + { + "epoch": 15.633782824698368, + "grad_norm": 1.6337802410125732, + "learning_rate": 8.437288857345635e-05, + "loss": 0.015643975138664244, + "step": 110140 + }, + { + "epoch": 15.635202271114265, + "grad_norm": 0.07937774807214737, + "learning_rate": 8.437146912704046e-05, + "loss": 0.020302596688270568, + "step": 110150 + }, + { + "epoch": 15.636621717530163, + "grad_norm": 6.785717487335205, + "learning_rate": 8.437004968062456e-05, + "loss": 0.02386097311973572, + "step": 110160 + }, + 
{ + "epoch": 15.638041163946061, + "grad_norm": 3.58707594871521, + "learning_rate": 8.436863023420867e-05, + "loss": 0.004123737290501595, + "step": 110170 + }, + { + "epoch": 15.639460610361958, + "grad_norm": 0.0928313136100769, + "learning_rate": 8.436721078779277e-05, + "loss": 0.03107512891292572, + "step": 110180 + }, + { + "epoch": 15.640880056777856, + "grad_norm": 6.510340213775635, + "learning_rate": 8.436579134137686e-05, + "loss": 0.01583448201417923, + "step": 110190 + }, + { + "epoch": 15.642299503193755, + "grad_norm": 0.04997854679822922, + "learning_rate": 8.436437189496097e-05, + "loss": 0.0402520090341568, + "step": 110200 + }, + { + "epoch": 15.643718949609653, + "grad_norm": 2.4349265098571777, + "learning_rate": 8.436295244854507e-05, + "loss": 0.012507960200309753, + "step": 110210 + }, + { + "epoch": 15.64513839602555, + "grad_norm": 0.037816863507032394, + "learning_rate": 8.436153300212918e-05, + "loss": 0.007110661268234253, + "step": 110220 + }, + { + "epoch": 15.646557842441448, + "grad_norm": 0.1295090615749359, + "learning_rate": 8.436011355571327e-05, + "loss": 0.037706056237220766, + "step": 110230 + }, + { + "epoch": 15.647977288857346, + "grad_norm": 0.10089115798473358, + "learning_rate": 8.435869410929738e-05, + "loss": 0.02398446500301361, + "step": 110240 + }, + { + "epoch": 15.649396735273243, + "grad_norm": 3.9181690216064453, + "learning_rate": 8.435727466288147e-05, + "loss": 0.03424106240272522, + "step": 110250 + }, + { + "epoch": 15.650816181689141, + "grad_norm": 0.01480263751000166, + "learning_rate": 8.435585521646559e-05, + "loss": 0.014413505792617798, + "step": 110260 + }, + { + "epoch": 15.65223562810504, + "grad_norm": 0.007548884954303503, + "learning_rate": 8.435443577004968e-05, + "loss": 0.041290727257728574, + "step": 110270 + }, + { + "epoch": 15.653655074520938, + "grad_norm": 0.3037410378456116, + "learning_rate": 8.435301632363378e-05, + "loss": 0.004957319423556328, + "step": 110280 + }, + { + 
"epoch": 15.655074520936834, + "grad_norm": 0.1113152801990509, + "learning_rate": 8.435159687721789e-05, + "loss": 0.019102156162261963, + "step": 110290 + }, + { + "epoch": 15.656493967352732, + "grad_norm": 0.6818870902061462, + "learning_rate": 8.435017743080199e-05, + "loss": 0.0033379919826984406, + "step": 110300 + }, + { + "epoch": 15.65791341376863, + "grad_norm": 0.11788970977067947, + "learning_rate": 8.43487579843861e-05, + "loss": 0.022930392622947694, + "step": 110310 + }, + { + "epoch": 15.659332860184527, + "grad_norm": 1.483289122581482, + "learning_rate": 8.43473385379702e-05, + "loss": 0.012271914631128311, + "step": 110320 + }, + { + "epoch": 15.660752306600425, + "grad_norm": 0.18088115751743317, + "learning_rate": 8.43459190915543e-05, + "loss": 0.015769003331661223, + "step": 110330 + }, + { + "epoch": 15.662171753016324, + "grad_norm": 0.052908070385456085, + "learning_rate": 8.434449964513839e-05, + "loss": 0.04344092309474945, + "step": 110340 + }, + { + "epoch": 15.663591199432222, + "grad_norm": 6.1623125076293945, + "learning_rate": 8.43430801987225e-05, + "loss": 0.004051884636282921, + "step": 110350 + }, + { + "epoch": 15.665010645848119, + "grad_norm": 5.770845890045166, + "learning_rate": 8.43416607523066e-05, + "loss": 0.004904124140739441, + "step": 110360 + }, + { + "epoch": 15.666430092264017, + "grad_norm": 0.3964264988899231, + "learning_rate": 8.434024130589071e-05, + "loss": 0.007972334325313569, + "step": 110370 + }, + { + "epoch": 15.667849538679915, + "grad_norm": 0.17308181524276733, + "learning_rate": 8.433882185947481e-05, + "loss": 0.043213242292404176, + "step": 110380 + }, + { + "epoch": 15.669268985095812, + "grad_norm": 0.36493051052093506, + "learning_rate": 8.43374024130589e-05, + "loss": 0.03352370262145996, + "step": 110390 + }, + { + "epoch": 15.67068843151171, + "grad_norm": 0.02064143307507038, + "learning_rate": 8.433598296664302e-05, + "loss": 0.013609150052070617, + "step": 110400 + }, + { + "epoch": 
15.672107877927608, + "grad_norm": 8.8279390335083, + "learning_rate": 8.433456352022711e-05, + "loss": 0.017481517791748048, + "step": 110410 + }, + { + "epoch": 15.673527324343507, + "grad_norm": 3.44317364692688, + "learning_rate": 8.433314407381122e-05, + "loss": 0.05524869561195374, + "step": 110420 + }, + { + "epoch": 15.674946770759403, + "grad_norm": 0.13901682198047638, + "learning_rate": 8.433172462739531e-05, + "loss": 0.03441147804260254, + "step": 110430 + }, + { + "epoch": 15.676366217175302, + "grad_norm": 0.07394246757030487, + "learning_rate": 8.433030518097942e-05, + "loss": 0.018186067044734956, + "step": 110440 + }, + { + "epoch": 15.6777856635912, + "grad_norm": 6.032867908477783, + "learning_rate": 8.432888573456352e-05, + "loss": 0.017774075269699097, + "step": 110450 + }, + { + "epoch": 15.679205110007096, + "grad_norm": 2.218717336654663, + "learning_rate": 8.432746628814763e-05, + "loss": 0.04855410158634186, + "step": 110460 + }, + { + "epoch": 15.680624556422995, + "grad_norm": 0.3446206748485565, + "learning_rate": 8.432604684173172e-05, + "loss": 0.0166058674454689, + "step": 110470 + }, + { + "epoch": 15.682044002838893, + "grad_norm": 11.216866493225098, + "learning_rate": 8.432462739531584e-05, + "loss": 0.028833556175231933, + "step": 110480 + }, + { + "epoch": 15.683463449254791, + "grad_norm": 0.23951852321624756, + "learning_rate": 8.432320794889993e-05, + "loss": 0.043452754616737366, + "step": 110490 + }, + { + "epoch": 15.684882895670688, + "grad_norm": 9.91982650756836, + "learning_rate": 8.432178850248403e-05, + "loss": 0.04466983377933502, + "step": 110500 + }, + { + "epoch": 15.684882895670688, + "eval_accuracy": 0.9761556558784257, + "eval_loss": 0.08100114017724991, + "eval_runtime": 32.2852, + "eval_samples_per_second": 487.127, + "eval_steps_per_second": 15.239, + "step": 110500 + }, + { + "epoch": 15.686302342086586, + "grad_norm": 2.133854866027832, + "learning_rate": 8.432036905606814e-05, + "loss": 
0.04842991232872009, + "step": 110510 + }, + { + "epoch": 15.687721788502484, + "grad_norm": 3.1208901405334473, + "learning_rate": 8.431894960965224e-05, + "loss": 0.013027089834213256, + "step": 110520 + }, + { + "epoch": 15.689141234918381, + "grad_norm": 0.02794860303401947, + "learning_rate": 8.431753016323635e-05, + "loss": 0.04402931034564972, + "step": 110530 + }, + { + "epoch": 15.69056068133428, + "grad_norm": 0.09005889296531677, + "learning_rate": 8.431611071682043e-05, + "loss": 0.030572378635406496, + "step": 110540 + }, + { + "epoch": 15.691980127750178, + "grad_norm": 1.786181092262268, + "learning_rate": 8.431469127040454e-05, + "loss": 0.008524458110332488, + "step": 110550 + }, + { + "epoch": 15.693399574166076, + "grad_norm": 0.46687254309654236, + "learning_rate": 8.431327182398864e-05, + "loss": 0.035732558369636534, + "step": 110560 + }, + { + "epoch": 15.694819020581972, + "grad_norm": 1.8484487533569336, + "learning_rate": 8.431185237757275e-05, + "loss": 0.03911701440811157, + "step": 110570 + }, + { + "epoch": 15.69623846699787, + "grad_norm": 11.28540325164795, + "learning_rate": 8.431043293115686e-05, + "loss": 0.04078208804130554, + "step": 110580 + }, + { + "epoch": 15.697657913413769, + "grad_norm": 0.06755779683589935, + "learning_rate": 8.430901348474095e-05, + "loss": 0.01349416971206665, + "step": 110590 + }, + { + "epoch": 15.699077359829666, + "grad_norm": 8.60862922668457, + "learning_rate": 8.430759403832506e-05, + "loss": 0.06120396852493286, + "step": 110600 + }, + { + "epoch": 15.700496806245564, + "grad_norm": 1.085951328277588, + "learning_rate": 8.430617459190916e-05, + "loss": 0.041951301693916324, + "step": 110610 + }, + { + "epoch": 15.701916252661462, + "grad_norm": 4.475429058074951, + "learning_rate": 8.430475514549327e-05, + "loss": 0.053812021017074586, + "step": 110620 + }, + { + "epoch": 15.70333569907736, + "grad_norm": 5.355215549468994, + "learning_rate": 8.430333569907736e-05, + "loss": 
0.015490742027759552, + "step": 110630 + }, + { + "epoch": 15.704755145493257, + "grad_norm": 0.05852344259619713, + "learning_rate": 8.430191625266146e-05, + "loss": 0.002999766170978546, + "step": 110640 + }, + { + "epoch": 15.706174591909155, + "grad_norm": 5.2904767990112305, + "learning_rate": 8.430049680624556e-05, + "loss": 0.011166901141405106, + "step": 110650 + }, + { + "epoch": 15.707594038325054, + "grad_norm": 1.8793965578079224, + "learning_rate": 8.429907735982967e-05, + "loss": 0.035853844881057736, + "step": 110660 + }, + { + "epoch": 15.70901348474095, + "grad_norm": 0.28392934799194336, + "learning_rate": 8.429765791341378e-05, + "loss": 0.008197212964296341, + "step": 110670 + }, + { + "epoch": 15.710432931156848, + "grad_norm": 0.019327659159898758, + "learning_rate": 8.429623846699788e-05, + "loss": 0.02017320692539215, + "step": 110680 + }, + { + "epoch": 15.711852377572747, + "grad_norm": 0.040241993963718414, + "learning_rate": 8.429481902058198e-05, + "loss": 0.036533668637275696, + "step": 110690 + }, + { + "epoch": 15.713271823988645, + "grad_norm": 1.2111730575561523, + "learning_rate": 8.429339957416607e-05, + "loss": 0.052335488796234134, + "step": 110700 + }, + { + "epoch": 15.714691270404542, + "grad_norm": 0.16749830543994904, + "learning_rate": 8.429198012775018e-05, + "loss": 0.0016597557812929153, + "step": 110710 + }, + { + "epoch": 15.71611071682044, + "grad_norm": 0.04028880223631859, + "learning_rate": 8.429056068133428e-05, + "loss": 0.01130024939775467, + "step": 110720 + }, + { + "epoch": 15.717530163236338, + "grad_norm": 0.8011389970779419, + "learning_rate": 8.428914123491839e-05, + "loss": 0.0341774582862854, + "step": 110730 + }, + { + "epoch": 15.718949609652235, + "grad_norm": 0.1157180517911911, + "learning_rate": 8.428772178850248e-05, + "loss": 0.024665170907974245, + "step": 110740 + }, + { + "epoch": 15.720369056068133, + "grad_norm": 0.24222633242607117, + "learning_rate": 8.428630234208659e-05, + "loss": 
0.014342208206653596, + "step": 110750 + }, + { + "epoch": 15.721788502484031, + "grad_norm": 0.11375311762094498, + "learning_rate": 8.42848828956707e-05, + "loss": 0.025572729110717774, + "step": 110760 + }, + { + "epoch": 15.72320794889993, + "grad_norm": 0.2284621149301529, + "learning_rate": 8.42834634492548e-05, + "loss": 0.016583889722824097, + "step": 110770 + }, + { + "epoch": 15.724627395315826, + "grad_norm": 2.907301425933838, + "learning_rate": 8.42820440028389e-05, + "loss": 0.014733342826366425, + "step": 110780 + }, + { + "epoch": 15.726046841731725, + "grad_norm": 4.311647415161133, + "learning_rate": 8.4280624556423e-05, + "loss": 0.044314044713974, + "step": 110790 + }, + { + "epoch": 15.727466288147623, + "grad_norm": 1.3881518840789795, + "learning_rate": 8.42792051100071e-05, + "loss": 0.008146890997886657, + "step": 110800 + }, + { + "epoch": 15.72888573456352, + "grad_norm": 0.0465969555079937, + "learning_rate": 8.42777856635912e-05, + "loss": 0.016651517152786253, + "step": 110810 + }, + { + "epoch": 15.730305180979418, + "grad_norm": 0.1810646504163742, + "learning_rate": 8.427636621717531e-05, + "loss": 0.010874331742525101, + "step": 110820 + }, + { + "epoch": 15.731724627395316, + "grad_norm": 0.049988340586423874, + "learning_rate": 8.42749467707594e-05, + "loss": 0.0026690881699323655, + "step": 110830 + }, + { + "epoch": 15.733144073811214, + "grad_norm": 0.05745243653655052, + "learning_rate": 8.427352732434352e-05, + "loss": 0.007335717976093292, + "step": 110840 + }, + { + "epoch": 15.73456352022711, + "grad_norm": 0.04251491278409958, + "learning_rate": 8.427210787792762e-05, + "loss": 0.03565714657306671, + "step": 110850 + }, + { + "epoch": 15.735982966643009, + "grad_norm": 0.5023424625396729, + "learning_rate": 8.427068843151171e-05, + "loss": 0.0396359771490097, + "step": 110860 + }, + { + "epoch": 15.737402413058907, + "grad_norm": 0.19283650815486908, + "learning_rate": 8.426926898509582e-05, + "loss": 
0.0022236768156290053, + "step": 110870 + }, + { + "epoch": 15.738821859474804, + "grad_norm": 0.23359252512454987, + "learning_rate": 8.426784953867992e-05, + "loss": 0.030733969807624818, + "step": 110880 + }, + { + "epoch": 15.740241305890702, + "grad_norm": 0.6884561777114868, + "learning_rate": 8.426643009226403e-05, + "loss": 0.03513207733631134, + "step": 110890 + }, + { + "epoch": 15.7416607523066, + "grad_norm": 4.131224632263184, + "learning_rate": 8.426501064584812e-05, + "loss": 0.004008464515209198, + "step": 110900 + }, + { + "epoch": 15.743080198722499, + "grad_norm": 0.22556258738040924, + "learning_rate": 8.426359119943223e-05, + "loss": 0.003835783526301384, + "step": 110910 + }, + { + "epoch": 15.744499645138395, + "grad_norm": 2.7751762866973877, + "learning_rate": 8.426217175301632e-05, + "loss": 0.005967815220355987, + "step": 110920 + }, + { + "epoch": 15.745919091554294, + "grad_norm": 13.616484642028809, + "learning_rate": 8.426075230660043e-05, + "loss": 0.02839590609073639, + "step": 110930 + }, + { + "epoch": 15.747338537970192, + "grad_norm": 0.7434455752372742, + "learning_rate": 8.425933286018453e-05, + "loss": 0.011851108074188233, + "step": 110940 + }, + { + "epoch": 15.748757984386089, + "grad_norm": 0.3369200527667999, + "learning_rate": 8.425791341376863e-05, + "loss": 0.04642275869846344, + "step": 110950 + }, + { + "epoch": 15.750177430801987, + "grad_norm": 14.551253318786621, + "learning_rate": 8.425649396735274e-05, + "loss": 0.07257083654403687, + "step": 110960 + }, + { + "epoch": 15.751596877217885, + "grad_norm": 9.559237480163574, + "learning_rate": 8.425507452093684e-05, + "loss": 0.03088544011116028, + "step": 110970 + }, + { + "epoch": 15.753016323633783, + "grad_norm": 0.060590412467718124, + "learning_rate": 8.425365507452095e-05, + "loss": 0.09815815687179566, + "step": 110980 + }, + { + "epoch": 15.75443577004968, + "grad_norm": 9.443406105041504, + "learning_rate": 8.425223562810505e-05, + "loss": 
0.01634849011898041, + "step": 110990 + }, + { + "epoch": 15.755855216465578, + "grad_norm": 0.03370879590511322, + "learning_rate": 8.425081618168914e-05, + "loss": 0.040988501906394956, + "step": 111000 + }, + { + "epoch": 15.755855216465578, + "eval_accuracy": 0.9864564125389458, + "eval_loss": 0.05055028945207596, + "eval_runtime": 32.712, + "eval_samples_per_second": 480.772, + "eval_steps_per_second": 15.04, + "step": 111000 + }, + { + "epoch": 15.757274662881477, + "grad_norm": 0.006649512331932783, + "learning_rate": 8.424939673527324e-05, + "loss": 0.02285851240158081, + "step": 111010 + }, + { + "epoch": 15.758694109297373, + "grad_norm": 0.03384064882993698, + "learning_rate": 8.424797728885735e-05, + "loss": 0.01740650236606598, + "step": 111020 + }, + { + "epoch": 15.760113555713271, + "grad_norm": 6.754789352416992, + "learning_rate": 8.424655784244145e-05, + "loss": 0.02038225531578064, + "step": 111030 + }, + { + "epoch": 15.76153300212917, + "grad_norm": 0.211502805352211, + "learning_rate": 8.424513839602556e-05, + "loss": 0.027649855613708495, + "step": 111040 + }, + { + "epoch": 15.762952448545068, + "grad_norm": 0.04203588888049126, + "learning_rate": 8.424371894960966e-05, + "loss": 0.0319151371717453, + "step": 111050 + }, + { + "epoch": 15.764371894960965, + "grad_norm": 0.061523132026195526, + "learning_rate": 8.424229950319375e-05, + "loss": 0.007026679813861847, + "step": 111060 + }, + { + "epoch": 15.765791341376863, + "grad_norm": 1.0526913404464722, + "learning_rate": 8.424088005677787e-05, + "loss": 0.017617282271385194, + "step": 111070 + }, + { + "epoch": 15.767210787792761, + "grad_norm": 0.545382022857666, + "learning_rate": 8.423946061036196e-05, + "loss": 0.008751662820577622, + "step": 111080 + }, + { + "epoch": 15.768630234208658, + "grad_norm": 2.2254326343536377, + "learning_rate": 8.423804116394607e-05, + "loss": 0.004927302151918412, + "step": 111090 + }, + { + "epoch": 15.770049680624556, + "grad_norm": 
0.035762522369623184, + "learning_rate": 8.423662171753016e-05, + "loss": 0.015064448118209839, + "step": 111100 + }, + { + "epoch": 15.771469127040454, + "grad_norm": 0.004056834150105715, + "learning_rate": 8.423520227111427e-05, + "loss": 0.022363266348838805, + "step": 111110 + }, + { + "epoch": 15.772888573456353, + "grad_norm": 3.803673267364502, + "learning_rate": 8.423378282469837e-05, + "loss": 0.025861644744873048, + "step": 111120 + }, + { + "epoch": 15.77430801987225, + "grad_norm": 1.0381547212600708, + "learning_rate": 8.423236337828248e-05, + "loss": 0.03979891836643219, + "step": 111130 + }, + { + "epoch": 15.775727466288147, + "grad_norm": 2.0041587352752686, + "learning_rate": 8.423094393186657e-05, + "loss": 0.0236584410071373, + "step": 111140 + }, + { + "epoch": 15.777146912704046, + "grad_norm": 0.5491400361061096, + "learning_rate": 8.422952448545069e-05, + "loss": 0.06289567351341248, + "step": 111150 + }, + { + "epoch": 15.778566359119942, + "grad_norm": 0.2346036732196808, + "learning_rate": 8.422810503903478e-05, + "loss": 0.012413589656352997, + "step": 111160 + }, + { + "epoch": 15.77998580553584, + "grad_norm": 10.742231369018555, + "learning_rate": 8.422668559261888e-05, + "loss": 0.05024981498718262, + "step": 111170 + }, + { + "epoch": 15.781405251951739, + "grad_norm": 0.9036098718643188, + "learning_rate": 8.422526614620299e-05, + "loss": 0.006312942504882813, + "step": 111180 + }, + { + "epoch": 15.782824698367637, + "grad_norm": 0.039659857749938965, + "learning_rate": 8.422384669978709e-05, + "loss": 0.020886825025081636, + "step": 111190 + }, + { + "epoch": 15.784244144783534, + "grad_norm": 2.2897820472717285, + "learning_rate": 8.42224272533712e-05, + "loss": 0.040818363428115845, + "step": 111200 + }, + { + "epoch": 15.785663591199432, + "grad_norm": 0.9270223379135132, + "learning_rate": 8.422100780695528e-05, + "loss": 0.04686786830425262, + "step": 111210 + }, + { + "epoch": 15.78708303761533, + "grad_norm": 
6.989339828491211, + "learning_rate": 8.42195883605394e-05, + "loss": 0.016091875731945038, + "step": 111220 + }, + { + "epoch": 15.788502484031227, + "grad_norm": 8.478525161743164, + "learning_rate": 8.421816891412349e-05, + "loss": 0.005417653918266296, + "step": 111230 + }, + { + "epoch": 15.789921930447125, + "grad_norm": 0.11680683493614197, + "learning_rate": 8.42167494677076e-05, + "loss": 0.014682632684707642, + "step": 111240 + }, + { + "epoch": 15.791341376863024, + "grad_norm": 2.4658005237579346, + "learning_rate": 8.42153300212917e-05, + "loss": 0.01771555542945862, + "step": 111250 + }, + { + "epoch": 15.792760823278922, + "grad_norm": 7.7995195388793945, + "learning_rate": 8.42140525195174e-05, + "loss": 0.035976368188858035, + "step": 111260 + }, + { + "epoch": 15.794180269694818, + "grad_norm": 5.016642093658447, + "learning_rate": 8.42126330731015e-05, + "loss": 0.02658340334892273, + "step": 111270 + }, + { + "epoch": 15.795599716110717, + "grad_norm": 10.646329879760742, + "learning_rate": 8.421121362668559e-05, + "loss": 0.014648254215717315, + "step": 111280 + }, + { + "epoch": 15.797019162526615, + "grad_norm": 0.013211095705628395, + "learning_rate": 8.420979418026969e-05, + "loss": 0.002887158468365669, + "step": 111290 + }, + { + "epoch": 15.798438608942512, + "grad_norm": 0.05278421938419342, + "learning_rate": 8.42083747338538e-05, + "loss": 0.01711827516555786, + "step": 111300 + }, + { + "epoch": 15.79985805535841, + "grad_norm": 12.257071495056152, + "learning_rate": 8.42069552874379e-05, + "loss": 0.019732020795345306, + "step": 111310 + }, + { + "epoch": 15.801277501774308, + "grad_norm": 0.05365946516394615, + "learning_rate": 8.420553584102201e-05, + "loss": 0.03791945576667786, + "step": 111320 + }, + { + "epoch": 15.802696948190206, + "grad_norm": 0.23845407366752625, + "learning_rate": 8.420411639460611e-05, + "loss": 0.012302954494953156, + "step": 111330 + }, + { + "epoch": 15.804116394606103, + "grad_norm": 
3.534014940261841, + "learning_rate": 8.42026969481902e-05, + "loss": 0.024273604154586792, + "step": 111340 + }, + { + "epoch": 15.805535841022001, + "grad_norm": 0.020159900188446045, + "learning_rate": 8.420127750177432e-05, + "loss": 0.01508972942829132, + "step": 111350 + }, + { + "epoch": 15.8069552874379, + "grad_norm": 0.07559079676866531, + "learning_rate": 8.419985805535841e-05, + "loss": 0.030358341336250306, + "step": 111360 + }, + { + "epoch": 15.808374733853796, + "grad_norm": 0.23751536011695862, + "learning_rate": 8.419843860894252e-05, + "loss": 0.06312950253486634, + "step": 111370 + }, + { + "epoch": 15.809794180269694, + "grad_norm": 6.484631061553955, + "learning_rate": 8.419701916252661e-05, + "loss": 0.06035802960395813, + "step": 111380 + }, + { + "epoch": 15.811213626685593, + "grad_norm": 0.06401360034942627, + "learning_rate": 8.419559971611072e-05, + "loss": 0.014664597809314728, + "step": 111390 + }, + { + "epoch": 15.812633073101491, + "grad_norm": 4.50247859954834, + "learning_rate": 8.419418026969482e-05, + "loss": 0.017867615818977355, + "step": 111400 + }, + { + "epoch": 15.814052519517388, + "grad_norm": 4.960999488830566, + "learning_rate": 8.419276082327893e-05, + "loss": 0.04982729852199554, + "step": 111410 + }, + { + "epoch": 15.815471965933286, + "grad_norm": 0.12740544974803925, + "learning_rate": 8.419134137686304e-05, + "loss": 0.02763109505176544, + "step": 111420 + }, + { + "epoch": 15.816891412349184, + "grad_norm": 3.2318503856658936, + "learning_rate": 8.418992193044712e-05, + "loss": 0.08191608190536499, + "step": 111430 + }, + { + "epoch": 15.81831085876508, + "grad_norm": 0.348230242729187, + "learning_rate": 8.418850248403123e-05, + "loss": 0.012708775699138641, + "step": 111440 + }, + { + "epoch": 15.819730305180979, + "grad_norm": 2.7329788208007812, + "learning_rate": 8.418708303761533e-05, + "loss": 0.028577986359596252, + "step": 111450 + }, + { + "epoch": 15.821149751596877, + "grad_norm": 
9.365242004394531, + "learning_rate": 8.418566359119944e-05, + "loss": 0.030303579568862916, + "step": 111460 + }, + { + "epoch": 15.822569198012776, + "grad_norm": 0.8939119577407837, + "learning_rate": 8.418424414478354e-05, + "loss": 0.014215029776096344, + "step": 111470 + }, + { + "epoch": 15.823988644428672, + "grad_norm": 9.930445671081543, + "learning_rate": 8.418282469836765e-05, + "loss": 0.032778263092041016, + "step": 111480 + }, + { + "epoch": 15.82540809084457, + "grad_norm": 6.8321757316589355, + "learning_rate": 8.418140525195173e-05, + "loss": 0.012145863473415374, + "step": 111490 + }, + { + "epoch": 15.826827537260469, + "grad_norm": 0.24095939099788666, + "learning_rate": 8.417998580553584e-05, + "loss": 0.03848366439342499, + "step": 111500 + }, + { + "epoch": 15.826827537260469, + "eval_accuracy": 0.9883639600686717, + "eval_loss": 0.03906998410820961, + "eval_runtime": 32.8578, + "eval_samples_per_second": 478.639, + "eval_steps_per_second": 14.974, + "step": 111500 + }, + { + "epoch": 15.828246983676365, + "grad_norm": 0.6555007100105286, + "learning_rate": 8.417856635911995e-05, + "loss": 0.012145914137363434, + "step": 111510 + }, + { + "epoch": 15.829666430092264, + "grad_norm": 0.16343164443969727, + "learning_rate": 8.417714691270405e-05, + "loss": 0.02596222162246704, + "step": 111520 + }, + { + "epoch": 15.831085876508162, + "grad_norm": 0.07217422127723694, + "learning_rate": 8.417572746628816e-05, + "loss": 0.05710041522979736, + "step": 111530 + }, + { + "epoch": 15.83250532292406, + "grad_norm": 0.29869720339775085, + "learning_rate": 8.417430801987225e-05, + "loss": 0.025982874631881713, + "step": 111540 + }, + { + "epoch": 15.833924769339957, + "grad_norm": 6.981906890869141, + "learning_rate": 8.417288857345636e-05, + "loss": 0.014904718101024627, + "step": 111550 + }, + { + "epoch": 15.835344215755855, + "grad_norm": 1.1227699518203735, + "learning_rate": 8.417146912704046e-05, + "loss": 0.033892059326171876, + "step": 111560 
+ }, + { + "epoch": 15.836763662171753, + "grad_norm": 6.107216835021973, + "learning_rate": 8.417004968062457e-05, + "loss": 0.007829079031944275, + "step": 111570 + }, + { + "epoch": 15.83818310858765, + "grad_norm": 10.42241382598877, + "learning_rate": 8.416863023420866e-05, + "loss": 0.02919740676879883, + "step": 111580 + }, + { + "epoch": 15.839602555003548, + "grad_norm": 0.1118733212351799, + "learning_rate": 8.416721078779276e-05, + "loss": 0.009149536490440369, + "step": 111590 + }, + { + "epoch": 15.841022001419446, + "grad_norm": 1.1477084159851074, + "learning_rate": 8.416579134137687e-05, + "loss": 0.010149497538805008, + "step": 111600 + }, + { + "epoch": 15.842441447835345, + "grad_norm": 0.6257818341255188, + "learning_rate": 8.416437189496097e-05, + "loss": 0.01542675495147705, + "step": 111610 + }, + { + "epoch": 15.843860894251241, + "grad_norm": 0.05429164692759514, + "learning_rate": 8.416295244854508e-05, + "loss": 0.01368640512228012, + "step": 111620 + }, + { + "epoch": 15.84528034066714, + "grad_norm": 9.229668617248535, + "learning_rate": 8.416153300212918e-05, + "loss": 0.0817399799823761, + "step": 111630 + }, + { + "epoch": 15.846699787083038, + "grad_norm": 0.07244868576526642, + "learning_rate": 8.416011355571327e-05, + "loss": 0.05533000230789185, + "step": 111640 + }, + { + "epoch": 15.848119233498934, + "grad_norm": 0.11954519152641296, + "learning_rate": 8.415869410929737e-05, + "loss": 0.019126889109611512, + "step": 111650 + }, + { + "epoch": 15.849538679914833, + "grad_norm": 0.40038758516311646, + "learning_rate": 8.415727466288148e-05, + "loss": 0.004278429970145226, + "step": 111660 + }, + { + "epoch": 15.850958126330731, + "grad_norm": 0.4650089740753174, + "learning_rate": 8.415585521646558e-05, + "loss": 0.009340885281562804, + "step": 111670 + }, + { + "epoch": 15.85237757274663, + "grad_norm": 0.37171149253845215, + "learning_rate": 8.415443577004969e-05, + "loss": 0.015973356366157532, + "step": 111680 + }, + { + 
"epoch": 15.853797019162526, + "grad_norm": 0.15554748475551605, + "learning_rate": 8.415301632363379e-05, + "loss": 0.05459659695625305, + "step": 111690 + }, + { + "epoch": 15.855216465578424, + "grad_norm": 0.66119784116745, + "learning_rate": 8.415159687721789e-05, + "loss": 0.012813696265220642, + "step": 111700 + }, + { + "epoch": 15.856635911994323, + "grad_norm": 0.025645015761256218, + "learning_rate": 8.4150177430802e-05, + "loss": 0.008818748593330383, + "step": 111710 + }, + { + "epoch": 15.858055358410219, + "grad_norm": 3.9062538146972656, + "learning_rate": 8.41487579843861e-05, + "loss": 0.04479396939277649, + "step": 111720 + }, + { + "epoch": 15.859474804826117, + "grad_norm": 0.01201682724058628, + "learning_rate": 8.41473385379702e-05, + "loss": 0.012727364897727966, + "step": 111730 + }, + { + "epoch": 15.860894251242016, + "grad_norm": 0.7928599119186401, + "learning_rate": 8.414591909155429e-05, + "loss": 0.020368021726608277, + "step": 111740 + }, + { + "epoch": 15.862313697657914, + "grad_norm": 10.790556907653809, + "learning_rate": 8.41444996451384e-05, + "loss": 0.022309675812721252, + "step": 111750 + }, + { + "epoch": 15.86373314407381, + "grad_norm": 3.925905466079712, + "learning_rate": 8.41430801987225e-05, + "loss": 0.03202139735221863, + "step": 111760 + }, + { + "epoch": 15.865152590489709, + "grad_norm": 0.42782142758369446, + "learning_rate": 8.414166075230661e-05, + "loss": 0.014055775105953216, + "step": 111770 + }, + { + "epoch": 15.866572036905607, + "grad_norm": 1.0484387874603271, + "learning_rate": 8.41402413058907e-05, + "loss": 0.01409704089164734, + "step": 111780 + }, + { + "epoch": 15.867991483321505, + "grad_norm": 0.23534570634365082, + "learning_rate": 8.41388218594748e-05, + "loss": 0.07245479226112365, + "step": 111790 + }, + { + "epoch": 15.869410929737402, + "grad_norm": 8.457581520080566, + "learning_rate": 8.413740241305891e-05, + "loss": 0.05491371154785156, + "step": 111800 + }, + { + "epoch": 
15.8708303761533, + "grad_norm": 0.24857249855995178, + "learning_rate": 8.413598296664301e-05, + "loss": 0.03584108054637909, + "step": 111810 + }, + { + "epoch": 15.872249822569199, + "grad_norm": 0.17013956606388092, + "learning_rate": 8.413456352022712e-05, + "loss": 0.012269440293312072, + "step": 111820 + }, + { + "epoch": 15.873669268985095, + "grad_norm": 1.8806430101394653, + "learning_rate": 8.413314407381122e-05, + "loss": 0.03550401926040649, + "step": 111830 + }, + { + "epoch": 15.875088715400993, + "grad_norm": 0.308080792427063, + "learning_rate": 8.413172462739533e-05, + "loss": 0.04612856507301331, + "step": 111840 + }, + { + "epoch": 15.876508161816892, + "grad_norm": 0.014880211092531681, + "learning_rate": 8.413030518097941e-05, + "loss": 0.008867764472961425, + "step": 111850 + }, + { + "epoch": 15.87792760823279, + "grad_norm": 0.8367263674736023, + "learning_rate": 8.412888573456353e-05, + "loss": 0.010482820868492126, + "step": 111860 + }, + { + "epoch": 15.879347054648687, + "grad_norm": 0.00848662480711937, + "learning_rate": 8.412746628814762e-05, + "loss": 0.013878077268600464, + "step": 111870 + }, + { + "epoch": 15.880766501064585, + "grad_norm": 6.243743896484375, + "learning_rate": 8.412604684173173e-05, + "loss": 0.019290319085121153, + "step": 111880 + }, + { + "epoch": 15.882185947480483, + "grad_norm": 0.19164791703224182, + "learning_rate": 8.412462739531583e-05, + "loss": 0.07253921627998353, + "step": 111890 + }, + { + "epoch": 15.88360539389638, + "grad_norm": 0.2494039535522461, + "learning_rate": 8.412320794889993e-05, + "loss": 0.036090517044067384, + "step": 111900 + }, + { + "epoch": 15.885024840312278, + "grad_norm": 4.53750467300415, + "learning_rate": 8.412178850248404e-05, + "loss": 0.008243809640407562, + "step": 111910 + }, + { + "epoch": 15.886444286728176, + "grad_norm": 0.1850540190935135, + "learning_rate": 8.412036905606814e-05, + "loss": 0.029005610942840578, + "step": 111920 + }, + { + "epoch": 
15.887863733144075, + "grad_norm": 0.18927457928657532, + "learning_rate": 8.411894960965225e-05, + "loss": 0.03487345576286316, + "step": 111930 + }, + { + "epoch": 15.889283179559971, + "grad_norm": 0.005943607538938522, + "learning_rate": 8.411753016323635e-05, + "loss": 0.024997258186340333, + "step": 111940 + }, + { + "epoch": 15.89070262597587, + "grad_norm": 5.918054580688477, + "learning_rate": 8.411611071682044e-05, + "loss": 0.008443067967891692, + "step": 111950 + }, + { + "epoch": 15.892122072391768, + "grad_norm": 0.6412557363510132, + "learning_rate": 8.411469127040454e-05, + "loss": 0.0039185058325529095, + "step": 111960 + }, + { + "epoch": 15.893541518807664, + "grad_norm": 4.264925003051758, + "learning_rate": 8.411327182398865e-05, + "loss": 0.035122907161712645, + "step": 111970 + }, + { + "epoch": 15.894960965223563, + "grad_norm": 0.6918242573738098, + "learning_rate": 8.411185237757275e-05, + "loss": 0.02806202173233032, + "step": 111980 + }, + { + "epoch": 15.896380411639461, + "grad_norm": 0.370086133480072, + "learning_rate": 8.411043293115686e-05, + "loss": 0.013014155626296996, + "step": 111990 + }, + { + "epoch": 15.89779985805536, + "grad_norm": 0.08562023937702179, + "learning_rate": 8.410901348474096e-05, + "loss": 0.017157554626464844, + "step": 112000 + }, + { + "epoch": 15.89779985805536, + "eval_accuracy": 0.9874737712214663, + "eval_loss": 0.042419932782649994, + "eval_runtime": 32.0075, + "eval_samples_per_second": 491.354, + "eval_steps_per_second": 15.371, + "step": 112000 + }, + { + "epoch": 15.899219304471256, + "grad_norm": 0.32334455847740173, + "learning_rate": 8.410759403832505e-05, + "loss": 0.022023583948612212, + "step": 112010 + }, + { + "epoch": 15.900638750887154, + "grad_norm": 0.8937520980834961, + "learning_rate": 8.410617459190916e-05, + "loss": 0.033554989099502566, + "step": 112020 + }, + { + "epoch": 15.902058197303052, + "grad_norm": 4.915921688079834, + "learning_rate": 8.410475514549326e-05, + "loss": 
0.022143127024173738, + "step": 112030 + }, + { + "epoch": 15.903477643718949, + "grad_norm": 8.751575469970703, + "learning_rate": 8.410333569907737e-05, + "loss": 0.03376420438289642, + "step": 112040 + }, + { + "epoch": 15.904897090134847, + "grad_norm": 0.06762025505304337, + "learning_rate": 8.410191625266146e-05, + "loss": 0.011050128936767578, + "step": 112050 + }, + { + "epoch": 15.906316536550746, + "grad_norm": 0.14220504462718964, + "learning_rate": 8.410049680624557e-05, + "loss": 0.01688355803489685, + "step": 112060 + }, + { + "epoch": 15.907735982966644, + "grad_norm": 0.10968053340911865, + "learning_rate": 8.409907735982967e-05, + "loss": 0.052211225032806396, + "step": 112070 + }, + { + "epoch": 15.90915542938254, + "grad_norm": 4.428947925567627, + "learning_rate": 8.409765791341378e-05, + "loss": 0.014014440774917602, + "step": 112080 + }, + { + "epoch": 15.910574875798439, + "grad_norm": 0.0376085601747036, + "learning_rate": 8.409623846699787e-05, + "loss": 0.003753634914755821, + "step": 112090 + }, + { + "epoch": 15.911994322214337, + "grad_norm": 19.084617614746094, + "learning_rate": 8.409481902058197e-05, + "loss": 0.03427457511425018, + "step": 112100 + }, + { + "epoch": 15.913413768630233, + "grad_norm": 0.031321216374635696, + "learning_rate": 8.409339957416608e-05, + "loss": 0.04106015264987946, + "step": 112110 + }, + { + "epoch": 15.914833215046132, + "grad_norm": 0.0813651755452156, + "learning_rate": 8.409198012775018e-05, + "loss": 0.013805033266544342, + "step": 112120 + }, + { + "epoch": 15.91625266146203, + "grad_norm": 6.500750541687012, + "learning_rate": 8.409056068133429e-05, + "loss": 0.04296710193157196, + "step": 112130 + }, + { + "epoch": 15.917672107877928, + "grad_norm": 0.1737774908542633, + "learning_rate": 8.408914123491839e-05, + "loss": 0.027148693799972534, + "step": 112140 + }, + { + "epoch": 15.919091554293825, + "grad_norm": 0.02817084826529026, + "learning_rate": 8.408772178850248e-05, + "loss": 
0.041319137811660765, + "step": 112150 + }, + { + "epoch": 15.920511000709723, + "grad_norm": 0.17972466349601746, + "learning_rate": 8.408630234208658e-05, + "loss": 0.01709538549184799, + "step": 112160 + }, + { + "epoch": 15.921930447125622, + "grad_norm": 0.6958065032958984, + "learning_rate": 8.408488289567069e-05, + "loss": 0.025096040964126588, + "step": 112170 + }, + { + "epoch": 15.923349893541518, + "grad_norm": 4.633899211883545, + "learning_rate": 8.408346344925479e-05, + "loss": 0.03428847789764404, + "step": 112180 + }, + { + "epoch": 15.924769339957416, + "grad_norm": 0.2608836889266968, + "learning_rate": 8.40820440028389e-05, + "loss": 0.024020206928253175, + "step": 112190 + }, + { + "epoch": 15.926188786373315, + "grad_norm": 1.9271941184997559, + "learning_rate": 8.4080624556423e-05, + "loss": 0.014135521650314332, + "step": 112200 + }, + { + "epoch": 15.927608232789213, + "grad_norm": 1.365207552909851, + "learning_rate": 8.40792051100071e-05, + "loss": 0.01664539873600006, + "step": 112210 + }, + { + "epoch": 15.92902767920511, + "grad_norm": 0.03752214461565018, + "learning_rate": 8.407778566359121e-05, + "loss": 0.019473978877067567, + "step": 112220 + }, + { + "epoch": 15.930447125621008, + "grad_norm": 0.5360216498374939, + "learning_rate": 8.40763662171753e-05, + "loss": 0.042499488592147826, + "step": 112230 + }, + { + "epoch": 15.931866572036906, + "grad_norm": 0.14745332300662994, + "learning_rate": 8.407494677075942e-05, + "loss": 0.029511284828186036, + "step": 112240 + }, + { + "epoch": 15.933286018452803, + "grad_norm": 0.8141366243362427, + "learning_rate": 8.407352732434351e-05, + "loss": 0.017665939033031465, + "step": 112250 + }, + { + "epoch": 15.934705464868701, + "grad_norm": 0.15989312529563904, + "learning_rate": 8.407210787792761e-05, + "loss": 0.03332527875900269, + "step": 112260 + }, + { + "epoch": 15.9361249112846, + "grad_norm": 6.702155113220215, + "learning_rate": 8.407068843151171e-05, + "loss": 
0.025216048955917357, + "step": 112270 + }, + { + "epoch": 15.937544357700498, + "grad_norm": 4.99879264831543, + "learning_rate": 8.406926898509582e-05, + "loss": 0.010565738379955291, + "step": 112280 + }, + { + "epoch": 15.938963804116394, + "grad_norm": 1.4353564977645874, + "learning_rate": 8.406784953867992e-05, + "loss": 0.024430674314498902, + "step": 112290 + }, + { + "epoch": 15.940383250532292, + "grad_norm": 0.6167616248130798, + "learning_rate": 8.406643009226403e-05, + "loss": 0.056137692928314206, + "step": 112300 + }, + { + "epoch": 15.94180269694819, + "grad_norm": 1.0208706855773926, + "learning_rate": 8.406501064584812e-05, + "loss": 0.01831849366426468, + "step": 112310 + }, + { + "epoch": 15.943222143364087, + "grad_norm": 0.07933774590492249, + "learning_rate": 8.406359119943222e-05, + "loss": 0.04364119172096252, + "step": 112320 + }, + { + "epoch": 15.944641589779986, + "grad_norm": 0.0641828402876854, + "learning_rate": 8.406217175301633e-05, + "loss": 0.009057014435529708, + "step": 112330 + }, + { + "epoch": 15.946061036195884, + "grad_norm": 6.091492652893066, + "learning_rate": 8.406075230660043e-05, + "loss": 0.011662401258945465, + "step": 112340 + }, + { + "epoch": 15.947480482611782, + "grad_norm": 0.13847945630550385, + "learning_rate": 8.405933286018454e-05, + "loss": 0.02798805832862854, + "step": 112350 + }, + { + "epoch": 15.948899929027679, + "grad_norm": 0.2876303195953369, + "learning_rate": 8.405791341376862e-05, + "loss": 0.008951310813426972, + "step": 112360 + }, + { + "epoch": 15.950319375443577, + "grad_norm": 0.3807947039604187, + "learning_rate": 8.405649396735274e-05, + "loss": 0.018087761104106904, + "step": 112370 + }, + { + "epoch": 15.951738821859475, + "grad_norm": 0.19306473433971405, + "learning_rate": 8.405507452093683e-05, + "loss": 0.025654715299606324, + "step": 112380 + }, + { + "epoch": 15.953158268275372, + "grad_norm": 0.596372127532959, + "learning_rate": 8.405365507452094e-05, + "loss": 
0.03283799290657043, + "step": 112390 + }, + { + "epoch": 15.95457771469127, + "grad_norm": 10.068487167358398, + "learning_rate": 8.405223562810504e-05, + "loss": 0.010049515962600708, + "step": 112400 + }, + { + "epoch": 15.955997161107168, + "grad_norm": 6.676372051239014, + "learning_rate": 8.405081618168914e-05, + "loss": 0.032247743010520934, + "step": 112410 + }, + { + "epoch": 15.957416607523067, + "grad_norm": 0.18787598609924316, + "learning_rate": 8.404939673527325e-05, + "loss": 0.027509596943855286, + "step": 112420 + }, + { + "epoch": 15.958836053938963, + "grad_norm": 0.0962926521897316, + "learning_rate": 8.404797728885735e-05, + "loss": 0.03231292963027954, + "step": 112430 + }, + { + "epoch": 15.960255500354862, + "grad_norm": 1.5756398439407349, + "learning_rate": 8.404655784244146e-05, + "loss": 0.021877171099185945, + "step": 112440 + }, + { + "epoch": 15.96167494677076, + "grad_norm": 3.9619812965393066, + "learning_rate": 8.404513839602556e-05, + "loss": 0.0720575749874115, + "step": 112450 + }, + { + "epoch": 15.963094393186656, + "grad_norm": 1.577416181564331, + "learning_rate": 8.404371894960965e-05, + "loss": 0.02191317081451416, + "step": 112460 + }, + { + "epoch": 15.964513839602555, + "grad_norm": 0.6079181432723999, + "learning_rate": 8.404229950319375e-05, + "loss": 0.007672099024057388, + "step": 112470 + }, + { + "epoch": 15.965933286018453, + "grad_norm": 0.038492441177368164, + "learning_rate": 8.404088005677786e-05, + "loss": 0.03949972987174988, + "step": 112480 + }, + { + "epoch": 15.967352732434351, + "grad_norm": 1.8505480289459229, + "learning_rate": 8.403946061036196e-05, + "loss": 0.01702859401702881, + "step": 112490 + }, + { + "epoch": 15.968772178850248, + "grad_norm": 0.02366023324429989, + "learning_rate": 8.403804116394607e-05, + "loss": 0.014248864352703094, + "step": 112500 + }, + { + "epoch": 15.968772178850248, + "eval_accuracy": 0.9862020728683156, + "eval_loss": 0.048695117235183716, + "eval_runtime": 
33.2219, + "eval_samples_per_second": 473.392, + "eval_steps_per_second": 14.809, + "step": 112500 + }, + { + "epoch": 15.970191625266146, + "grad_norm": 0.09983468055725098, + "learning_rate": 8.403662171753017e-05, + "loss": 0.041495496034622194, + "step": 112510 + }, + { + "epoch": 15.971611071682045, + "grad_norm": 0.7310523390769958, + "learning_rate": 8.403520227111426e-05, + "loss": 0.004337532818317414, + "step": 112520 + }, + { + "epoch": 15.973030518097941, + "grad_norm": 0.7653382420539856, + "learning_rate": 8.403378282469837e-05, + "loss": 0.007619468867778778, + "step": 112530 + }, + { + "epoch": 15.97444996451384, + "grad_norm": 1.2560532093048096, + "learning_rate": 8.403236337828247e-05, + "loss": 0.00598950944840908, + "step": 112540 + }, + { + "epoch": 15.975869410929738, + "grad_norm": 0.12405021488666534, + "learning_rate": 8.403094393186658e-05, + "loss": 0.0030649252235889434, + "step": 112550 + }, + { + "epoch": 15.977288857345636, + "grad_norm": 0.9109636545181274, + "learning_rate": 8.402952448545068e-05, + "loss": 0.05473610758781433, + "step": 112560 + }, + { + "epoch": 15.978708303761533, + "grad_norm": 6.435018539428711, + "learning_rate": 8.402810503903478e-05, + "loss": 0.013474112749099732, + "step": 112570 + }, + { + "epoch": 15.98012775017743, + "grad_norm": 0.34421679377555847, + "learning_rate": 8.402668559261888e-05, + "loss": 0.05626802444458008, + "step": 112580 + }, + { + "epoch": 15.98154719659333, + "grad_norm": 5.662394046783447, + "learning_rate": 8.402526614620299e-05, + "loss": 0.03009917736053467, + "step": 112590 + }, + { + "epoch": 15.982966643009226, + "grad_norm": 0.11366448551416397, + "learning_rate": 8.402384669978708e-05, + "loss": 0.02108916491270065, + "step": 112600 + }, + { + "epoch": 15.984386089425124, + "grad_norm": 4.0829057693481445, + "learning_rate": 8.40224272533712e-05, + "loss": 0.006870292872190475, + "step": 112610 + }, + { + "epoch": 15.985805535841022, + "grad_norm": 8.846285820007324, + 
"learning_rate": 8.402100780695529e-05, + "loss": 0.028130248188972473, + "step": 112620 + }, + { + "epoch": 15.98722498225692, + "grad_norm": 0.16223247349262238, + "learning_rate": 8.401958836053939e-05, + "loss": 0.01694146990776062, + "step": 112630 + }, + { + "epoch": 15.988644428672817, + "grad_norm": 0.1122329905629158, + "learning_rate": 8.40181689141235e-05, + "loss": 0.042585551738739014, + "step": 112640 + }, + { + "epoch": 15.990063875088715, + "grad_norm": 7.043773651123047, + "learning_rate": 8.40167494677076e-05, + "loss": 0.027266567945480345, + "step": 112650 + }, + { + "epoch": 15.991483321504614, + "grad_norm": 3.80629301071167, + "learning_rate": 8.401533002129171e-05, + "loss": 0.017532944679260254, + "step": 112660 + }, + { + "epoch": 15.99290276792051, + "grad_norm": 10.180436134338379, + "learning_rate": 8.401391057487579e-05, + "loss": 0.04507697224617004, + "step": 112670 + }, + { + "epoch": 15.994322214336409, + "grad_norm": 1.181219458580017, + "learning_rate": 8.40124911284599e-05, + "loss": 0.002518647536635399, + "step": 112680 + }, + { + "epoch": 15.995741660752307, + "grad_norm": 2.4488043785095215, + "learning_rate": 8.4011071682044e-05, + "loss": 0.020063599944114684, + "step": 112690 + }, + { + "epoch": 15.997161107168205, + "grad_norm": 0.6078232526779175, + "learning_rate": 8.400965223562811e-05, + "loss": 0.00689728707075119, + "step": 112700 + }, + { + "epoch": 15.998580553584102, + "grad_norm": 0.18818716704845428, + "learning_rate": 8.400823278921221e-05, + "loss": 0.0035076819360256193, + "step": 112710 + }, + { + "epoch": 16.0, + "grad_norm": 3.3798446655273438, + "learning_rate": 8.40068133427963e-05, + "loss": 0.006935934722423554, + "step": 112720 + }, + { + "epoch": 16.0014194464159, + "grad_norm": 1.8569788932800293, + "learning_rate": 8.400539389638042e-05, + "loss": 0.014986465871334075, + "step": 112730 + }, + { + "epoch": 16.002838892831797, + "grad_norm": 0.38919079303741455, + "learning_rate": 
8.400397444996451e-05, + "loss": 0.018700113892555235, + "step": 112740 + }, + { + "epoch": 16.004258339247695, + "grad_norm": 0.20278839766979218, + "learning_rate": 8.400255500354863e-05, + "loss": 0.009249137341976165, + "step": 112750 + }, + { + "epoch": 16.00567778566359, + "grad_norm": 1.0998165607452393, + "learning_rate": 8.400113555713272e-05, + "loss": 0.02100822627544403, + "step": 112760 + }, + { + "epoch": 16.007097232079488, + "grad_norm": 0.06329260021448135, + "learning_rate": 8.399971611071682e-05, + "loss": 0.01535399556159973, + "step": 112770 + }, + { + "epoch": 16.008516678495386, + "grad_norm": 4.446816921234131, + "learning_rate": 8.399829666430092e-05, + "loss": 0.023000138998031616, + "step": 112780 + }, + { + "epoch": 16.009936124911285, + "grad_norm": 0.7698379755020142, + "learning_rate": 8.399687721788503e-05, + "loss": 0.00757470577955246, + "step": 112790 + }, + { + "epoch": 16.011355571327183, + "grad_norm": 1.464584231376648, + "learning_rate": 8.399545777146913e-05, + "loss": 0.003870783746242523, + "step": 112800 + }, + { + "epoch": 16.01277501774308, + "grad_norm": 4.501135349273682, + "learning_rate": 8.399403832505324e-05, + "loss": 0.0050099663436412815, + "step": 112810 + }, + { + "epoch": 16.01419446415898, + "grad_norm": 0.6903401613235474, + "learning_rate": 8.399261887863733e-05, + "loss": 0.011681679636240005, + "step": 112820 + }, + { + "epoch": 16.015613910574874, + "grad_norm": 0.8593311309814453, + "learning_rate": 8.399119943222143e-05, + "loss": 0.033016365766525266, + "step": 112830 + }, + { + "epoch": 16.017033356990773, + "grad_norm": 0.8904664516448975, + "learning_rate": 8.398977998580554e-05, + "loss": 0.008453948795795441, + "step": 112840 + }, + { + "epoch": 16.01845280340667, + "grad_norm": 0.12439213693141937, + "learning_rate": 8.398836053938964e-05, + "loss": 0.014540690183639526, + "step": 112850 + }, + { + "epoch": 16.01987224982257, + "grad_norm": 0.12126282602548599, + "learning_rate": 
8.398694109297375e-05, + "loss": 0.016869255900382997, + "step": 112860 + }, + { + "epoch": 16.021291696238467, + "grad_norm": 2.421494245529175, + "learning_rate": 8.398552164655783e-05, + "loss": 0.03948677182197571, + "step": 112870 + }, + { + "epoch": 16.022711142654366, + "grad_norm": 0.043827567249536514, + "learning_rate": 8.398410220014195e-05, + "loss": 0.00926891267299652, + "step": 112880 + }, + { + "epoch": 16.024130589070264, + "grad_norm": 0.08589991182088852, + "learning_rate": 8.398268275372604e-05, + "loss": 0.007034893333911896, + "step": 112890 + }, + { + "epoch": 16.02555003548616, + "grad_norm": 0.07396560162305832, + "learning_rate": 8.398126330731015e-05, + "loss": 0.005567807704210281, + "step": 112900 + }, + { + "epoch": 16.026969481902057, + "grad_norm": 0.05021880194544792, + "learning_rate": 8.397984386089426e-05, + "loss": 0.011187079548835754, + "step": 112910 + }, + { + "epoch": 16.028388928317955, + "grad_norm": 2.3082640171051025, + "learning_rate": 8.397842441447836e-05, + "loss": 0.002422238886356354, + "step": 112920 + }, + { + "epoch": 16.029808374733854, + "grad_norm": 0.029654445126652718, + "learning_rate": 8.397700496806246e-05, + "loss": 0.007260937988758087, + "step": 112930 + }, + { + "epoch": 16.031227821149752, + "grad_norm": 4.165407180786133, + "learning_rate": 8.397558552164656e-05, + "loss": 0.0049143187701702114, + "step": 112940 + }, + { + "epoch": 16.03264726756565, + "grad_norm": 0.09881002455949783, + "learning_rate": 8.397416607523067e-05, + "loss": 0.002365070208907127, + "step": 112950 + }, + { + "epoch": 16.03406671398155, + "grad_norm": 0.1122019812464714, + "learning_rate": 8.397274662881477e-05, + "loss": 0.002952999994158745, + "step": 112960 + }, + { + "epoch": 16.035486160397443, + "grad_norm": 0.09588045626878738, + "learning_rate": 8.397132718239888e-05, + "loss": 0.033624234795570376, + "step": 112970 + }, + { + "epoch": 16.03690560681334, + "grad_norm": 0.013734962791204453, + "learning_rate": 
8.396990773598296e-05, + "loss": 0.032681146264076234, + "step": 112980 + }, + { + "epoch": 16.03832505322924, + "grad_norm": 0.21410711109638214, + "learning_rate": 8.396848828956707e-05, + "loss": 0.006711065024137497, + "step": 112990 + }, + { + "epoch": 16.03974449964514, + "grad_norm": 13.848963737487793, + "learning_rate": 8.396706884315118e-05, + "loss": 0.049816185235977174, + "step": 113000 + }, + { + "epoch": 16.03974449964514, + "eval_accuracy": 0.988872639409932, + "eval_loss": 0.03962714597582817, + "eval_runtime": 32.0224, + "eval_samples_per_second": 491.124, + "eval_steps_per_second": 15.364, + "step": 113000 + }, + { + "epoch": 16.041163946061037, + "grad_norm": 0.010466187261044979, + "learning_rate": 8.396564939673528e-05, + "loss": 0.010902185738086701, + "step": 113010 + }, + { + "epoch": 16.042583392476935, + "grad_norm": 5.942933082580566, + "learning_rate": 8.396422995031939e-05, + "loss": 0.014651863276958466, + "step": 113020 + }, + { + "epoch": 16.044002838892833, + "grad_norm": 0.2957586348056793, + "learning_rate": 8.396281050390347e-05, + "loss": 0.04585053324699402, + "step": 113030 + }, + { + "epoch": 16.045422285308728, + "grad_norm": 11.693886756896973, + "learning_rate": 8.396139105748758e-05, + "loss": 0.023407797515392303, + "step": 113040 + }, + { + "epoch": 16.046841731724626, + "grad_norm": 11.953022956848145, + "learning_rate": 8.395997161107168e-05, + "loss": 0.029421928524971008, + "step": 113050 + }, + { + "epoch": 16.048261178140525, + "grad_norm": 1.568056344985962, + "learning_rate": 8.395855216465579e-05, + "loss": 0.0045596893876791, + "step": 113060 + }, + { + "epoch": 16.049680624556423, + "grad_norm": 1.8041973114013672, + "learning_rate": 8.395713271823989e-05, + "loss": 0.02693631947040558, + "step": 113070 + }, + { + "epoch": 16.05110007097232, + "grad_norm": 2.983215808868408, + "learning_rate": 8.395571327182399e-05, + "loss": 0.006398119032382965, + "step": 113080 + }, + { + "epoch": 16.05251951738822, + 
"grad_norm": 0.19344577193260193, + "learning_rate": 8.39542938254081e-05, + "loss": 0.014838898181915283, + "step": 113090 + }, + { + "epoch": 16.053938963804118, + "grad_norm": 12.71757984161377, + "learning_rate": 8.39528743789922e-05, + "loss": 0.039472135901451114, + "step": 113100 + }, + { + "epoch": 16.055358410220013, + "grad_norm": 0.5390371680259705, + "learning_rate": 8.395145493257631e-05, + "loss": 0.01392301321029663, + "step": 113110 + }, + { + "epoch": 16.05677785663591, + "grad_norm": 0.3036908805370331, + "learning_rate": 8.39500354861604e-05, + "loss": 0.0026406005024909975, + "step": 113120 + }, + { + "epoch": 16.05819730305181, + "grad_norm": 0.07947884500026703, + "learning_rate": 8.39486160397445e-05, + "loss": 0.008793811500072479, + "step": 113130 + }, + { + "epoch": 16.059616749467708, + "grad_norm": 0.04886677861213684, + "learning_rate": 8.39471965933286e-05, + "loss": 0.005639223754405976, + "step": 113140 + }, + { + "epoch": 16.061036195883606, + "grad_norm": 5.401880741119385, + "learning_rate": 8.394577714691271e-05, + "loss": 0.05478711128234863, + "step": 113150 + }, + { + "epoch": 16.062455642299504, + "grad_norm": 0.4508975148200989, + "learning_rate": 8.394435770049681e-05, + "loss": 0.026101893186569212, + "step": 113160 + }, + { + "epoch": 16.063875088715402, + "grad_norm": 0.22109004855155945, + "learning_rate": 8.394293825408092e-05, + "loss": 0.0165793314576149, + "step": 113170 + }, + { + "epoch": 16.065294535131297, + "grad_norm": 0.09848267585039139, + "learning_rate": 8.394151880766502e-05, + "loss": 0.03446832001209259, + "step": 113180 + }, + { + "epoch": 16.066713981547196, + "grad_norm": 0.04517025500535965, + "learning_rate": 8.394009936124911e-05, + "loss": 0.018778929114341737, + "step": 113190 + }, + { + "epoch": 16.068133427963094, + "grad_norm": 15.434242248535156, + "learning_rate": 8.393867991483322e-05, + "loss": 0.020848213136196135, + "step": 113200 + }, + { + "epoch": 16.069552874378992, + "grad_norm": 
0.043510545045137405, + "learning_rate": 8.393726046841732e-05, + "loss": 0.023297858238220216, + "step": 113210 + }, + { + "epoch": 16.07097232079489, + "grad_norm": 15.332098007202148, + "learning_rate": 8.393584102200143e-05, + "loss": 0.03652581572532654, + "step": 113220 + }, + { + "epoch": 16.07239176721079, + "grad_norm": 8.775132179260254, + "learning_rate": 8.393442157558553e-05, + "loss": 0.07113283276557922, + "step": 113230 + }, + { + "epoch": 16.073811213626687, + "grad_norm": 0.041509199887514114, + "learning_rate": 8.393300212916963e-05, + "loss": 0.014686094224452972, + "step": 113240 + }, + { + "epoch": 16.075230660042582, + "grad_norm": 0.4447268843650818, + "learning_rate": 8.393158268275372e-05, + "loss": 0.047878941893577574, + "step": 113250 + }, + { + "epoch": 16.07665010645848, + "grad_norm": 0.5964030027389526, + "learning_rate": 8.393016323633784e-05, + "loss": 0.005113707855343819, + "step": 113260 + }, + { + "epoch": 16.07806955287438, + "grad_norm": 2.3050570487976074, + "learning_rate": 8.392888573456352e-05, + "loss": 0.08484262824058533, + "step": 113270 + }, + { + "epoch": 16.079488999290277, + "grad_norm": 1.124589204788208, + "learning_rate": 8.392746628814763e-05, + "loss": 0.017958565056324004, + "step": 113280 + }, + { + "epoch": 16.080908445706175, + "grad_norm": 0.08736645430326462, + "learning_rate": 8.392604684173173e-05, + "loss": 0.029423543810844423, + "step": 113290 + }, + { + "epoch": 16.082327892122073, + "grad_norm": 0.009426097385585308, + "learning_rate": 8.392462739531584e-05, + "loss": 0.0018295619636774063, + "step": 113300 + }, + { + "epoch": 16.08374733853797, + "grad_norm": 1.1997662782669067, + "learning_rate": 8.392320794889992e-05, + "loss": 0.01612427681684494, + "step": 113310 + }, + { + "epoch": 16.085166784953866, + "grad_norm": 0.032743312418460846, + "learning_rate": 8.392178850248403e-05, + "loss": 0.02764788568019867, + "step": 113320 + }, + { + "epoch": 16.086586231369765, + "grad_norm": 
1.5302761793136597, + "learning_rate": 8.392036905606813e-05, + "loss": 0.05237635374069214, + "step": 113330 + }, + { + "epoch": 16.088005677785663, + "grad_norm": 0.0579666830599308, + "learning_rate": 8.391894960965224e-05, + "loss": 0.05110551118850708, + "step": 113340 + }, + { + "epoch": 16.08942512420156, + "grad_norm": 0.18044817447662354, + "learning_rate": 8.391753016323634e-05, + "loss": 0.02474549263715744, + "step": 113350 + }, + { + "epoch": 16.09084457061746, + "grad_norm": 0.3958684206008911, + "learning_rate": 8.391611071682044e-05, + "loss": 0.03521615266799927, + "step": 113360 + }, + { + "epoch": 16.092264017033358, + "grad_norm": 0.5362204313278198, + "learning_rate": 8.391469127040455e-05, + "loss": 0.051009106636047366, + "step": 113370 + }, + { + "epoch": 16.093683463449256, + "grad_norm": 0.08517732471227646, + "learning_rate": 8.391327182398865e-05, + "loss": 0.016589146852493287, + "step": 113380 + }, + { + "epoch": 16.09510290986515, + "grad_norm": 0.5726885795593262, + "learning_rate": 8.391185237757276e-05, + "loss": 0.012025222927331925, + "step": 113390 + }, + { + "epoch": 16.09652235628105, + "grad_norm": 0.1441572606563568, + "learning_rate": 8.391043293115685e-05, + "loss": 0.0675897479057312, + "step": 113400 + }, + { + "epoch": 16.097941802696948, + "grad_norm": 0.4573246240615845, + "learning_rate": 8.390901348474095e-05, + "loss": 0.0028651710599660873, + "step": 113410 + }, + { + "epoch": 16.099361249112846, + "grad_norm": 0.07784921675920486, + "learning_rate": 8.390759403832505e-05, + "loss": 0.02416677474975586, + "step": 113420 + }, + { + "epoch": 16.100780695528744, + "grad_norm": 11.768951416015625, + "learning_rate": 8.390617459190916e-05, + "loss": 0.022789698839187623, + "step": 113430 + }, + { + "epoch": 16.102200141944643, + "grad_norm": 0.08935142308473587, + "learning_rate": 8.390475514549326e-05, + "loss": 0.0012326732277870179, + "step": 113440 + }, + { + "epoch": 16.10361958836054, + "grad_norm": 
1.0763702392578125, + "learning_rate": 8.390333569907737e-05, + "loss": 0.010844753682613372, + "step": 113450 + }, + { + "epoch": 16.105039034776436, + "grad_norm": 0.42075884342193604, + "learning_rate": 8.390191625266147e-05, + "loss": 0.00529075525701046, + "step": 113460 + }, + { + "epoch": 16.106458481192334, + "grad_norm": 0.32909151911735535, + "learning_rate": 8.390049680624556e-05, + "loss": 0.007352690398693085, + "step": 113470 + }, + { + "epoch": 16.107877927608232, + "grad_norm": 0.017695745453238487, + "learning_rate": 8.389907735982967e-05, + "loss": 0.008752924203872681, + "step": 113480 + }, + { + "epoch": 16.10929737402413, + "grad_norm": 14.182565689086914, + "learning_rate": 8.389765791341377e-05, + "loss": 0.020782370865345002, + "step": 113490 + }, + { + "epoch": 16.11071682044003, + "grad_norm": 0.04523881524801254, + "learning_rate": 8.389623846699788e-05, + "loss": 0.02090078145265579, + "step": 113500 + }, + { + "epoch": 16.11071682044003, + "eval_accuracy": 0.9842309404209322, + "eval_loss": 0.058538712561130524, + "eval_runtime": 32.6752, + "eval_samples_per_second": 481.312, + "eval_steps_per_second": 15.057, + "step": 113500 + }, + { + "epoch": 16.112136266855927, + "grad_norm": 1.4609222412109375, + "learning_rate": 8.389481902058197e-05, + "loss": 0.06761115193367004, + "step": 113510 + }, + { + "epoch": 16.113555713271825, + "grad_norm": 0.02035006321966648, + "learning_rate": 8.389339957416608e-05, + "loss": 0.027057936787605284, + "step": 113520 + }, + { + "epoch": 16.11497515968772, + "grad_norm": 1.180684208869934, + "learning_rate": 8.389198012775017e-05, + "loss": 0.021567445993423463, + "step": 113530 + }, + { + "epoch": 16.11639460610362, + "grad_norm": 0.0720195323228836, + "learning_rate": 8.389056068133429e-05, + "loss": 0.022597649693489076, + "step": 113540 + }, + { + "epoch": 16.117814052519517, + "grad_norm": 3.2325596809387207, + "learning_rate": 8.388914123491838e-05, + "loss": 0.03432404398918152, + "step": 113550 
+ }, + { + "epoch": 16.119233498935415, + "grad_norm": 0.2862333655357361, + "learning_rate": 8.38877217885025e-05, + "loss": 0.04290721118450165, + "step": 113560 + }, + { + "epoch": 16.120652945351313, + "grad_norm": 0.8970960974693298, + "learning_rate": 8.388630234208659e-05, + "loss": 0.02014774680137634, + "step": 113570 + }, + { + "epoch": 16.12207239176721, + "grad_norm": 0.037683311849832535, + "learning_rate": 8.388488289567069e-05, + "loss": 0.050986915826797485, + "step": 113580 + }, + { + "epoch": 16.12349183818311, + "grad_norm": 6.2578959465026855, + "learning_rate": 8.38834634492548e-05, + "loss": 0.019598402082920074, + "step": 113590 + }, + { + "epoch": 16.124911284599005, + "grad_norm": 1.3776534795761108, + "learning_rate": 8.38820440028389e-05, + "loss": 0.04389630854129791, + "step": 113600 + }, + { + "epoch": 16.126330731014903, + "grad_norm": 3.2619071006774902, + "learning_rate": 8.388062455642301e-05, + "loss": 0.02302349656820297, + "step": 113610 + }, + { + "epoch": 16.1277501774308, + "grad_norm": 0.03281306102871895, + "learning_rate": 8.387920511000709e-05, + "loss": 0.03444663286209106, + "step": 113620 + }, + { + "epoch": 16.1291696238467, + "grad_norm": 0.4675664007663727, + "learning_rate": 8.38777856635912e-05, + "loss": 0.033840471506118776, + "step": 113630 + }, + { + "epoch": 16.130589070262598, + "grad_norm": 2.2576606273651123, + "learning_rate": 8.38763662171753e-05, + "loss": 0.012754887342453003, + "step": 113640 + }, + { + "epoch": 16.132008516678496, + "grad_norm": 0.26638638973236084, + "learning_rate": 8.387494677075941e-05, + "loss": 0.02814761996269226, + "step": 113650 + }, + { + "epoch": 16.133427963094395, + "grad_norm": 0.17130306363105774, + "learning_rate": 8.387352732434352e-05, + "loss": 0.009480338543653488, + "step": 113660 + }, + { + "epoch": 16.13484740951029, + "grad_norm": 1.0494219064712524, + "learning_rate": 8.38721078779276e-05, + "loss": 0.01807313859462738, + "step": 113670 + }, + { + "epoch": 
16.136266855926188, + "grad_norm": 4.024386405944824, + "learning_rate": 8.387068843151172e-05, + "loss": 0.018029528856277465, + "step": 113680 + }, + { + "epoch": 16.137686302342086, + "grad_norm": 0.07396601140499115, + "learning_rate": 8.386926898509581e-05, + "loss": 0.009591655433177948, + "step": 113690 + }, + { + "epoch": 16.139105748757984, + "grad_norm": 0.0040374561212956905, + "learning_rate": 8.386784953867992e-05, + "loss": 0.00962514728307724, + "step": 113700 + }, + { + "epoch": 16.140525195173883, + "grad_norm": 0.0394991971552372, + "learning_rate": 8.386643009226402e-05, + "loss": 0.01851954609155655, + "step": 113710 + }, + { + "epoch": 16.14194464158978, + "grad_norm": 10.12916088104248, + "learning_rate": 8.386501064584812e-05, + "loss": 0.0460242509841919, + "step": 113720 + }, + { + "epoch": 16.14336408800568, + "grad_norm": 1.8034098148345947, + "learning_rate": 8.386359119943222e-05, + "loss": 0.0432327538728714, + "step": 113730 + }, + { + "epoch": 16.144783534421574, + "grad_norm": 2.1534202098846436, + "learning_rate": 8.386217175301633e-05, + "loss": 0.02302303910255432, + "step": 113740 + }, + { + "epoch": 16.146202980837472, + "grad_norm": 0.3321599066257477, + "learning_rate": 8.386075230660044e-05, + "loss": 0.0073155477643013, + "step": 113750 + }, + { + "epoch": 16.14762242725337, + "grad_norm": 0.5808457732200623, + "learning_rate": 8.385933286018454e-05, + "loss": 0.016782888770103456, + "step": 113760 + }, + { + "epoch": 16.14904187366927, + "grad_norm": 6.599685192108154, + "learning_rate": 8.385791341376863e-05, + "loss": 0.0414692759513855, + "step": 113770 + }, + { + "epoch": 16.150461320085167, + "grad_norm": 5.038705348968506, + "learning_rate": 8.385649396735273e-05, + "loss": 0.022276198863983153, + "step": 113780 + }, + { + "epoch": 16.151880766501066, + "grad_norm": 0.011023037135601044, + "learning_rate": 8.385507452093684e-05, + "loss": 0.008303380012512207, + "step": 113790 + }, + { + "epoch": 16.153300212916964, 
+ "grad_norm": 0.7618648409843445, + "learning_rate": 8.385365507452094e-05, + "loss": 0.046433225274086, + "step": 113800 + }, + { + "epoch": 16.15471965933286, + "grad_norm": 0.15397347509860992, + "learning_rate": 8.385223562810505e-05, + "loss": 0.02768896520137787, + "step": 113810 + }, + { + "epoch": 16.156139105748757, + "grad_norm": 0.014487593434751034, + "learning_rate": 8.385081618168913e-05, + "loss": 0.05798448324203491, + "step": 113820 + }, + { + "epoch": 16.157558552164655, + "grad_norm": 0.8680633306503296, + "learning_rate": 8.384939673527324e-05, + "loss": 0.05073647499084473, + "step": 113830 + }, + { + "epoch": 16.158977998580554, + "grad_norm": 0.09245634078979492, + "learning_rate": 8.384797728885736e-05, + "loss": 0.016138990223407746, + "step": 113840 + }, + { + "epoch": 16.160397444996452, + "grad_norm": 0.0410037636756897, + "learning_rate": 8.384655784244145e-05, + "loss": 0.006852047145366668, + "step": 113850 + }, + { + "epoch": 16.16181689141235, + "grad_norm": 7.452047348022461, + "learning_rate": 8.384513839602556e-05, + "loss": 0.030913379788398743, + "step": 113860 + }, + { + "epoch": 16.16323633782825, + "grad_norm": 0.3431392312049866, + "learning_rate": 8.384371894960965e-05, + "loss": 0.010368605703115463, + "step": 113870 + }, + { + "epoch": 16.164655784244143, + "grad_norm": 0.02049533650279045, + "learning_rate": 8.384229950319376e-05, + "loss": 0.050056666135787964, + "step": 113880 + }, + { + "epoch": 16.16607523066004, + "grad_norm": 1.6124444007873535, + "learning_rate": 8.384088005677786e-05, + "loss": 0.006237022578716278, + "step": 113890 + }, + { + "epoch": 16.16749467707594, + "grad_norm": 0.058554697781801224, + "learning_rate": 8.383946061036197e-05, + "loss": 0.017270559072494508, + "step": 113900 + }, + { + "epoch": 16.168914123491838, + "grad_norm": 0.15180853009223938, + "learning_rate": 8.383804116394606e-05, + "loss": 0.014956575632095338, + "step": 113910 + }, + { + "epoch": 16.170333569907736, + 
"grad_norm": 0.9848091006278992, + "learning_rate": 8.383662171753018e-05, + "loss": 0.014708085358142853, + "step": 113920 + }, + { + "epoch": 16.171753016323635, + "grad_norm": 3.5492801666259766, + "learning_rate": 8.383520227111427e-05, + "loss": 0.017875519394874573, + "step": 113930 + }, + { + "epoch": 16.173172462739533, + "grad_norm": 0.6977704167366028, + "learning_rate": 8.383378282469837e-05, + "loss": 0.032053911685943605, + "step": 113940 + }, + { + "epoch": 16.174591909155428, + "grad_norm": 3.879185676574707, + "learning_rate": 8.383236337828248e-05, + "loss": 0.024631142616271973, + "step": 113950 + }, + { + "epoch": 16.176011355571326, + "grad_norm": 2.5624606609344482, + "learning_rate": 8.383094393186658e-05, + "loss": 0.028592270612716675, + "step": 113960 + }, + { + "epoch": 16.177430801987224, + "grad_norm": 0.04760686308145523, + "learning_rate": 8.382952448545069e-05, + "loss": 0.027992811799049378, + "step": 113970 + }, + { + "epoch": 16.178850248403123, + "grad_norm": 0.4733794033527374, + "learning_rate": 8.382810503903477e-05, + "loss": 0.004519284889101982, + "step": 113980 + }, + { + "epoch": 16.18026969481902, + "grad_norm": 4.214389324188232, + "learning_rate": 8.382668559261888e-05, + "loss": 0.033532992005348206, + "step": 113990 + }, + { + "epoch": 16.18168914123492, + "grad_norm": 0.059593670070171356, + "learning_rate": 8.382526614620298e-05, + "loss": 0.036214256286621095, + "step": 114000 + }, + { + "epoch": 16.18168914123492, + "eval_accuracy": 0.9774909391492338, + "eval_loss": 0.07867772877216339, + "eval_runtime": 32.61, + "eval_samples_per_second": 482.275, + "eval_steps_per_second": 15.087, + "step": 114000 + }, + { + "epoch": 16.183108587650818, + "grad_norm": 0.38623306155204773, + "learning_rate": 8.382384669978709e-05, + "loss": 0.05369272828102112, + "step": 114010 + }, + { + "epoch": 16.184528034066712, + "grad_norm": 2.5667450428009033, + "learning_rate": 8.382242725337119e-05, + "loss": 0.0825517237186432, + 
"step": 114020 + }, + { + "epoch": 16.18594748048261, + "grad_norm": 4.854561805725098, + "learning_rate": 8.382100780695529e-05, + "loss": 0.03476034104824066, + "step": 114030 + }, + { + "epoch": 16.18736692689851, + "grad_norm": 10.146092414855957, + "learning_rate": 8.38195883605394e-05, + "loss": 0.022970007359981538, + "step": 114040 + }, + { + "epoch": 16.188786373314407, + "grad_norm": 14.312685012817383, + "learning_rate": 8.38181689141235e-05, + "loss": 0.08640811443328858, + "step": 114050 + }, + { + "epoch": 16.190205819730306, + "grad_norm": 0.31443431973457336, + "learning_rate": 8.38167494677076e-05, + "loss": 0.0024741746485233305, + "step": 114060 + }, + { + "epoch": 16.191625266146204, + "grad_norm": 0.1664002686738968, + "learning_rate": 8.38153300212917e-05, + "loss": 0.014358796179294586, + "step": 114070 + }, + { + "epoch": 16.193044712562102, + "grad_norm": 0.33076465129852295, + "learning_rate": 8.38139105748758e-05, + "loss": 0.009447012841701508, + "step": 114080 + }, + { + "epoch": 16.194464158977997, + "grad_norm": 0.12226522713899612, + "learning_rate": 8.38124911284599e-05, + "loss": 0.050778812170028685, + "step": 114090 + }, + { + "epoch": 16.195883605393895, + "grad_norm": 2.3829030990600586, + "learning_rate": 8.381107168204401e-05, + "loss": 0.0029199857264757155, + "step": 114100 + }, + { + "epoch": 16.197303051809794, + "grad_norm": 0.021862417459487915, + "learning_rate": 8.38096522356281e-05, + "loss": 0.014643014967441558, + "step": 114110 + }, + { + "epoch": 16.198722498225692, + "grad_norm": 5.501955986022949, + "learning_rate": 8.380823278921222e-05, + "loss": 0.06353362798690795, + "step": 114120 + }, + { + "epoch": 16.20014194464159, + "grad_norm": 0.030185649171471596, + "learning_rate": 8.380681334279631e-05, + "loss": 0.007039766758680344, + "step": 114130 + }, + { + "epoch": 16.20156139105749, + "grad_norm": 14.978668212890625, + "learning_rate": 8.380539389638041e-05, + "loss": 0.021280562877655028, + "step": 114140 
+ }, + { + "epoch": 16.202980837473387, + "grad_norm": 0.18859194219112396, + "learning_rate": 8.380397444996452e-05, + "loss": 0.04804631173610687, + "step": 114150 + }, + { + "epoch": 16.20440028388928, + "grad_norm": 0.3008385896682739, + "learning_rate": 8.380255500354862e-05, + "loss": 0.020325048267841338, + "step": 114160 + }, + { + "epoch": 16.20581973030518, + "grad_norm": 0.050892528146505356, + "learning_rate": 8.380113555713273e-05, + "loss": 0.006553132832050323, + "step": 114170 + }, + { + "epoch": 16.207239176721078, + "grad_norm": 17.227819442749023, + "learning_rate": 8.379971611071682e-05, + "loss": 0.04515390396118164, + "step": 114180 + }, + { + "epoch": 16.208658623136976, + "grad_norm": 0.6563828587532043, + "learning_rate": 8.379829666430093e-05, + "loss": 0.04622732698917389, + "step": 114190 + }, + { + "epoch": 16.210078069552875, + "grad_norm": 0.5986719131469727, + "learning_rate": 8.379687721788502e-05, + "loss": 0.008560654520988465, + "step": 114200 + }, + { + "epoch": 16.211497515968773, + "grad_norm": 5.077998161315918, + "learning_rate": 8.379545777146913e-05, + "loss": 0.01993117332458496, + "step": 114210 + }, + { + "epoch": 16.21291696238467, + "grad_norm": 1.5840833187103271, + "learning_rate": 8.379403832505323e-05, + "loss": 0.040464064478874205, + "step": 114220 + }, + { + "epoch": 16.214336408800566, + "grad_norm": 1.6033178567886353, + "learning_rate": 8.379261887863733e-05, + "loss": 0.03339195549488068, + "step": 114230 + }, + { + "epoch": 16.215755855216464, + "grad_norm": 0.9904631972312927, + "learning_rate": 8.379119943222144e-05, + "loss": 0.054726976156234744, + "step": 114240 + }, + { + "epoch": 16.217175301632363, + "grad_norm": 5.803867340087891, + "learning_rate": 8.378977998580554e-05, + "loss": 0.014914526045322419, + "step": 114250 + }, + { + "epoch": 16.21859474804826, + "grad_norm": 9.74636459350586, + "learning_rate": 8.378836053938965e-05, + "loss": 0.054322832822799684, + "step": 114260 + }, + { + 
"epoch": 16.22001419446416, + "grad_norm": 0.028677405789494514, + "learning_rate": 8.378694109297375e-05, + "loss": 0.03889646232128143, + "step": 114270 + }, + { + "epoch": 16.221433640880058, + "grad_norm": 5.0605149269104, + "learning_rate": 8.378552164655786e-05, + "loss": 0.017887987196445465, + "step": 114280 + }, + { + "epoch": 16.222853087295956, + "grad_norm": 0.1755291372537613, + "learning_rate": 8.378410220014194e-05, + "loss": 0.01712374985218048, + "step": 114290 + }, + { + "epoch": 16.22427253371185, + "grad_norm": 0.1283310055732727, + "learning_rate": 8.378268275372605e-05, + "loss": 0.03941631317138672, + "step": 114300 + }, + { + "epoch": 16.22569198012775, + "grad_norm": 0.12080294638872147, + "learning_rate": 8.378126330731015e-05, + "loss": 0.03501139581203461, + "step": 114310 + }, + { + "epoch": 16.227111426543647, + "grad_norm": 0.21980896592140198, + "learning_rate": 8.377984386089426e-05, + "loss": 0.01078852266073227, + "step": 114320 + }, + { + "epoch": 16.228530872959546, + "grad_norm": 3.0699965953826904, + "learning_rate": 8.377842441447836e-05, + "loss": 0.09886000752449035, + "step": 114330 + }, + { + "epoch": 16.229950319375444, + "grad_norm": 0.06542843580245972, + "learning_rate": 8.377700496806245e-05, + "loss": 0.029817229509353636, + "step": 114340 + }, + { + "epoch": 16.231369765791342, + "grad_norm": 0.34764522314071655, + "learning_rate": 8.377558552164657e-05, + "loss": 0.0022600889205932617, + "step": 114350 + }, + { + "epoch": 16.23278921220724, + "grad_norm": 0.052265383303165436, + "learning_rate": 8.377416607523066e-05, + "loss": 0.047760218381881714, + "step": 114360 + }, + { + "epoch": 16.234208658623135, + "grad_norm": 6.4861931800842285, + "learning_rate": 8.377274662881477e-05, + "loss": 0.027695602178573607, + "step": 114370 + }, + { + "epoch": 16.235628105039034, + "grad_norm": 3.700791358947754, + "learning_rate": 8.377132718239887e-05, + "loss": 0.01770424097776413, + "step": 114380 + }, + { + "epoch": 
16.237047551454932, + "grad_norm": 5.992948055267334, + "learning_rate": 8.376990773598297e-05, + "loss": 0.03541705012321472, + "step": 114390 + }, + { + "epoch": 16.23846699787083, + "grad_norm": 2.1976397037506104, + "learning_rate": 8.376848828956707e-05, + "loss": 0.022951729595661163, + "step": 114400 + }, + { + "epoch": 16.23988644428673, + "grad_norm": 1.044998288154602, + "learning_rate": 8.376706884315118e-05, + "loss": 0.01119830459356308, + "step": 114410 + }, + { + "epoch": 16.241305890702627, + "grad_norm": 0.030515599995851517, + "learning_rate": 8.376564939673527e-05, + "loss": 0.024385225772857667, + "step": 114420 + }, + { + "epoch": 16.242725337118525, + "grad_norm": 2.9850716590881348, + "learning_rate": 8.376422995031939e-05, + "loss": 0.011873158812522887, + "step": 114430 + }, + { + "epoch": 16.24414478353442, + "grad_norm": 8.733880043029785, + "learning_rate": 8.376281050390348e-05, + "loss": 0.05018941760063171, + "step": 114440 + }, + { + "epoch": 16.24556422995032, + "grad_norm": 3.0520923137664795, + "learning_rate": 8.376139105748758e-05, + "loss": 0.029634937644004822, + "step": 114450 + }, + { + "epoch": 16.246983676366217, + "grad_norm": 0.42049431800842285, + "learning_rate": 8.375997161107169e-05, + "loss": 0.009971027076244355, + "step": 114460 + }, + { + "epoch": 16.248403122782115, + "grad_norm": 0.03036337159574032, + "learning_rate": 8.375855216465579e-05, + "loss": 0.01775699257850647, + "step": 114470 + }, + { + "epoch": 16.249822569198013, + "grad_norm": 5.403947353363037, + "learning_rate": 8.37571327182399e-05, + "loss": 0.041061696410179135, + "step": 114480 + }, + { + "epoch": 16.25124201561391, + "grad_norm": 0.026970932260155678, + "learning_rate": 8.375571327182398e-05, + "loss": 0.0037163462489843368, + "step": 114490 + }, + { + "epoch": 16.25266146202981, + "grad_norm": 0.009657211601734161, + "learning_rate": 8.37542938254081e-05, + "loss": 0.005383031442761421, + "step": 114500 + }, + { + "epoch": 
16.25266146202981, + "eval_accuracy": 0.9855662236917403, + "eval_loss": 0.053308937698602676, + "eval_runtime": 32.8268, + "eval_samples_per_second": 479.091, + "eval_steps_per_second": 14.988, + "step": 114500 + }, + { + "epoch": 16.254080908445705, + "grad_norm": 3.289928436279297, + "learning_rate": 8.375287437899219e-05, + "loss": 0.04177338480949402, + "step": 114510 + }, + { + "epoch": 16.255500354861603, + "grad_norm": 1.0733915567398071, + "learning_rate": 8.37514549325763e-05, + "loss": 0.006131677702069282, + "step": 114520 + }, + { + "epoch": 16.2569198012775, + "grad_norm": 5.283725261688232, + "learning_rate": 8.37500354861604e-05, + "loss": 0.0118058480322361, + "step": 114530 + }, + { + "epoch": 16.2583392476934, + "grad_norm": 0.19665656983852386, + "learning_rate": 8.37486160397445e-05, + "loss": 0.0037314273416996003, + "step": 114540 + }, + { + "epoch": 16.259758694109298, + "grad_norm": 0.6442742347717285, + "learning_rate": 8.374719659332861e-05, + "loss": 0.017784593999385832, + "step": 114550 + }, + { + "epoch": 16.261178140525196, + "grad_norm": 0.9198021292686462, + "learning_rate": 8.37457771469127e-05, + "loss": 0.027311056852340698, + "step": 114560 + }, + { + "epoch": 16.262597586941094, + "grad_norm": 0.5428899526596069, + "learning_rate": 8.374435770049682e-05, + "loss": 0.008149975538253784, + "step": 114570 + }, + { + "epoch": 16.26401703335699, + "grad_norm": 0.33625754714012146, + "learning_rate": 8.374293825408091e-05, + "loss": 0.014088863134384155, + "step": 114580 + }, + { + "epoch": 16.265436479772887, + "grad_norm": 5.278527736663818, + "learning_rate": 8.374151880766501e-05, + "loss": 0.02593829035758972, + "step": 114590 + }, + { + "epoch": 16.266855926188786, + "grad_norm": 0.8755744695663452, + "learning_rate": 8.374009936124911e-05, + "loss": 0.01804676800966263, + "step": 114600 + }, + { + "epoch": 16.268275372604684, + "grad_norm": 3.1491434574127197, + "learning_rate": 8.373867991483322e-05, + "loss": 
0.008193667232990264, + "step": 114610 + }, + { + "epoch": 16.269694819020582, + "grad_norm": 0.10345172882080078, + "learning_rate": 8.373726046841732e-05, + "loss": 0.03468181788921356, + "step": 114620 + }, + { + "epoch": 16.27111426543648, + "grad_norm": 10.353991508483887, + "learning_rate": 8.373584102200143e-05, + "loss": 0.02742985785007477, + "step": 114630 + }, + { + "epoch": 16.27253371185238, + "grad_norm": 0.13133634626865387, + "learning_rate": 8.373442157558553e-05, + "loss": 0.012948381900787353, + "step": 114640 + }, + { + "epoch": 16.273953158268274, + "grad_norm": 0.13561446964740753, + "learning_rate": 8.373300212916962e-05, + "loss": 0.03252634406089783, + "step": 114650 + }, + { + "epoch": 16.275372604684172, + "grad_norm": 0.01328427903354168, + "learning_rate": 8.373158268275373e-05, + "loss": 0.05154916048049927, + "step": 114660 + }, + { + "epoch": 16.27679205110007, + "grad_norm": 4.751542091369629, + "learning_rate": 8.373016323633783e-05, + "loss": 0.03669569492340088, + "step": 114670 + }, + { + "epoch": 16.27821149751597, + "grad_norm": 0.4849165976047516, + "learning_rate": 8.372874378992194e-05, + "loss": 0.006884780526161194, + "step": 114680 + }, + { + "epoch": 16.279630943931867, + "grad_norm": 2.379690647125244, + "learning_rate": 8.372732434350604e-05, + "loss": 0.009375137090682984, + "step": 114690 + }, + { + "epoch": 16.281050390347765, + "grad_norm": 1.0215786695480347, + "learning_rate": 8.372590489709014e-05, + "loss": 0.025673750042915344, + "step": 114700 + }, + { + "epoch": 16.282469836763664, + "grad_norm": 6.123466491699219, + "learning_rate": 8.372448545067423e-05, + "loss": 0.02046940624713898, + "step": 114710 + }, + { + "epoch": 16.28388928317956, + "grad_norm": 0.04557579383254051, + "learning_rate": 8.372306600425834e-05, + "loss": 0.011663874983787537, + "step": 114720 + }, + { + "epoch": 16.285308729595457, + "grad_norm": 2.435971736907959, + "learning_rate": 8.372164655784244e-05, + "loss": 
0.02507122457027435, + "step": 114730 + }, + { + "epoch": 16.286728176011355, + "grad_norm": 0.46252408623695374, + "learning_rate": 8.372022711142655e-05, + "loss": 0.0026042815297842026, + "step": 114740 + }, + { + "epoch": 16.288147622427253, + "grad_norm": 0.07082727551460266, + "learning_rate": 8.371880766501065e-05, + "loss": 0.014566010236740113, + "step": 114750 + }, + { + "epoch": 16.28956706884315, + "grad_norm": 3.0574235916137695, + "learning_rate": 8.371738821859475e-05, + "loss": 0.02677365243434906, + "step": 114760 + }, + { + "epoch": 16.29098651525905, + "grad_norm": 0.7447109222412109, + "learning_rate": 8.371596877217886e-05, + "loss": 0.007968011498451232, + "step": 114770 + }, + { + "epoch": 16.292405961674948, + "grad_norm": 5.180814743041992, + "learning_rate": 8.371454932576296e-05, + "loss": 0.020020869374275208, + "step": 114780 + }, + { + "epoch": 16.293825408090843, + "grad_norm": 3.603285789489746, + "learning_rate": 8.371312987934707e-05, + "loss": 0.029456543922424316, + "step": 114790 + }, + { + "epoch": 16.29524485450674, + "grad_norm": 0.4434371888637543, + "learning_rate": 8.371171043293115e-05, + "loss": 0.04266427755355835, + "step": 114800 + }, + { + "epoch": 16.29666430092264, + "grad_norm": 0.6044859886169434, + "learning_rate": 8.371029098651526e-05, + "loss": 0.04656971991062164, + "step": 114810 + }, + { + "epoch": 16.298083747338538, + "grad_norm": 0.04528965428471565, + "learning_rate": 8.370887154009936e-05, + "loss": 0.01092740148305893, + "step": 114820 + }, + { + "epoch": 16.299503193754436, + "grad_norm": 0.0253436379134655, + "learning_rate": 8.370745209368347e-05, + "loss": 0.04149647951126099, + "step": 114830 + }, + { + "epoch": 16.300922640170334, + "grad_norm": 0.33020728826522827, + "learning_rate": 8.370603264726757e-05, + "loss": 0.029522156715393065, + "step": 114840 + }, + { + "epoch": 16.302342086586233, + "grad_norm": 3.6205527782440186, + "learning_rate": 8.370461320085166e-05, + "loss": 
0.02586008906364441, + "step": 114850 + }, + { + "epoch": 16.303761533002127, + "grad_norm": 0.2821308970451355, + "learning_rate": 8.370319375443578e-05, + "loss": 0.013489672541618347, + "step": 114860 + }, + { + "epoch": 16.305180979418026, + "grad_norm": 0.031373679637908936, + "learning_rate": 8.370177430801987e-05, + "loss": 0.013849031925201417, + "step": 114870 + }, + { + "epoch": 16.306600425833924, + "grad_norm": 5.040038108825684, + "learning_rate": 8.370035486160398e-05, + "loss": 0.02151748687028885, + "step": 114880 + }, + { + "epoch": 16.308019872249822, + "grad_norm": 0.03968564793467522, + "learning_rate": 8.369893541518808e-05, + "loss": 0.024423137307167053, + "step": 114890 + }, + { + "epoch": 16.30943931866572, + "grad_norm": 0.42807242274284363, + "learning_rate": 8.369751596877218e-05, + "loss": 0.025084248185157774, + "step": 114900 + }, + { + "epoch": 16.31085876508162, + "grad_norm": 9.006585121154785, + "learning_rate": 8.369609652235628e-05, + "loss": 0.016592279076576233, + "step": 114910 + }, + { + "epoch": 16.312278211497517, + "grad_norm": 0.06665924936532974, + "learning_rate": 8.369467707594039e-05, + "loss": 0.020814579725265504, + "step": 114920 + }, + { + "epoch": 16.313697657913412, + "grad_norm": 7.368483066558838, + "learning_rate": 8.369325762952448e-05, + "loss": 0.022772680222988128, + "step": 114930 + }, + { + "epoch": 16.31511710432931, + "grad_norm": 0.04184337332844734, + "learning_rate": 8.36918381831086e-05, + "loss": 0.003722415864467621, + "step": 114940 + }, + { + "epoch": 16.31653655074521, + "grad_norm": 0.12398586422204971, + "learning_rate": 8.369041873669269e-05, + "loss": 0.014785408973693848, + "step": 114950 + }, + { + "epoch": 16.317955997161107, + "grad_norm": 7.1915764808654785, + "learning_rate": 8.368899929027679e-05, + "loss": 0.04960554540157318, + "step": 114960 + }, + { + "epoch": 16.319375443577005, + "grad_norm": 0.316967248916626, + "learning_rate": 8.36875798438609e-05, + "loss": 
0.08032472729682923, + "step": 114970 + }, + { + "epoch": 16.320794889992904, + "grad_norm": 0.14770057797431946, + "learning_rate": 8.3686160397445e-05, + "loss": 0.012001116573810578, + "step": 114980 + }, + { + "epoch": 16.322214336408802, + "grad_norm": 0.02392687276005745, + "learning_rate": 8.368474095102911e-05, + "loss": 0.03899045586585999, + "step": 114990 + }, + { + "epoch": 16.323633782824697, + "grad_norm": 0.5080669522285461, + "learning_rate": 8.36833215046132e-05, + "loss": 0.06272426843643189, + "step": 115000 + }, + { + "epoch": 16.323633782824697, + "eval_accuracy": 0.9827684873148089, + "eval_loss": 0.05823565647006035, + "eval_runtime": 33.363, + "eval_samples_per_second": 471.39, + "eval_steps_per_second": 14.747, + "step": 115000 + }, + { + "epoch": 16.325053229240595, + "grad_norm": 1.3705179691314697, + "learning_rate": 8.36819020581973e-05, + "loss": 0.03353260755538941, + "step": 115010 + }, + { + "epoch": 16.326472675656493, + "grad_norm": 0.10776552557945251, + "learning_rate": 8.36804826117814e-05, + "loss": 0.017255675792694092, + "step": 115020 + }, + { + "epoch": 16.32789212207239, + "grad_norm": 0.04744476079940796, + "learning_rate": 8.367906316536551e-05, + "loss": 0.025518766045570372, + "step": 115030 + }, + { + "epoch": 16.32931156848829, + "grad_norm": 1.8002848625183105, + "learning_rate": 8.367764371894961e-05, + "loss": 0.013162341713905335, + "step": 115040 + }, + { + "epoch": 16.330731014904188, + "grad_norm": 7.3443732261657715, + "learning_rate": 8.367622427253372e-05, + "loss": 0.01212470680475235, + "step": 115050 + }, + { + "epoch": 16.332150461320087, + "grad_norm": 2.0661675930023193, + "learning_rate": 8.367480482611782e-05, + "loss": 0.033673611283302304, + "step": 115060 + }, + { + "epoch": 16.33356990773598, + "grad_norm": 0.07441291958093643, + "learning_rate": 8.367338537970192e-05, + "loss": 0.0027853518724441527, + "step": 115070 + }, + { + "epoch": 16.33498935415188, + "grad_norm": 1.5034619569778442, + 
"learning_rate": 8.367196593328603e-05, + "loss": 0.008223128318786622, + "step": 115080 + }, + { + "epoch": 16.336408800567778, + "grad_norm": 0.20786134898662567, + "learning_rate": 8.367054648687012e-05, + "loss": 0.004812454804778099, + "step": 115090 + }, + { + "epoch": 16.337828246983676, + "grad_norm": 0.5978586673736572, + "learning_rate": 8.366912704045423e-05, + "loss": 0.027519500255584715, + "step": 115100 + }, + { + "epoch": 16.339247693399575, + "grad_norm": 0.2658475339412689, + "learning_rate": 8.366770759403832e-05, + "loss": 0.026401248574256898, + "step": 115110 + }, + { + "epoch": 16.340667139815473, + "grad_norm": 0.026537051424384117, + "learning_rate": 8.366628814762243e-05, + "loss": 0.026840895414352417, + "step": 115120 + }, + { + "epoch": 16.34208658623137, + "grad_norm": 0.2847868800163269, + "learning_rate": 8.366486870120653e-05, + "loss": 0.007615000009536743, + "step": 115130 + }, + { + "epoch": 16.343506032647266, + "grad_norm": 8.656590461730957, + "learning_rate": 8.366344925479064e-05, + "loss": 0.03938590884208679, + "step": 115140 + }, + { + "epoch": 16.344925479063164, + "grad_norm": 0.22449363768100739, + "learning_rate": 8.366202980837475e-05, + "loss": 0.00203959122300148, + "step": 115150 + }, + { + "epoch": 16.346344925479062, + "grad_norm": 0.11375907808542252, + "learning_rate": 8.366061036195883e-05, + "loss": 0.006970361620187759, + "step": 115160 + }, + { + "epoch": 16.34776437189496, + "grad_norm": 0.22546268999576569, + "learning_rate": 8.365919091554294e-05, + "loss": 0.00478266179561615, + "step": 115170 + }, + { + "epoch": 16.34918381831086, + "grad_norm": 0.8449538946151733, + "learning_rate": 8.365777146912704e-05, + "loss": 0.0027214929461479185, + "step": 115180 + }, + { + "epoch": 16.350603264726757, + "grad_norm": 0.7154591083526611, + "learning_rate": 8.365635202271115e-05, + "loss": 0.04068278968334198, + "step": 115190 + }, + { + "epoch": 16.352022711142656, + "grad_norm": 3.3361713886260986, + 
"learning_rate": 8.365493257629525e-05, + "loss": 0.006870243698358536, + "step": 115200 + }, + { + "epoch": 16.35344215755855, + "grad_norm": 11.387276649475098, + "learning_rate": 8.365351312987935e-05, + "loss": 0.024857263267040252, + "step": 115210 + }, + { + "epoch": 16.35486160397445, + "grad_norm": 1.0348623991012573, + "learning_rate": 8.365209368346344e-05, + "loss": 0.007273174822330475, + "step": 115220 + }, + { + "epoch": 16.356281050390347, + "grad_norm": 0.0277334563434124, + "learning_rate": 8.365067423704755e-05, + "loss": 0.005058244615793228, + "step": 115230 + }, + { + "epoch": 16.357700496806245, + "grad_norm": 0.16035868227481842, + "learning_rate": 8.364925479063167e-05, + "loss": 0.024697315692901612, + "step": 115240 + }, + { + "epoch": 16.359119943222144, + "grad_norm": 8.151751518249512, + "learning_rate": 8.364783534421576e-05, + "loss": 0.01324802339076996, + "step": 115250 + }, + { + "epoch": 16.360539389638042, + "grad_norm": 0.006740411277860403, + "learning_rate": 8.364641589779986e-05, + "loss": 0.009046432375907899, + "step": 115260 + }, + { + "epoch": 16.36195883605394, + "grad_norm": 9.100384712219238, + "learning_rate": 8.364499645138396e-05, + "loss": 0.03189237117767334, + "step": 115270 + }, + { + "epoch": 16.363378282469835, + "grad_norm": 0.5900508761405945, + "learning_rate": 8.364357700496807e-05, + "loss": 0.004971956834197044, + "step": 115280 + }, + { + "epoch": 16.364797728885733, + "grad_norm": 2.169576406478882, + "learning_rate": 8.364215755855217e-05, + "loss": 0.012622570991516114, + "step": 115290 + }, + { + "epoch": 16.36621717530163, + "grad_norm": 3.9675159454345703, + "learning_rate": 8.364073811213628e-05, + "loss": 0.04543294906616211, + "step": 115300 + }, + { + "epoch": 16.36763662171753, + "grad_norm": 0.21474087238311768, + "learning_rate": 8.363931866572036e-05, + "loss": 0.07644522786140442, + "step": 115310 + }, + { + "epoch": 16.36905606813343, + "grad_norm": 11.529156684875488, + "learning_rate": 
8.363789921930447e-05, + "loss": 0.08131443858146667, + "step": 115320 + }, + { + "epoch": 16.370475514549327, + "grad_norm": 0.2404913604259491, + "learning_rate": 8.363647977288858e-05, + "loss": 0.034796294569969174, + "step": 115330 + }, + { + "epoch": 16.371894960965225, + "grad_norm": 0.11170510947704315, + "learning_rate": 8.363506032647268e-05, + "loss": 0.0060304529964923855, + "step": 115340 + }, + { + "epoch": 16.37331440738112, + "grad_norm": 0.08581695705652237, + "learning_rate": 8.363364088005679e-05, + "loss": 0.018119427561759948, + "step": 115350 + }, + { + "epoch": 16.374733853797018, + "grad_norm": 0.657220184803009, + "learning_rate": 8.363222143364089e-05, + "loss": 0.015167847275733948, + "step": 115360 + }, + { + "epoch": 16.376153300212916, + "grad_norm": 0.07838866859674454, + "learning_rate": 8.363080198722499e-05, + "loss": 0.011263452470302582, + "step": 115370 + }, + { + "epoch": 16.377572746628815, + "grad_norm": 1.0153034925460815, + "learning_rate": 8.362938254080908e-05, + "loss": 0.005640817433595657, + "step": 115380 + }, + { + "epoch": 16.378992193044713, + "grad_norm": 0.6309021711349487, + "learning_rate": 8.36279630943932e-05, + "loss": 0.017265193164348602, + "step": 115390 + }, + { + "epoch": 16.38041163946061, + "grad_norm": 0.010063917376101017, + "learning_rate": 8.362654364797729e-05, + "loss": 0.019005651772022247, + "step": 115400 + }, + { + "epoch": 16.38183108587651, + "grad_norm": 0.26570451259613037, + "learning_rate": 8.36251242015614e-05, + "loss": 0.009477096050977707, + "step": 115410 + }, + { + "epoch": 16.383250532292404, + "grad_norm": 0.41002532839775085, + "learning_rate": 8.36237047551455e-05, + "loss": 0.008529486507177353, + "step": 115420 + }, + { + "epoch": 16.384669978708303, + "grad_norm": 14.01142692565918, + "learning_rate": 8.36222853087296e-05, + "loss": 0.028902316093444826, + "step": 115430 + }, + { + "epoch": 16.3860894251242, + "grad_norm": 0.2603977918624878, + "learning_rate": 
8.362086586231371e-05, + "loss": 0.004179652780294418, + "step": 115440 + }, + { + "epoch": 16.3875088715401, + "grad_norm": 1.5551576614379883, + "learning_rate": 8.36194464158978e-05, + "loss": 0.008667966723442078, + "step": 115450 + }, + { + "epoch": 16.388928317955997, + "grad_norm": 0.09633925557136536, + "learning_rate": 8.361802696948192e-05, + "loss": 0.031859517097473145, + "step": 115460 + }, + { + "epoch": 16.390347764371896, + "grad_norm": 7.917778491973877, + "learning_rate": 8.3616607523066e-05, + "loss": 0.019880032539367674, + "step": 115470 + }, + { + "epoch": 16.391767210787794, + "grad_norm": 0.0779050812125206, + "learning_rate": 8.36153300212917e-05, + "loss": 0.03968299627304077, + "step": 115480 + }, + { + "epoch": 16.39318665720369, + "grad_norm": 0.15087741613388062, + "learning_rate": 8.36139105748758e-05, + "loss": 0.016289661824703216, + "step": 115490 + }, + { + "epoch": 16.394606103619587, + "grad_norm": 0.2024311125278473, + "learning_rate": 8.361249112845991e-05, + "loss": 0.00186014324426651, + "step": 115500 + }, + { + "epoch": 16.394606103619587, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.049540504813194275, + "eval_runtime": 32.6337, + "eval_samples_per_second": 481.925, + "eval_steps_per_second": 15.076, + "step": 115500 + }, + { + "epoch": 16.396025550035485, + "grad_norm": 4.084429740905762, + "learning_rate": 8.3611071682044e-05, + "loss": 0.030351912975311278, + "step": 115510 + }, + { + "epoch": 16.397444996451384, + "grad_norm": 2.075824022293091, + "learning_rate": 8.360965223562812e-05, + "loss": 0.022257208824157715, + "step": 115520 + }, + { + "epoch": 16.398864442867282, + "grad_norm": 0.05323343724012375, + "learning_rate": 8.360823278921221e-05, + "loss": 0.009775599837303162, + "step": 115530 + }, + { + "epoch": 16.40028388928318, + "grad_norm": 0.227037250995636, + "learning_rate": 8.360681334279631e-05, + "loss": 0.01040622591972351, + "step": 115540 + }, + { + "epoch": 16.40170333569908, + 
"grad_norm": 0.36200225353240967, + "learning_rate": 8.360539389638041e-05, + "loss": 0.03365554213523865, + "step": 115550 + }, + { + "epoch": 16.403122782114973, + "grad_norm": 12.0408296585083, + "learning_rate": 8.360397444996452e-05, + "loss": 0.023565518856048583, + "step": 115560 + }, + { + "epoch": 16.40454222853087, + "grad_norm": 14.00751781463623, + "learning_rate": 8.360255500354862e-05, + "loss": 0.024163755774497985, + "step": 115570 + }, + { + "epoch": 16.40596167494677, + "grad_norm": 0.039277564734220505, + "learning_rate": 8.360113555713273e-05, + "loss": 0.03428987562656403, + "step": 115580 + }, + { + "epoch": 16.40738112136267, + "grad_norm": 9.152158737182617, + "learning_rate": 8.359971611071682e-05, + "loss": 0.06883146166801453, + "step": 115590 + }, + { + "epoch": 16.408800567778567, + "grad_norm": 0.1360839605331421, + "learning_rate": 8.359829666430092e-05, + "loss": 0.00723818838596344, + "step": 115600 + }, + { + "epoch": 16.410220014194465, + "grad_norm": 7.627287864685059, + "learning_rate": 8.359687721788503e-05, + "loss": 0.021432147920131685, + "step": 115610 + }, + { + "epoch": 16.411639460610363, + "grad_norm": 0.02474578469991684, + "learning_rate": 8.359545777146913e-05, + "loss": 0.011796319484710693, + "step": 115620 + }, + { + "epoch": 16.413058907026258, + "grad_norm": 2.2613930702209473, + "learning_rate": 8.359403832505324e-05, + "loss": 0.007085627317428589, + "step": 115630 + }, + { + "epoch": 16.414478353442156, + "grad_norm": 0.021743113175034523, + "learning_rate": 8.359261887863734e-05, + "loss": 0.011453683674335479, + "step": 115640 + }, + { + "epoch": 16.415897799858055, + "grad_norm": 0.01169249601662159, + "learning_rate": 8.359119943222144e-05, + "loss": 0.034659892320632935, + "step": 115650 + }, + { + "epoch": 16.417317246273953, + "grad_norm": 10.435478210449219, + "learning_rate": 8.358977998580553e-05, + "loss": 0.01766133904457092, + "step": 115660 + }, + { + "epoch": 16.41873669268985, + "grad_norm": 
0.17260213196277618, + "learning_rate": 8.358836053938964e-05, + "loss": 0.00581606812775135, + "step": 115670 + }, + { + "epoch": 16.42015613910575, + "grad_norm": 2.1862330436706543, + "learning_rate": 8.358694109297374e-05, + "loss": 0.04490639269351959, + "step": 115680 + }, + { + "epoch": 16.421575585521648, + "grad_norm": 0.038389623165130615, + "learning_rate": 8.358552164655785e-05, + "loss": 0.014942404627799988, + "step": 115690 + }, + { + "epoch": 16.422995031937543, + "grad_norm": 0.029459958896040916, + "learning_rate": 8.358410220014195e-05, + "loss": 0.03503639698028564, + "step": 115700 + }, + { + "epoch": 16.42441447835344, + "grad_norm": 0.2350374162197113, + "learning_rate": 8.358268275372605e-05, + "loss": 0.012763646245002747, + "step": 115710 + }, + { + "epoch": 16.42583392476934, + "grad_norm": 0.03455529734492302, + "learning_rate": 8.358126330731016e-05, + "loss": 0.022325176000595092, + "step": 115720 + }, + { + "epoch": 16.427253371185238, + "grad_norm": 0.20192380249500275, + "learning_rate": 8.357984386089426e-05, + "loss": 0.0248690664768219, + "step": 115730 + }, + { + "epoch": 16.428672817601136, + "grad_norm": 12.080421447753906, + "learning_rate": 8.357842441447837e-05, + "loss": 0.026702883839607238, + "step": 115740 + }, + { + "epoch": 16.430092264017034, + "grad_norm": 7.304847717285156, + "learning_rate": 8.357700496806245e-05, + "loss": 0.036014878749847413, + "step": 115750 + }, + { + "epoch": 16.431511710432932, + "grad_norm": 0.007642616052180529, + "learning_rate": 8.357558552164656e-05, + "loss": 0.023060834407806395, + "step": 115760 + }, + { + "epoch": 16.432931156848827, + "grad_norm": 1.2195990085601807, + "learning_rate": 8.357416607523066e-05, + "loss": 0.01461809128522873, + "step": 115770 + }, + { + "epoch": 16.434350603264726, + "grad_norm": 0.3150467574596405, + "learning_rate": 8.357274662881477e-05, + "loss": 0.015410137176513673, + "step": 115780 + }, + { + "epoch": 16.435770049680624, + "grad_norm": 
1.0256836414337158, + "learning_rate": 8.357132718239887e-05, + "loss": 0.03227716088294983, + "step": 115790 + }, + { + "epoch": 16.437189496096522, + "grad_norm": 0.051455672830343246, + "learning_rate": 8.356990773598296e-05, + "loss": 0.02644713819026947, + "step": 115800 + }, + { + "epoch": 16.43860894251242, + "grad_norm": 0.05469426512718201, + "learning_rate": 8.356848828956707e-05, + "loss": 0.02017551362514496, + "step": 115810 + }, + { + "epoch": 16.44002838892832, + "grad_norm": 0.6227673888206482, + "learning_rate": 8.356706884315117e-05, + "loss": 0.006663136184215546, + "step": 115820 + }, + { + "epoch": 16.441447835344217, + "grad_norm": 0.05501323565840721, + "learning_rate": 8.356564939673528e-05, + "loss": 0.03132602572441101, + "step": 115830 + }, + { + "epoch": 16.442867281760112, + "grad_norm": 6.319305419921875, + "learning_rate": 8.356422995031938e-05, + "loss": 0.024266751110553743, + "step": 115840 + }, + { + "epoch": 16.44428672817601, + "grad_norm": 1.0513020753860474, + "learning_rate": 8.356281050390348e-05, + "loss": 0.004631191864609719, + "step": 115850 + }, + { + "epoch": 16.44570617459191, + "grad_norm": 6.053133487701416, + "learning_rate": 8.356139105748758e-05, + "loss": 0.019884093105792998, + "step": 115860 + }, + { + "epoch": 16.447125621007807, + "grad_norm": 0.01819560118019581, + "learning_rate": 8.355997161107169e-05, + "loss": 0.009723077714443206, + "step": 115870 + }, + { + "epoch": 16.448545067423705, + "grad_norm": 0.7310961484909058, + "learning_rate": 8.355855216465578e-05, + "loss": 0.010221529006958007, + "step": 115880 + }, + { + "epoch": 16.449964513839603, + "grad_norm": 0.6194095611572266, + "learning_rate": 8.35571327182399e-05, + "loss": 0.0022616587579250337, + "step": 115890 + }, + { + "epoch": 16.4513839602555, + "grad_norm": 15.329816818237305, + "learning_rate": 8.355571327182399e-05, + "loss": 0.03254573047161102, + "step": 115900 + }, + { + "epoch": 16.4528034066714, + "grad_norm": 
0.5103831887245178, + "learning_rate": 8.355429382540809e-05, + "loss": 0.0019066516309976579, + "step": 115910 + }, + { + "epoch": 16.454222853087295, + "grad_norm": 0.43424859642982483, + "learning_rate": 8.35528743789922e-05, + "loss": 0.009562914073467255, + "step": 115920 + }, + { + "epoch": 16.455642299503193, + "grad_norm": 0.04635334387421608, + "learning_rate": 8.35514549325763e-05, + "loss": 0.019181308150291444, + "step": 115930 + }, + { + "epoch": 16.45706174591909, + "grad_norm": 5.84125280380249, + "learning_rate": 8.355003548616041e-05, + "loss": 0.06330277919769287, + "step": 115940 + }, + { + "epoch": 16.45848119233499, + "grad_norm": 0.1608128547668457, + "learning_rate": 8.354861603974449e-05, + "loss": 0.027855148911476134, + "step": 115950 + }, + { + "epoch": 16.459900638750888, + "grad_norm": 0.13251209259033203, + "learning_rate": 8.35471965933286e-05, + "loss": 0.011945770680904388, + "step": 115960 + }, + { + "epoch": 16.461320085166786, + "grad_norm": 0.12929560244083405, + "learning_rate": 8.35457771469127e-05, + "loss": 0.006080722063779831, + "step": 115970 + }, + { + "epoch": 16.462739531582685, + "grad_norm": 4.530337333679199, + "learning_rate": 8.354435770049681e-05, + "loss": 0.0706814706325531, + "step": 115980 + }, + { + "epoch": 16.46415897799858, + "grad_norm": 0.32479044795036316, + "learning_rate": 8.354293825408092e-05, + "loss": 0.05216479897499084, + "step": 115990 + }, + { + "epoch": 16.465578424414478, + "grad_norm": 0.01489038486033678, + "learning_rate": 8.354151880766502e-05, + "loss": 0.018127787113189697, + "step": 116000 + }, + { + "epoch": 16.465578424414478, + "eval_accuracy": 0.9861384879506581, + "eval_loss": 0.04969051852822304, + "eval_runtime": 32.3768, + "eval_samples_per_second": 485.749, + "eval_steps_per_second": 15.196, + "step": 116000 + }, + { + "epoch": 16.466997870830376, + "grad_norm": 0.02967519871890545, + "learning_rate": 8.354009936124912e-05, + "loss": 0.02046469897031784, + "step": 116010 + 
}, + { + "epoch": 16.468417317246274, + "grad_norm": 3.218904733657837, + "learning_rate": 8.353867991483321e-05, + "loss": 0.020984625816345213, + "step": 116020 + }, + { + "epoch": 16.469836763662173, + "grad_norm": 0.4643838703632355, + "learning_rate": 8.353726046841733e-05, + "loss": 0.06885659098625183, + "step": 116030 + }, + { + "epoch": 16.47125621007807, + "grad_norm": 14.610981941223145, + "learning_rate": 8.353584102200142e-05, + "loss": 0.07456240057945251, + "step": 116040 + }, + { + "epoch": 16.47267565649397, + "grad_norm": 11.100933074951172, + "learning_rate": 8.353442157558553e-05, + "loss": 0.04586609303951263, + "step": 116050 + }, + { + "epoch": 16.474095102909864, + "grad_norm": 0.10718601942062378, + "learning_rate": 8.353300212916962e-05, + "loss": 0.016160254180431367, + "step": 116060 + }, + { + "epoch": 16.475514549325762, + "grad_norm": 0.06104143708944321, + "learning_rate": 8.353158268275373e-05, + "loss": 0.023186489939689636, + "step": 116070 + }, + { + "epoch": 16.47693399574166, + "grad_norm": 6.335537433624268, + "learning_rate": 8.353016323633784e-05, + "loss": 0.016118551790714263, + "step": 116080 + }, + { + "epoch": 16.47835344215756, + "grad_norm": 0.003421552013605833, + "learning_rate": 8.352874378992194e-05, + "loss": 0.0062081929296255115, + "step": 116090 + }, + { + "epoch": 16.479772888573457, + "grad_norm": 4.670344829559326, + "learning_rate": 8.352732434350605e-05, + "loss": 0.021376237273216248, + "step": 116100 + }, + { + "epoch": 16.481192334989355, + "grad_norm": 5.064271926879883, + "learning_rate": 8.352590489709013e-05, + "loss": 0.055617785453796385, + "step": 116110 + }, + { + "epoch": 16.482611781405254, + "grad_norm": 10.445257186889648, + "learning_rate": 8.352448545067424e-05, + "loss": 0.029583734273910523, + "step": 116120 + }, + { + "epoch": 16.48403122782115, + "grad_norm": 0.3211866319179535, + "learning_rate": 8.352306600425834e-05, + "loss": 0.013366077840328217, + "step": 116130 + }, + { + 
"epoch": 16.485450674237047, + "grad_norm": 1.0787125825881958, + "learning_rate": 8.352164655784245e-05, + "loss": 0.03924538791179657, + "step": 116140 + }, + { + "epoch": 16.486870120652945, + "grad_norm": 0.2747179865837097, + "learning_rate": 8.352022711142655e-05, + "loss": 0.05656103491783142, + "step": 116150 + }, + { + "epoch": 16.488289567068843, + "grad_norm": 0.15979036688804626, + "learning_rate": 8.351880766501065e-05, + "loss": 0.032468026876449584, + "step": 116160 + }, + { + "epoch": 16.48970901348474, + "grad_norm": 5.031485080718994, + "learning_rate": 8.351738821859476e-05, + "loss": 0.006507077068090439, + "step": 116170 + }, + { + "epoch": 16.49112845990064, + "grad_norm": 0.0589463971555233, + "learning_rate": 8.351596877217885e-05, + "loss": 0.009158772230148316, + "step": 116180 + }, + { + "epoch": 16.49254790631654, + "grad_norm": 0.01717454195022583, + "learning_rate": 8.351454932576296e-05, + "loss": 0.014610305428504944, + "step": 116190 + }, + { + "epoch": 16.493967352732433, + "grad_norm": 0.45543742179870605, + "learning_rate": 8.351312987934706e-05, + "loss": 0.027995049953460693, + "step": 116200 + }, + { + "epoch": 16.49538679914833, + "grad_norm": 0.3700833320617676, + "learning_rate": 8.351171043293116e-05, + "loss": 0.040512260794639585, + "step": 116210 + }, + { + "epoch": 16.49680624556423, + "grad_norm": 1.1924240589141846, + "learning_rate": 8.351029098651526e-05, + "loss": 0.03392375409603119, + "step": 116220 + }, + { + "epoch": 16.498225691980128, + "grad_norm": 1.7394284009933472, + "learning_rate": 8.350887154009937e-05, + "loss": 0.010041067004203796, + "step": 116230 + }, + { + "epoch": 16.499645138396026, + "grad_norm": 0.04915410280227661, + "learning_rate": 8.350745209368347e-05, + "loss": 0.020868843793869017, + "step": 116240 + }, + { + "epoch": 16.501064584811925, + "grad_norm": 0.034044016152620316, + "learning_rate": 8.350603264726758e-05, + "loss": 0.013935981690883637, + "step": 116250 + }, + { + "epoch": 
16.502484031227823, + "grad_norm": 0.4427124261856079, + "learning_rate": 8.350461320085167e-05, + "loss": 0.03237035572528839, + "step": 116260 + }, + { + "epoch": 16.503903477643718, + "grad_norm": 0.012194049544632435, + "learning_rate": 8.350319375443577e-05, + "loss": 0.017374065518379212, + "step": 116270 + }, + { + "epoch": 16.505322924059616, + "grad_norm": 0.20120149850845337, + "learning_rate": 8.350177430801988e-05, + "loss": 0.02145771086215973, + "step": 116280 + }, + { + "epoch": 16.506742370475514, + "grad_norm": 0.022378822788596153, + "learning_rate": 8.350035486160398e-05, + "loss": 0.02700048089027405, + "step": 116290 + }, + { + "epoch": 16.508161816891413, + "grad_norm": 0.8439290523529053, + "learning_rate": 8.349893541518809e-05, + "loss": 0.02738071084022522, + "step": 116300 + }, + { + "epoch": 16.50958126330731, + "grad_norm": 0.1249089315533638, + "learning_rate": 8.349751596877217e-05, + "loss": 0.01525331735610962, + "step": 116310 + }, + { + "epoch": 16.51100070972321, + "grad_norm": 1.2921068668365479, + "learning_rate": 8.349609652235628e-05, + "loss": 0.022152553498744964, + "step": 116320 + }, + { + "epoch": 16.512420156139108, + "grad_norm": 4.518233776092529, + "learning_rate": 8.349467707594038e-05, + "loss": 0.030002409219741823, + "step": 116330 + }, + { + "epoch": 16.513839602555002, + "grad_norm": 3.4533886909484863, + "learning_rate": 8.349325762952449e-05, + "loss": 0.03558555543422699, + "step": 116340 + }, + { + "epoch": 16.5152590489709, + "grad_norm": 0.011736358515918255, + "learning_rate": 8.349183818310859e-05, + "loss": 0.05337393879890442, + "step": 116350 + }, + { + "epoch": 16.5166784953868, + "grad_norm": 0.0143458042293787, + "learning_rate": 8.34904187366927e-05, + "loss": 0.022696526348590852, + "step": 116360 + }, + { + "epoch": 16.518097941802697, + "grad_norm": 0.0822390615940094, + "learning_rate": 8.34889992902768e-05, + "loss": 0.023125678300857544, + "step": 116370 + }, + { + "epoch": 
16.519517388218595, + "grad_norm": 0.06752567738294601, + "learning_rate": 8.34875798438609e-05, + "loss": 0.02257518470287323, + "step": 116380 + }, + { + "epoch": 16.520936834634494, + "grad_norm": 11.136147499084473, + "learning_rate": 8.348616039744501e-05, + "loss": 0.023729005455970766, + "step": 116390 + }, + { + "epoch": 16.522356281050392, + "grad_norm": 0.02640192024409771, + "learning_rate": 8.34847409510291e-05, + "loss": 0.028013849258422853, + "step": 116400 + }, + { + "epoch": 16.523775727466287, + "grad_norm": 1.2313165664672852, + "learning_rate": 8.348332150461322e-05, + "loss": 0.004355183988809586, + "step": 116410 + }, + { + "epoch": 16.525195173882185, + "grad_norm": 0.11673381179571152, + "learning_rate": 8.34819020581973e-05, + "loss": 0.03289211392402649, + "step": 116420 + }, + { + "epoch": 16.526614620298083, + "grad_norm": 1.9656383991241455, + "learning_rate": 8.348048261178141e-05, + "loss": 0.03792242407798767, + "step": 116430 + }, + { + "epoch": 16.528034066713982, + "grad_norm": 0.4912014305591583, + "learning_rate": 8.347906316536551e-05, + "loss": 0.03354101479053497, + "step": 116440 + }, + { + "epoch": 16.52945351312988, + "grad_norm": 10.189749717712402, + "learning_rate": 8.347764371894962e-05, + "loss": 0.008781054615974426, + "step": 116450 + }, + { + "epoch": 16.53087295954578, + "grad_norm": 0.001697646570391953, + "learning_rate": 8.347622427253372e-05, + "loss": 0.004727036133408547, + "step": 116460 + }, + { + "epoch": 16.532292405961677, + "grad_norm": 0.20702528953552246, + "learning_rate": 8.347480482611781e-05, + "loss": 0.00879761427640915, + "step": 116470 + }, + { + "epoch": 16.53371185237757, + "grad_norm": 5.2699713706970215, + "learning_rate": 8.347338537970192e-05, + "loss": 0.012068004161119462, + "step": 116480 + }, + { + "epoch": 16.53513129879347, + "grad_norm": 2.320582389831543, + "learning_rate": 8.347196593328602e-05, + "loss": 0.005346919223666191, + "step": 116490 + }, + { + "epoch": 
16.536550745209368, + "grad_norm": 0.1566736102104187, + "learning_rate": 8.347054648687013e-05, + "loss": 0.010593122243881226, + "step": 116500 + }, + { + "epoch": 16.536550745209368, + "eval_accuracy": 0.9840401856679596, + "eval_loss": 0.061827003955841064, + "eval_runtime": 32.9233, + "eval_samples_per_second": 477.686, + "eval_steps_per_second": 14.944, + "step": 116500 + }, + { + "epoch": 16.537970191625266, + "grad_norm": 0.008619280532002449, + "learning_rate": 8.346912704045423e-05, + "loss": 0.0030029378831386566, + "step": 116510 + }, + { + "epoch": 16.539389638041165, + "grad_norm": 0.008758191019296646, + "learning_rate": 8.346770759403833e-05, + "loss": 0.02375074625015259, + "step": 116520 + }, + { + "epoch": 16.540809084457063, + "grad_norm": 1.1835864782333374, + "learning_rate": 8.346628814762242e-05, + "loss": 0.0320775032043457, + "step": 116530 + }, + { + "epoch": 16.54222853087296, + "grad_norm": 0.5120196342468262, + "learning_rate": 8.346486870120654e-05, + "loss": 0.009070900827646255, + "step": 116540 + }, + { + "epoch": 16.543647977288856, + "grad_norm": 0.06194985657930374, + "learning_rate": 8.346344925479063e-05, + "loss": 0.014806480705738067, + "step": 116550 + }, + { + "epoch": 16.545067423704754, + "grad_norm": 0.012789204716682434, + "learning_rate": 8.346202980837474e-05, + "loss": 0.014970606565475464, + "step": 116560 + }, + { + "epoch": 16.546486870120653, + "grad_norm": 0.2259586751461029, + "learning_rate": 8.346061036195884e-05, + "loss": 0.019840967655181885, + "step": 116570 + }, + { + "epoch": 16.54790631653655, + "grad_norm": 0.03640298917889595, + "learning_rate": 8.345919091554294e-05, + "loss": 0.0335048109292984, + "step": 116580 + }, + { + "epoch": 16.54932576295245, + "grad_norm": 1.2783691883087158, + "learning_rate": 8.345777146912705e-05, + "loss": 0.018758539855480195, + "step": 116590 + }, + { + "epoch": 16.550745209368348, + "grad_norm": 0.260236531496048, + "learning_rate": 8.345635202271115e-05, + "loss": 
0.04677286446094513, + "step": 116600 + }, + { + "epoch": 16.552164655784246, + "grad_norm": 0.854559063911438, + "learning_rate": 8.345493257629526e-05, + "loss": 0.01778472363948822, + "step": 116610 + }, + { + "epoch": 16.55358410220014, + "grad_norm": 17.11147117614746, + "learning_rate": 8.345351312987934e-05, + "loss": 0.057949680089950564, + "step": 116620 + }, + { + "epoch": 16.55500354861604, + "grad_norm": 4.914608001708984, + "learning_rate": 8.345209368346345e-05, + "loss": 0.006153997406363488, + "step": 116630 + }, + { + "epoch": 16.556422995031937, + "grad_norm": 0.3897269070148468, + "learning_rate": 8.345067423704755e-05, + "loss": 0.004015297442674637, + "step": 116640 + }, + { + "epoch": 16.557842441447836, + "grad_norm": 2.7007765769958496, + "learning_rate": 8.344925479063166e-05, + "loss": 0.006022479012608528, + "step": 116650 + }, + { + "epoch": 16.559261887863734, + "grad_norm": 3.9975502490997314, + "learning_rate": 8.344783534421576e-05, + "loss": 0.02496832013130188, + "step": 116660 + }, + { + "epoch": 16.560681334279632, + "grad_norm": 0.02255144529044628, + "learning_rate": 8.344641589779986e-05, + "loss": 0.0035631228238344193, + "step": 116670 + }, + { + "epoch": 16.56210078069553, + "grad_norm": 0.02827240154147148, + "learning_rate": 8.344499645138397e-05, + "loss": 0.006084645912051201, + "step": 116680 + }, + { + "epoch": 16.563520227111425, + "grad_norm": 1.6112371683120728, + "learning_rate": 8.344357700496806e-05, + "loss": 0.048271042108535764, + "step": 116690 + }, + { + "epoch": 16.564939673527324, + "grad_norm": 0.9190977811813354, + "learning_rate": 8.344215755855217e-05, + "loss": 0.03557129204273224, + "step": 116700 + }, + { + "epoch": 16.566359119943222, + "grad_norm": 0.5334121584892273, + "learning_rate": 8.344073811213627e-05, + "loss": 0.03682603240013123, + "step": 116710 + }, + { + "epoch": 16.56777856635912, + "grad_norm": 2.058285713195801, + "learning_rate": 8.343931866572038e-05, + "loss": 
0.012582701444625855, + "step": 116720 + }, + { + "epoch": 16.56919801277502, + "grad_norm": 7.025942325592041, + "learning_rate": 8.343789921930447e-05, + "loss": 0.03197660446166992, + "step": 116730 + }, + { + "epoch": 16.570617459190917, + "grad_norm": 3.6591897010803223, + "learning_rate": 8.343647977288858e-05, + "loss": 0.027104607224464415, + "step": 116740 + }, + { + "epoch": 16.572036905606815, + "grad_norm": 0.9631967544555664, + "learning_rate": 8.343506032647268e-05, + "loss": 0.015047216415405273, + "step": 116750 + }, + { + "epoch": 16.57345635202271, + "grad_norm": 0.07745859771966934, + "learning_rate": 8.343364088005679e-05, + "loss": 0.008823959529399872, + "step": 116760 + }, + { + "epoch": 16.574875798438608, + "grad_norm": 0.19734224677085876, + "learning_rate": 8.343222143364088e-05, + "loss": 0.02915046215057373, + "step": 116770 + }, + { + "epoch": 16.576295244854506, + "grad_norm": 0.09668063372373581, + "learning_rate": 8.343080198722498e-05, + "loss": 0.026794278621673585, + "step": 116780 + }, + { + "epoch": 16.577714691270405, + "grad_norm": 0.08512010425329208, + "learning_rate": 8.342938254080909e-05, + "loss": 0.009084032475948333, + "step": 116790 + }, + { + "epoch": 16.579134137686303, + "grad_norm": 0.07173248380422592, + "learning_rate": 8.342796309439319e-05, + "loss": 0.009995944797992706, + "step": 116800 + }, + { + "epoch": 16.5805535841022, + "grad_norm": 0.018395403400063515, + "learning_rate": 8.342668559261887e-05, + "loss": 0.049830347299575806, + "step": 116810 + }, + { + "epoch": 16.5819730305181, + "grad_norm": 0.7037346363067627, + "learning_rate": 8.342526614620299e-05, + "loss": 0.011580074578523636, + "step": 116820 + }, + { + "epoch": 16.583392476933994, + "grad_norm": 0.91822350025177, + "learning_rate": 8.34238466997871e-05, + "loss": 0.003994914516806602, + "step": 116830 + }, + { + "epoch": 16.584811923349893, + "grad_norm": 0.8074606657028198, + "learning_rate": 8.34224272533712e-05, + "loss": 
0.012307696789503098, + "step": 116840 + }, + { + "epoch": 16.58623136976579, + "grad_norm": 2.2989635467529297, + "learning_rate": 8.342100780695529e-05, + "loss": 0.013350567221641541, + "step": 116850 + }, + { + "epoch": 16.58765081618169, + "grad_norm": 1.6210821866989136, + "learning_rate": 8.341958836053939e-05, + "loss": 0.01092873364686966, + "step": 116860 + }, + { + "epoch": 16.589070262597588, + "grad_norm": 1.5248768329620361, + "learning_rate": 8.34181689141235e-05, + "loss": 0.014216583967208863, + "step": 116870 + }, + { + "epoch": 16.590489709013486, + "grad_norm": 0.028146639466285706, + "learning_rate": 8.34167494677076e-05, + "loss": 0.055502718687057494, + "step": 116880 + }, + { + "epoch": 16.591909155429384, + "grad_norm": 7.138301372528076, + "learning_rate": 8.341533002129171e-05, + "loss": 0.0446750670671463, + "step": 116890 + }, + { + "epoch": 16.59332860184528, + "grad_norm": 10.649222373962402, + "learning_rate": 8.341391057487579e-05, + "loss": 0.019469711184501647, + "step": 116900 + }, + { + "epoch": 16.594748048261177, + "grad_norm": 4.18618106842041, + "learning_rate": 8.34124911284599e-05, + "loss": 0.020049738883972167, + "step": 116910 + }, + { + "epoch": 16.596167494677076, + "grad_norm": 0.057533327490091324, + "learning_rate": 8.341107168204401e-05, + "loss": 0.0431816428899765, + "step": 116920 + }, + { + "epoch": 16.597586941092974, + "grad_norm": 0.42292356491088867, + "learning_rate": 8.340965223562811e-05, + "loss": 0.0034976493567228316, + "step": 116930 + }, + { + "epoch": 16.599006387508872, + "grad_norm": 0.04697134718298912, + "learning_rate": 8.340823278921222e-05, + "loss": 0.004483159631490707, + "step": 116940 + }, + { + "epoch": 16.60042583392477, + "grad_norm": 0.044272731989622116, + "learning_rate": 8.34068133427963e-05, + "loss": 0.038423961400985716, + "step": 116950 + }, + { + "epoch": 16.60184528034067, + "grad_norm": 3.4408552646636963, + "learning_rate": 8.340539389638042e-05, + "loss": 
0.07658275365829467, + "step": 116960 + }, + { + "epoch": 16.603264726756564, + "grad_norm": 12.716185569763184, + "learning_rate": 8.340397444996451e-05, + "loss": 0.04419266879558563, + "step": 116970 + }, + { + "epoch": 16.604684173172462, + "grad_norm": 5.369838714599609, + "learning_rate": 8.340255500354862e-05, + "loss": 0.02143881618976593, + "step": 116980 + }, + { + "epoch": 16.60610361958836, + "grad_norm": 0.009581176564097404, + "learning_rate": 8.340113555713272e-05, + "loss": 0.011463432759046554, + "step": 116990 + }, + { + "epoch": 16.60752306600426, + "grad_norm": 13.691011428833008, + "learning_rate": 8.339971611071682e-05, + "loss": 0.010283533483743668, + "step": 117000 + }, + { + "epoch": 16.60752306600426, + "eval_accuracy": 0.982895657150124, + "eval_loss": 0.06755528599023819, + "eval_runtime": 33.6337, + "eval_samples_per_second": 467.596, + "eval_steps_per_second": 14.628, + "step": 117000 + }, + { + "epoch": 16.608942512420157, + "grad_norm": 0.05318843573331833, + "learning_rate": 8.339829666430092e-05, + "loss": 0.00797543078660965, + "step": 117010 + }, + { + "epoch": 16.610361958836055, + "grad_norm": 8.886881828308105, + "learning_rate": 8.339687721788503e-05, + "loss": 0.04408339262008667, + "step": 117020 + }, + { + "epoch": 16.611781405251953, + "grad_norm": 12.320178031921387, + "learning_rate": 8.339545777146914e-05, + "loss": 0.04468807280063629, + "step": 117030 + }, + { + "epoch": 16.613200851667848, + "grad_norm": 0.2367558479309082, + "learning_rate": 8.339403832505324e-05, + "loss": 0.06928732991218567, + "step": 117040 + }, + { + "epoch": 16.614620298083747, + "grad_norm": 0.6578727960586548, + "learning_rate": 8.339261887863735e-05, + "loss": 0.006959295272827149, + "step": 117050 + }, + { + "epoch": 16.616039744499645, + "grad_norm": 4.500197410583496, + "learning_rate": 8.339119943222143e-05, + "loss": 0.008282274007797241, + "step": 117060 + }, + { + "epoch": 16.617459190915543, + "grad_norm": 0.07957390695810318, + 
"learning_rate": 8.338977998580554e-05, + "loss": 0.018632544577121733, + "step": 117070 + }, + { + "epoch": 16.61887863733144, + "grad_norm": 0.03085346519947052, + "learning_rate": 8.338836053938964e-05, + "loss": 0.018058757483959197, + "step": 117080 + }, + { + "epoch": 16.62029808374734, + "grad_norm": 2.6925551891326904, + "learning_rate": 8.338694109297375e-05, + "loss": 0.03852428793907166, + "step": 117090 + }, + { + "epoch": 16.621717530163238, + "grad_norm": 1.5794312953948975, + "learning_rate": 8.338552164655785e-05, + "loss": 0.012525925040245056, + "step": 117100 + }, + { + "epoch": 16.623136976579133, + "grad_norm": 0.02748379483819008, + "learning_rate": 8.338410220014194e-05, + "loss": 0.03372379541397095, + "step": 117110 + }, + { + "epoch": 16.62455642299503, + "grad_norm": 7.666720390319824, + "learning_rate": 8.338268275372606e-05, + "loss": 0.015265947580337525, + "step": 117120 + }, + { + "epoch": 16.62597586941093, + "grad_norm": 3.351346254348755, + "learning_rate": 8.338126330731015e-05, + "loss": 0.03214417695999146, + "step": 117130 + }, + { + "epoch": 16.627395315826828, + "grad_norm": 2.961798667907715, + "learning_rate": 8.337984386089426e-05, + "loss": 0.0034553803503513335, + "step": 117140 + }, + { + "epoch": 16.628814762242726, + "grad_norm": 0.037895407527685165, + "learning_rate": 8.337842441447836e-05, + "loss": 0.0046453546732664105, + "step": 117150 + }, + { + "epoch": 16.630234208658624, + "grad_norm": 0.02167833223938942, + "learning_rate": 8.337700496806246e-05, + "loss": 0.03771307468414307, + "step": 117160 + }, + { + "epoch": 16.631653655074523, + "grad_norm": 0.37683844566345215, + "learning_rate": 8.337558552164656e-05, + "loss": 0.014065866172313691, + "step": 117170 + }, + { + "epoch": 16.633073101490417, + "grad_norm": 0.1941373646259308, + "learning_rate": 8.337416607523067e-05, + "loss": 0.026754018664360047, + "step": 117180 + }, + { + "epoch": 16.634492547906316, + "grad_norm": 0.07540564239025116, + 
"learning_rate": 8.337274662881476e-05, + "loss": 0.029977530241012573, + "step": 117190 + }, + { + "epoch": 16.635911994322214, + "grad_norm": 0.07340825349092484, + "learning_rate": 8.337132718239888e-05, + "loss": 0.011456934362649917, + "step": 117200 + }, + { + "epoch": 16.637331440738112, + "grad_norm": 3.494420289993286, + "learning_rate": 8.336990773598297e-05, + "loss": 0.022322097420692445, + "step": 117210 + }, + { + "epoch": 16.63875088715401, + "grad_norm": 1.0809097290039062, + "learning_rate": 8.336848828956707e-05, + "loss": 0.01884945034980774, + "step": 117220 + }, + { + "epoch": 16.64017033356991, + "grad_norm": 3.570888042449951, + "learning_rate": 8.336706884315118e-05, + "loss": 0.009533677250146866, + "step": 117230 + }, + { + "epoch": 16.641589779985807, + "grad_norm": 0.03743152692914009, + "learning_rate": 8.336564939673528e-05, + "loss": 0.02686296999454498, + "step": 117240 + }, + { + "epoch": 16.643009226401702, + "grad_norm": 0.36875879764556885, + "learning_rate": 8.336422995031939e-05, + "loss": 0.03535675704479217, + "step": 117250 + }, + { + "epoch": 16.6444286728176, + "grad_norm": 1.0940033197402954, + "learning_rate": 8.336281050390347e-05, + "loss": 0.0029364045709371566, + "step": 117260 + }, + { + "epoch": 16.6458481192335, + "grad_norm": 4.938434600830078, + "learning_rate": 8.336139105748758e-05, + "loss": 0.023037466406822204, + "step": 117270 + }, + { + "epoch": 16.647267565649397, + "grad_norm": 0.1725301295518875, + "learning_rate": 8.335997161107168e-05, + "loss": 0.05853109955787659, + "step": 117280 + }, + { + "epoch": 16.648687012065295, + "grad_norm": 0.9875553846359253, + "learning_rate": 8.335855216465579e-05, + "loss": 0.028866416215896605, + "step": 117290 + }, + { + "epoch": 16.650106458481194, + "grad_norm": 0.014165399596095085, + "learning_rate": 8.335713271823989e-05, + "loss": 0.04174271821975708, + "step": 117300 + }, + { + "epoch": 16.651525904897092, + "grad_norm": 3.5790436267852783, + 
"learning_rate": 8.335571327182399e-05, + "loss": 0.015095795691013335, + "step": 117310 + }, + { + "epoch": 16.652945351312987, + "grad_norm": 0.6211684346199036, + "learning_rate": 8.33542938254081e-05, + "loss": 0.011040177941322327, + "step": 117320 + }, + { + "epoch": 16.654364797728885, + "grad_norm": 0.011647253297269344, + "learning_rate": 8.33528743789922e-05, + "loss": 0.03260062634944916, + "step": 117330 + }, + { + "epoch": 16.655784244144783, + "grad_norm": 3.1049857139587402, + "learning_rate": 8.33514549325763e-05, + "loss": 0.04729237854480743, + "step": 117340 + }, + { + "epoch": 16.65720369056068, + "grad_norm": 2.3427257537841797, + "learning_rate": 8.33500354861604e-05, + "loss": 0.04432548880577088, + "step": 117350 + }, + { + "epoch": 16.65862313697658, + "grad_norm": 5.978562355041504, + "learning_rate": 8.33486160397445e-05, + "loss": 0.03646363019943237, + "step": 117360 + }, + { + "epoch": 16.660042583392478, + "grad_norm": 12.11841106414795, + "learning_rate": 8.33471965933286e-05, + "loss": 0.0496336430311203, + "step": 117370 + }, + { + "epoch": 16.661462029808376, + "grad_norm": 11.358500480651855, + "learning_rate": 8.334577714691271e-05, + "loss": 0.03017066717147827, + "step": 117380 + }, + { + "epoch": 16.66288147622427, + "grad_norm": 0.1127539798617363, + "learning_rate": 8.33443577004968e-05, + "loss": 0.02873893976211548, + "step": 117390 + }, + { + "epoch": 16.66430092264017, + "grad_norm": 0.4423753023147583, + "learning_rate": 8.334293825408092e-05, + "loss": 0.004543831199407577, + "step": 117400 + }, + { + "epoch": 16.665720369056068, + "grad_norm": 0.3045527935028076, + "learning_rate": 8.334151880766501e-05, + "loss": 0.005070832371711731, + "step": 117410 + }, + { + "epoch": 16.667139815471966, + "grad_norm": 1.01546049118042, + "learning_rate": 8.334009936124911e-05, + "loss": 0.020494504272937773, + "step": 117420 + }, + { + "epoch": 16.668559261887864, + "grad_norm": 0.43640536069869995, + "learning_rate": 
8.333867991483322e-05, + "loss": 0.0030459832400083543, + "step": 117430 + }, + { + "epoch": 16.669978708303763, + "grad_norm": 4.950283527374268, + "learning_rate": 8.333726046841732e-05, + "loss": 0.05004417896270752, + "step": 117440 + }, + { + "epoch": 16.67139815471966, + "grad_norm": 8.34631061553955, + "learning_rate": 8.333584102200143e-05, + "loss": 0.009309016168117523, + "step": 117450 + }, + { + "epoch": 16.672817601135556, + "grad_norm": 1.3374550342559814, + "learning_rate": 8.333442157558553e-05, + "loss": 0.013670036196708679, + "step": 117460 + }, + { + "epoch": 16.674237047551454, + "grad_norm": 4.7788801193237305, + "learning_rate": 8.333300212916963e-05, + "loss": 0.03752616047859192, + "step": 117470 + }, + { + "epoch": 16.675656493967352, + "grad_norm": 0.3163778483867645, + "learning_rate": 8.333158268275372e-05, + "loss": 0.017105105519294738, + "step": 117480 + }, + { + "epoch": 16.67707594038325, + "grad_norm": 0.020877504721283913, + "learning_rate": 8.333016323633783e-05, + "loss": 0.03657255172729492, + "step": 117490 + }, + { + "epoch": 16.67849538679915, + "grad_norm": 0.011093349196016788, + "learning_rate": 8.332874378992193e-05, + "loss": 0.009079907834529877, + "step": 117500 + }, + { + "epoch": 16.67849538679915, + "eval_accuracy": 0.9876645259744389, + "eval_loss": 0.04434996098279953, + "eval_runtime": 32.6727, + "eval_samples_per_second": 481.35, + "eval_steps_per_second": 15.058, + "step": 117500 + }, + { + "epoch": 16.679914833215047, + "grad_norm": 0.21959349513053894, + "learning_rate": 8.332732434350604e-05, + "loss": 0.010873357951641082, + "step": 117510 + }, + { + "epoch": 16.681334279630946, + "grad_norm": 0.028821425512433052, + "learning_rate": 8.332590489709014e-05, + "loss": 0.01830779165029526, + "step": 117520 + }, + { + "epoch": 16.68275372604684, + "grad_norm": 0.00650831637904048, + "learning_rate": 8.332448545067424e-05, + "loss": 0.02057510018348694, + "step": 117530 + }, + { + "epoch": 16.68417317246274, + 
"grad_norm": 0.07420149445533752, + "learning_rate": 8.332306600425835e-05, + "loss": 0.02898067533969879, + "step": 117540 + }, + { + "epoch": 16.685592618878637, + "grad_norm": 0.13417570292949677, + "learning_rate": 8.332164655784245e-05, + "loss": 0.04370847940444946, + "step": 117550 + }, + { + "epoch": 16.687012065294535, + "grad_norm": 0.3044644892215729, + "learning_rate": 8.332022711142656e-05, + "loss": 0.040468630194664, + "step": 117560 + }, + { + "epoch": 16.688431511710434, + "grad_norm": 0.005468321498483419, + "learning_rate": 8.331880766501064e-05, + "loss": 0.03988224864006042, + "step": 117570 + }, + { + "epoch": 16.689850958126332, + "grad_norm": 1.0364768505096436, + "learning_rate": 8.331738821859475e-05, + "loss": 0.0045741118490695955, + "step": 117580 + }, + { + "epoch": 16.69127040454223, + "grad_norm": 9.827880859375, + "learning_rate": 8.331596877217885e-05, + "loss": 0.020103031396865846, + "step": 117590 + }, + { + "epoch": 16.692689850958125, + "grad_norm": 4.745016098022461, + "learning_rate": 8.331454932576296e-05, + "loss": 0.04226047396659851, + "step": 117600 + }, + { + "epoch": 16.694109297374023, + "grad_norm": 0.5448516607284546, + "learning_rate": 8.331312987934706e-05, + "loss": 0.016260065138339996, + "step": 117610 + }, + { + "epoch": 16.69552874378992, + "grad_norm": 0.011220871470868587, + "learning_rate": 8.331171043293115e-05, + "loss": 0.020905345678329468, + "step": 117620 + }, + { + "epoch": 16.69694819020582, + "grad_norm": 0.45287594199180603, + "learning_rate": 8.331029098651527e-05, + "loss": 0.01711665540933609, + "step": 117630 + }, + { + "epoch": 16.698367636621718, + "grad_norm": 3.3729403018951416, + "learning_rate": 8.330887154009936e-05, + "loss": 0.01988566517829895, + "step": 117640 + }, + { + "epoch": 16.699787083037616, + "grad_norm": 0.024243533611297607, + "learning_rate": 8.330745209368347e-05, + "loss": 0.0944155514240265, + "step": 117650 + }, + { + "epoch": 16.701206529453515, + "grad_norm": 
3.3614089488983154, + "learning_rate": 8.330603264726757e-05, + "loss": 0.013256210088729858, + "step": 117660 + }, + { + "epoch": 16.70262597586941, + "grad_norm": 0.16608507931232452, + "learning_rate": 8.330461320085167e-05, + "loss": 0.029009857773780824, + "step": 117670 + }, + { + "epoch": 16.704045422285308, + "grad_norm": 4.247665882110596, + "learning_rate": 8.330319375443577e-05, + "loss": 0.03191390335559845, + "step": 117680 + }, + { + "epoch": 16.705464868701206, + "grad_norm": 4.687053203582764, + "learning_rate": 8.330177430801988e-05, + "loss": 0.07247737050056458, + "step": 117690 + }, + { + "epoch": 16.706884315117104, + "grad_norm": 1.3678719997406006, + "learning_rate": 8.330035486160397e-05, + "loss": 0.016752901673316955, + "step": 117700 + }, + { + "epoch": 16.708303761533003, + "grad_norm": 2.391916275024414, + "learning_rate": 8.329893541518809e-05, + "loss": 0.015759438276290894, + "step": 117710 + }, + { + "epoch": 16.7097232079489, + "grad_norm": 0.06527828425168991, + "learning_rate": 8.329751596877218e-05, + "loss": 0.043265300989151004, + "step": 117720 + }, + { + "epoch": 16.7111426543648, + "grad_norm": 0.5923253297805786, + "learning_rate": 8.329609652235628e-05, + "loss": 0.02035187929868698, + "step": 117730 + }, + { + "epoch": 16.712562100780694, + "grad_norm": 9.604898452758789, + "learning_rate": 8.329467707594039e-05, + "loss": 0.01412811279296875, + "step": 117740 + }, + { + "epoch": 16.713981547196592, + "grad_norm": 2.5967063903808594, + "learning_rate": 8.329325762952449e-05, + "loss": 0.005281927064061165, + "step": 117750 + }, + { + "epoch": 16.71540099361249, + "grad_norm": 0.33082273602485657, + "learning_rate": 8.32918381831086e-05, + "loss": 0.017080453038215638, + "step": 117760 + }, + { + "epoch": 16.71682044002839, + "grad_norm": 0.6532198786735535, + "learning_rate": 8.32904187366927e-05, + "loss": 0.029032516479492187, + "step": 117770 + }, + { + "epoch": 16.718239886444287, + "grad_norm": 0.05124543979763985, 
+ "learning_rate": 8.32889992902768e-05, + "loss": 0.013310250639915467, + "step": 117780 + }, + { + "epoch": 16.719659332860186, + "grad_norm": 6.831470966339111, + "learning_rate": 8.328757984386089e-05, + "loss": 0.007946215569972992, + "step": 117790 + }, + { + "epoch": 16.721078779276084, + "grad_norm": 9.196264266967773, + "learning_rate": 8.3286160397445e-05, + "loss": 0.04549582004547119, + "step": 117800 + }, + { + "epoch": 16.72249822569198, + "grad_norm": 0.03787418454885483, + "learning_rate": 8.32847409510291e-05, + "loss": 0.018013326823711394, + "step": 117810 + }, + { + "epoch": 16.723917672107877, + "grad_norm": 0.1705215722322464, + "learning_rate": 8.328332150461321e-05, + "loss": 0.009039975702762604, + "step": 117820 + }, + { + "epoch": 16.725337118523775, + "grad_norm": 0.023820318281650543, + "learning_rate": 8.328190205819731e-05, + "loss": 0.010485945641994477, + "step": 117830 + }, + { + "epoch": 16.726756564939674, + "grad_norm": 0.03546424210071564, + "learning_rate": 8.32804826117814e-05, + "loss": 0.04599918127059936, + "step": 117840 + }, + { + "epoch": 16.728176011355572, + "grad_norm": 0.09232477098703384, + "learning_rate": 8.327906316536552e-05, + "loss": 0.005126936361193657, + "step": 117850 + }, + { + "epoch": 16.72959545777147, + "grad_norm": 0.22951701283454895, + "learning_rate": 8.327764371894961e-05, + "loss": 0.055786031484603885, + "step": 117860 + }, + { + "epoch": 16.73101490418737, + "grad_norm": 0.005998775362968445, + "learning_rate": 8.327622427253372e-05, + "loss": 0.04999669194221497, + "step": 117870 + }, + { + "epoch": 16.732434350603263, + "grad_norm": 0.004882345907390118, + "learning_rate": 8.327480482611781e-05, + "loss": 0.011063285171985626, + "step": 117880 + }, + { + "epoch": 16.73385379701916, + "grad_norm": 0.29720327258110046, + "learning_rate": 8.327338537970192e-05, + "loss": 0.0025852181017398832, + "step": 117890 + }, + { + "epoch": 16.73527324343506, + "grad_norm": 0.08615440875291824, + 
"learning_rate": 8.327196593328602e-05, + "loss": 0.0022923082113265993, + "step": 117900 + }, + { + "epoch": 16.73669268985096, + "grad_norm": 0.2930206060409546, + "learning_rate": 8.327054648687013e-05, + "loss": 0.028328916430473326, + "step": 117910 + }, + { + "epoch": 16.738112136266857, + "grad_norm": 0.052065297961235046, + "learning_rate": 8.326912704045423e-05, + "loss": 0.006777378171682358, + "step": 117920 + }, + { + "epoch": 16.739531582682755, + "grad_norm": 7.862561225891113, + "learning_rate": 8.326770759403832e-05, + "loss": 0.01529969722032547, + "step": 117930 + }, + { + "epoch": 16.740951029098653, + "grad_norm": 0.03088100254535675, + "learning_rate": 8.326628814762243e-05, + "loss": 0.053841644525527955, + "step": 117940 + }, + { + "epoch": 16.742370475514548, + "grad_norm": 3.4934494495391846, + "learning_rate": 8.326486870120653e-05, + "loss": 0.014754070341587067, + "step": 117950 + }, + { + "epoch": 16.743789921930446, + "grad_norm": 17.251235961914062, + "learning_rate": 8.326344925479064e-05, + "loss": 0.051452040672302246, + "step": 117960 + }, + { + "epoch": 16.745209368346345, + "grad_norm": 5.5792951583862305, + "learning_rate": 8.326202980837474e-05, + "loss": 0.03119921386241913, + "step": 117970 + }, + { + "epoch": 16.746628814762243, + "grad_norm": 0.1415969580411911, + "learning_rate": 8.326061036195884e-05, + "loss": 0.009391264617443084, + "step": 117980 + }, + { + "epoch": 16.74804826117814, + "grad_norm": 0.07763611525297165, + "learning_rate": 8.325919091554293e-05, + "loss": 0.022009353339672088, + "step": 117990 + }, + { + "epoch": 16.74946770759404, + "grad_norm": 0.29674217104911804, + "learning_rate": 8.325777146912704e-05, + "loss": 0.01453334093093872, + "step": 118000 + }, + { + "epoch": 16.74946770759404, + "eval_accuracy": 0.984103770585617, + "eval_loss": 0.06307372450828552, + "eval_runtime": 34.4236, + "eval_samples_per_second": 456.866, + "eval_steps_per_second": 14.293, + "step": 118000 + }, + { + "epoch": 
16.750887154009938, + "grad_norm": 1.6057296991348267, + "learning_rate": 8.325635202271114e-05, + "loss": 0.010459934175014497, + "step": 118010 + }, + { + "epoch": 16.752306600425833, + "grad_norm": 0.00853043794631958, + "learning_rate": 8.325493257629525e-05, + "loss": 0.017493090033531188, + "step": 118020 + }, + { + "epoch": 16.75372604684173, + "grad_norm": 0.8278746604919434, + "learning_rate": 8.325351312987935e-05, + "loss": 0.024293732643127442, + "step": 118030 + }, + { + "epoch": 16.75514549325763, + "grad_norm": 0.06697604060173035, + "learning_rate": 8.325209368346345e-05, + "loss": 0.014129532873630524, + "step": 118040 + }, + { + "epoch": 16.756564939673527, + "grad_norm": 1.1142808198928833, + "learning_rate": 8.325067423704756e-05, + "loss": 0.008899647742509842, + "step": 118050 + }, + { + "epoch": 16.757984386089426, + "grad_norm": 0.6534438729286194, + "learning_rate": 8.324925479063166e-05, + "loss": 0.0418839693069458, + "step": 118060 + }, + { + "epoch": 16.759403832505324, + "grad_norm": 0.8519339561462402, + "learning_rate": 8.324783534421577e-05, + "loss": 0.014938108623027802, + "step": 118070 + }, + { + "epoch": 16.760823278921222, + "grad_norm": 0.3277834951877594, + "learning_rate": 8.324641589779985e-05, + "loss": 0.02368403822183609, + "step": 118080 + }, + { + "epoch": 16.762242725337117, + "grad_norm": 0.11566946655511856, + "learning_rate": 8.324499645138396e-05, + "loss": 0.04180347919464111, + "step": 118090 + }, + { + "epoch": 16.763662171753015, + "grad_norm": 1.1494996547698975, + "learning_rate": 8.324357700496806e-05, + "loss": 0.0094342440366745, + "step": 118100 + }, + { + "epoch": 16.765081618168914, + "grad_norm": 0.013919823803007603, + "learning_rate": 8.324215755855217e-05, + "loss": 0.06293154954910278, + "step": 118110 + }, + { + "epoch": 16.766501064584812, + "grad_norm": 4.400853157043457, + "learning_rate": 8.324073811213627e-05, + "loss": 0.027945888042449952, + "step": 118120 + }, + { + "epoch": 
16.76792051100071, + "grad_norm": 2.673232078552246, + "learning_rate": 8.323931866572038e-05, + "loss": 0.02772565484046936, + "step": 118130 + }, + { + "epoch": 16.76933995741661, + "grad_norm": 0.4370087683200836, + "learning_rate": 8.323789921930448e-05, + "loss": 0.04705857038497925, + "step": 118140 + }, + { + "epoch": 16.770759403832507, + "grad_norm": 0.4404630661010742, + "learning_rate": 8.323647977288857e-05, + "loss": 0.01638329178094864, + "step": 118150 + }, + { + "epoch": 16.7721788502484, + "grad_norm": 14.777122497558594, + "learning_rate": 8.323506032647268e-05, + "loss": 0.015768852829933167, + "step": 118160 + }, + { + "epoch": 16.7735982966643, + "grad_norm": 0.024510599672794342, + "learning_rate": 8.323364088005678e-05, + "loss": 0.03490220904350281, + "step": 118170 + }, + { + "epoch": 16.7750177430802, + "grad_norm": 0.20726227760314941, + "learning_rate": 8.323222143364089e-05, + "loss": 0.017764410376548766, + "step": 118180 + }, + { + "epoch": 16.776437189496097, + "grad_norm": 0.646827757358551, + "learning_rate": 8.323080198722498e-05, + "loss": 0.006039583683013916, + "step": 118190 + }, + { + "epoch": 16.777856635911995, + "grad_norm": 7.485454082489014, + "learning_rate": 8.322938254080909e-05, + "loss": 0.014593130350112915, + "step": 118200 + }, + { + "epoch": 16.779276082327893, + "grad_norm": 0.013658017851412296, + "learning_rate": 8.322796309439318e-05, + "loss": 0.0015197057276964188, + "step": 118210 + }, + { + "epoch": 16.78069552874379, + "grad_norm": 0.10143446922302246, + "learning_rate": 8.32265436479773e-05, + "loss": 0.036323907971382144, + "step": 118220 + }, + { + "epoch": 16.782114975159686, + "grad_norm": 0.6719693541526794, + "learning_rate": 8.32251242015614e-05, + "loss": 0.013683655858039856, + "step": 118230 + }, + { + "epoch": 16.783534421575585, + "grad_norm": 2.3426082134246826, + "learning_rate": 8.322370475514549e-05, + "loss": 0.028924247622489928, + "step": 118240 + }, + { + "epoch": 
16.784953867991483, + "grad_norm": 18.29952621459961, + "learning_rate": 8.32222853087296e-05, + "loss": 0.030454546213150024, + "step": 118250 + }, + { + "epoch": 16.78637331440738, + "grad_norm": 0.7703476548194885, + "learning_rate": 8.32208658623137e-05, + "loss": 0.021138817071914673, + "step": 118260 + }, + { + "epoch": 16.78779276082328, + "grad_norm": 0.6782881617546082, + "learning_rate": 8.321944641589781e-05, + "loss": 0.05559571981430054, + "step": 118270 + }, + { + "epoch": 16.789212207239178, + "grad_norm": 3.4025721549987793, + "learning_rate": 8.32180269694819e-05, + "loss": 0.03414316773414612, + "step": 118280 + }, + { + "epoch": 16.790631653655076, + "grad_norm": 9.724266052246094, + "learning_rate": 8.3216607523066e-05, + "loss": 0.014724119007587433, + "step": 118290 + }, + { + "epoch": 16.79205110007097, + "grad_norm": 0.03437022492289543, + "learning_rate": 8.32151880766501e-05, + "loss": 0.025532713532447814, + "step": 118300 + }, + { + "epoch": 16.79347054648687, + "grad_norm": 4.5364766120910645, + "learning_rate": 8.321376863023421e-05, + "loss": 0.009097173064947128, + "step": 118310 + }, + { + "epoch": 16.794889992902768, + "grad_norm": 0.1253851056098938, + "learning_rate": 8.321234918381832e-05, + "loss": 0.014451992511749268, + "step": 118320 + }, + { + "epoch": 16.796309439318666, + "grad_norm": 1.6803902387619019, + "learning_rate": 8.321092973740242e-05, + "loss": 0.039790353178977965, + "step": 118330 + }, + { + "epoch": 16.797728885734564, + "grad_norm": 10.777732849121094, + "learning_rate": 8.320951029098652e-05, + "loss": 0.026776975393295287, + "step": 118340 + }, + { + "epoch": 16.799148332150462, + "grad_norm": 0.556053876876831, + "learning_rate": 8.320809084457062e-05, + "loss": 0.04346528351306915, + "step": 118350 + }, + { + "epoch": 16.80056777856636, + "grad_norm": 1.746227741241455, + "learning_rate": 8.320667139815473e-05, + "loss": 0.017362989485263824, + "step": 118360 + }, + { + "epoch": 16.801987224982255, + 
"grad_norm": 0.28359484672546387, + "learning_rate": 8.320525195173882e-05, + "loss": 0.05255056619644165, + "step": 118370 + }, + { + "epoch": 16.803406671398154, + "grad_norm": 0.07087396085262299, + "learning_rate": 8.320383250532293e-05, + "loss": 0.0291361004114151, + "step": 118380 + }, + { + "epoch": 16.804826117814052, + "grad_norm": 6.125046730041504, + "learning_rate": 8.320241305890702e-05, + "loss": 0.0548758864402771, + "step": 118390 + }, + { + "epoch": 16.80624556422995, + "grad_norm": 0.7101525068283081, + "learning_rate": 8.320099361249113e-05, + "loss": 0.015115344524383545, + "step": 118400 + }, + { + "epoch": 16.80766501064585, + "grad_norm": 1.2516734600067139, + "learning_rate": 8.319957416607524e-05, + "loss": 0.03866508603096008, + "step": 118410 + }, + { + "epoch": 16.809084457061747, + "grad_norm": 0.5947251319885254, + "learning_rate": 8.319815471965934e-05, + "loss": 0.025516587495803832, + "step": 118420 + }, + { + "epoch": 16.810503903477645, + "grad_norm": 0.09165836125612259, + "learning_rate": 8.319673527324345e-05, + "loss": 0.018566188216209412, + "step": 118430 + }, + { + "epoch": 16.81192334989354, + "grad_norm": 0.45714905858039856, + "learning_rate": 8.319531582682755e-05, + "loss": 0.026043158769607545, + "step": 118440 + }, + { + "epoch": 16.81334279630944, + "grad_norm": 0.05650230124592781, + "learning_rate": 8.319389638041164e-05, + "loss": 0.0274466872215271, + "step": 118450 + }, + { + "epoch": 16.814762242725337, + "grad_norm": 0.006217160262167454, + "learning_rate": 8.319247693399574e-05, + "loss": 0.03689888119697571, + "step": 118460 + }, + { + "epoch": 16.816181689141235, + "grad_norm": 1.209546685218811, + "learning_rate": 8.319105748757985e-05, + "loss": 0.004774202033877373, + "step": 118470 + }, + { + "epoch": 16.817601135557133, + "grad_norm": 1.517468810081482, + "learning_rate": 8.318963804116395e-05, + "loss": 0.016903454065322877, + "step": 118480 + }, + { + "epoch": 16.81902058197303, + "grad_norm": 
0.11526894569396973, + "learning_rate": 8.318821859474806e-05, + "loss": 0.039901772141456605, + "step": 118490 + }, + { + "epoch": 16.82044002838893, + "grad_norm": 0.47346436977386475, + "learning_rate": 8.318679914833216e-05, + "loss": 0.04827032089233398, + "step": 118500 + }, + { + "epoch": 16.82044002838893, + "eval_accuracy": 0.9791441470083296, + "eval_loss": 0.08274804055690765, + "eval_runtime": 32.5356, + "eval_samples_per_second": 483.378, + "eval_steps_per_second": 15.122, + "step": 118500 + }, + { + "epoch": 16.821859474804825, + "grad_norm": 0.843011736869812, + "learning_rate": 8.318537970191625e-05, + "loss": 0.029598337411880494, + "step": 118510 + }, + { + "epoch": 16.823278921220723, + "grad_norm": 7.9473371505737305, + "learning_rate": 8.318396025550037e-05, + "loss": 0.02198069393634796, + "step": 118520 + }, + { + "epoch": 16.82469836763662, + "grad_norm": 0.07339850068092346, + "learning_rate": 8.318254080908446e-05, + "loss": 0.011041966825723648, + "step": 118530 + }, + { + "epoch": 16.82611781405252, + "grad_norm": 0.6384608745574951, + "learning_rate": 8.318112136266857e-05, + "loss": 0.04286980330944061, + "step": 118540 + }, + { + "epoch": 16.827537260468418, + "grad_norm": 10.388303756713867, + "learning_rate": 8.317970191625266e-05, + "loss": 0.032366585731506345, + "step": 118550 + }, + { + "epoch": 16.828956706884316, + "grad_norm": 0.1697257161140442, + "learning_rate": 8.317828246983677e-05, + "loss": 0.06269558072090149, + "step": 118560 + }, + { + "epoch": 16.830376153300215, + "grad_norm": 0.03282211720943451, + "learning_rate": 8.317686302342087e-05, + "loss": 0.005208947509527206, + "step": 118570 + }, + { + "epoch": 16.83179559971611, + "grad_norm": 0.026220278814435005, + "learning_rate": 8.317544357700498e-05, + "loss": 0.010369515419006348, + "step": 118580 + }, + { + "epoch": 16.833215046132008, + "grad_norm": 0.057037319988012314, + "learning_rate": 8.317402413058907e-05, + "loss": 0.05828549861907959, + "step": 118590 
+ }, + { + "epoch": 16.834634492547906, + "grad_norm": 0.006431225221604109, + "learning_rate": 8.317260468417317e-05, + "loss": 0.00878979116678238, + "step": 118600 + }, + { + "epoch": 16.836053938963804, + "grad_norm": 3.5843164920806885, + "learning_rate": 8.317118523775728e-05, + "loss": 0.007160958647727966, + "step": 118610 + }, + { + "epoch": 16.837473385379703, + "grad_norm": 8.680359840393066, + "learning_rate": 8.316976579134138e-05, + "loss": 0.04745030403137207, + "step": 118620 + }, + { + "epoch": 16.8388928317956, + "grad_norm": 0.028621600940823555, + "learning_rate": 8.316834634492549e-05, + "loss": 0.01384253203868866, + "step": 118630 + }, + { + "epoch": 16.8403122782115, + "grad_norm": 6.854605197906494, + "learning_rate": 8.316692689850959e-05, + "loss": 0.0036096815019845963, + "step": 118640 + }, + { + "epoch": 16.841731724627394, + "grad_norm": 0.07637093961238861, + "learning_rate": 8.316550745209369e-05, + "loss": 0.0033883821219205857, + "step": 118650 + }, + { + "epoch": 16.843151171043292, + "grad_norm": 0.10442734509706497, + "learning_rate": 8.316408800567778e-05, + "loss": 0.002956448122859001, + "step": 118660 + }, + { + "epoch": 16.84457061745919, + "grad_norm": 0.02588530443608761, + "learning_rate": 8.31626685592619e-05, + "loss": 0.03580034673213959, + "step": 118670 + }, + { + "epoch": 16.84599006387509, + "grad_norm": 0.3507205545902252, + "learning_rate": 8.316124911284599e-05, + "loss": 0.02349122762680054, + "step": 118680 + }, + { + "epoch": 16.847409510290987, + "grad_norm": 0.3434644937515259, + "learning_rate": 8.31598296664301e-05, + "loss": 0.01267555058002472, + "step": 118690 + }, + { + "epoch": 16.848828956706885, + "grad_norm": 6.556180000305176, + "learning_rate": 8.31584102200142e-05, + "loss": 0.006739020347595215, + "step": 118700 + }, + { + "epoch": 16.850248403122784, + "grad_norm": 0.01769874058663845, + "learning_rate": 8.31569907735983e-05, + "loss": 0.001453077420592308, + "step": 118710 + }, + { + 
"epoch": 16.85166784953868, + "grad_norm": 0.01593855768442154, + "learning_rate": 8.315557132718241e-05, + "loss": 0.02313033491373062, + "step": 118720 + }, + { + "epoch": 16.853087295954577, + "grad_norm": 0.7602181434631348, + "learning_rate": 8.31541518807665e-05, + "loss": 0.005456413701176643, + "step": 118730 + }, + { + "epoch": 16.854506742370475, + "grad_norm": 7.872560024261475, + "learning_rate": 8.315273243435062e-05, + "loss": 0.007511251419782638, + "step": 118740 + }, + { + "epoch": 16.855926188786373, + "grad_norm": 0.27230358123779297, + "learning_rate": 8.31513129879347e-05, + "loss": 0.02225078046321869, + "step": 118750 + }, + { + "epoch": 16.85734563520227, + "grad_norm": 1.2540390491485596, + "learning_rate": 8.314989354151881e-05, + "loss": 0.010367317497730255, + "step": 118760 + }, + { + "epoch": 16.85876508161817, + "grad_norm": 0.20370694994926453, + "learning_rate": 8.314847409510291e-05, + "loss": 0.007272961735725403, + "step": 118770 + }, + { + "epoch": 16.86018452803407, + "grad_norm": 0.1761959195137024, + "learning_rate": 8.314705464868702e-05, + "loss": 0.004762361571192741, + "step": 118780 + }, + { + "epoch": 16.861603974449963, + "grad_norm": 0.09174513816833496, + "learning_rate": 8.314563520227112e-05, + "loss": 0.014287218451499939, + "step": 118790 + }, + { + "epoch": 16.86302342086586, + "grad_norm": 1.3199766874313354, + "learning_rate": 8.314421575585523e-05, + "loss": 0.012278030812740325, + "step": 118800 + }, + { + "epoch": 16.86444286728176, + "grad_norm": 0.07524289190769196, + "learning_rate": 8.314279630943933e-05, + "loss": 0.0019908275455236433, + "step": 118810 + }, + { + "epoch": 16.865862313697658, + "grad_norm": 0.8612643480300903, + "learning_rate": 8.314137686302342e-05, + "loss": 0.010582181811332702, + "step": 118820 + }, + { + "epoch": 16.867281760113556, + "grad_norm": 0.04525861144065857, + "learning_rate": 8.313995741660753e-05, + "loss": 0.0019207149744033813, + "step": 118830 + }, + { + "epoch": 
16.868701206529455, + "grad_norm": 0.05441594496369362, + "learning_rate": 8.313853797019163e-05, + "loss": 0.0021228346973657607, + "step": 118840 + }, + { + "epoch": 16.870120652945353, + "grad_norm": 2.334911584854126, + "learning_rate": 8.313711852377574e-05, + "loss": 0.004405818507075309, + "step": 118850 + }, + { + "epoch": 16.871540099361248, + "grad_norm": 9.895645141601562, + "learning_rate": 8.313569907735983e-05, + "loss": 0.018842428922653198, + "step": 118860 + }, + { + "epoch": 16.872959545777146, + "grad_norm": 1.2320467233657837, + "learning_rate": 8.313427963094394e-05, + "loss": 0.022146573662757872, + "step": 118870 + }, + { + "epoch": 16.874378992193044, + "grad_norm": 0.09316658228635788, + "learning_rate": 8.313286018452803e-05, + "loss": 0.014387600123882294, + "step": 118880 + }, + { + "epoch": 16.875798438608943, + "grad_norm": 0.4701049029827118, + "learning_rate": 8.313144073811214e-05, + "loss": 0.02150590270757675, + "step": 118890 + }, + { + "epoch": 16.87721788502484, + "grad_norm": 0.18994688987731934, + "learning_rate": 8.313002129169624e-05, + "loss": 0.06476402878761292, + "step": 118900 + }, + { + "epoch": 16.87863733144074, + "grad_norm": 0.664664089679718, + "learning_rate": 8.312860184528034e-05, + "loss": 0.06443645358085633, + "step": 118910 + }, + { + "epoch": 16.880056777856637, + "grad_norm": 11.022554397583008, + "learning_rate": 8.312718239886445e-05, + "loss": 0.010656381398439408, + "step": 118920 + }, + { + "epoch": 16.881476224272532, + "grad_norm": 0.023860175162553787, + "learning_rate": 8.312576295244855e-05, + "loss": 0.009106354415416717, + "step": 118930 + }, + { + "epoch": 16.88289567068843, + "grad_norm": 10.94786548614502, + "learning_rate": 8.312434350603266e-05, + "loss": 0.026509439945220946, + "step": 118940 + }, + { + "epoch": 16.88431511710433, + "grad_norm": 4.937229156494141, + "learning_rate": 8.312292405961676e-05, + "loss": 0.018260036408901215, + "step": 118950 + }, + { + "epoch": 
16.885734563520227, + "grad_norm": 0.05745793506503105, + "learning_rate": 8.312150461320085e-05, + "loss": 0.09326847195625305, + "step": 118960 + }, + { + "epoch": 16.887154009936125, + "grad_norm": 6.062938690185547, + "learning_rate": 8.312008516678495e-05, + "loss": 0.044204458594322205, + "step": 118970 + }, + { + "epoch": 16.888573456352024, + "grad_norm": 8.90611743927002, + "learning_rate": 8.311866572036906e-05, + "loss": 0.018737226724624634, + "step": 118980 + }, + { + "epoch": 16.889992902767922, + "grad_norm": 0.03786252439022064, + "learning_rate": 8.311724627395316e-05, + "loss": 0.07848379015922546, + "step": 118990 + }, + { + "epoch": 16.891412349183817, + "grad_norm": 0.3104473054409027, + "learning_rate": 8.311582682753727e-05, + "loss": 0.01446186900138855, + "step": 119000 + }, + { + "epoch": 16.891412349183817, + "eval_accuracy": 0.9876009410567813, + "eval_loss": 0.045681487768888474, + "eval_runtime": 32.6866, + "eval_samples_per_second": 481.146, + "eval_steps_per_second": 15.052, + "step": 119000 + }, + { + "epoch": 16.892831795599715, + "grad_norm": 1.2676396369934082, + "learning_rate": 8.311440738112137e-05, + "loss": 0.02440263032913208, + "step": 119010 + }, + { + "epoch": 16.894251242015613, + "grad_norm": 0.10896630585193634, + "learning_rate": 8.311298793470546e-05, + "loss": 0.04403060376644134, + "step": 119020 + }, + { + "epoch": 16.89567068843151, + "grad_norm": 0.6196576952934265, + "learning_rate": 8.311156848828958e-05, + "loss": 0.01849239319562912, + "step": 119030 + }, + { + "epoch": 16.89709013484741, + "grad_norm": 4.4326090812683105, + "learning_rate": 8.311014904187367e-05, + "loss": 0.044346022605896, + "step": 119040 + }, + { + "epoch": 16.89850958126331, + "grad_norm": 0.3181878626346588, + "learning_rate": 8.310872959545778e-05, + "loss": 0.06927153468132019, + "step": 119050 + }, + { + "epoch": 16.899929027679207, + "grad_norm": 0.17974849045276642, + "learning_rate": 8.310731014904187e-05, + "loss": 
0.005778995156288147, + "step": 119060 + }, + { + "epoch": 16.9013484740951, + "grad_norm": 0.14680950343608856, + "learning_rate": 8.310589070262598e-05, + "loss": 0.030992990732192992, + "step": 119070 + }, + { + "epoch": 16.902767920511, + "grad_norm": 0.28509706258773804, + "learning_rate": 8.310447125621008e-05, + "loss": 0.008585918694734573, + "step": 119080 + }, + { + "epoch": 16.904187366926898, + "grad_norm": 1.0589877367019653, + "learning_rate": 8.310305180979419e-05, + "loss": 0.014909610152244568, + "step": 119090 + }, + { + "epoch": 16.905606813342796, + "grad_norm": 5.249059200286865, + "learning_rate": 8.310163236337828e-05, + "loss": 0.026605024933815002, + "step": 119100 + }, + { + "epoch": 16.907026259758695, + "grad_norm": 0.2209336906671524, + "learning_rate": 8.310021291696238e-05, + "loss": 0.03418198227882385, + "step": 119110 + }, + { + "epoch": 16.908445706174593, + "grad_norm": 0.33446890115737915, + "learning_rate": 8.309879347054649e-05, + "loss": 0.0033166084438562395, + "step": 119120 + }, + { + "epoch": 16.90986515259049, + "grad_norm": 3.7871241569519043, + "learning_rate": 8.309737402413059e-05, + "loss": 0.06138277053833008, + "step": 119130 + }, + { + "epoch": 16.911284599006386, + "grad_norm": 1.6204237937927246, + "learning_rate": 8.30959545777147e-05, + "loss": 0.027744191884994506, + "step": 119140 + }, + { + "epoch": 16.912704045422284, + "grad_norm": 0.0447792150080204, + "learning_rate": 8.30945351312988e-05, + "loss": 0.022081327438354493, + "step": 119150 + }, + { + "epoch": 16.914123491838183, + "grad_norm": 1.4444817304611206, + "learning_rate": 8.309311568488291e-05, + "loss": 0.020131246745586397, + "step": 119160 + }, + { + "epoch": 16.91554293825408, + "grad_norm": 9.670873641967773, + "learning_rate": 8.3091696238467e-05, + "loss": 0.04307814538478851, + "step": 119170 + }, + { + "epoch": 16.91696238466998, + "grad_norm": 3.049792766571045, + "learning_rate": 8.30902767920511e-05, + "loss": 0.013657885789871215, 
+ "step": 119180 + }, + { + "epoch": 16.918381831085878, + "grad_norm": 0.13626520335674286, + "learning_rate": 8.30888573456352e-05, + "loss": 0.03500192165374756, + "step": 119190 + }, + { + "epoch": 16.919801277501776, + "grad_norm": 3.698606252670288, + "learning_rate": 8.308743789921931e-05, + "loss": 0.02214457541704178, + "step": 119200 + }, + { + "epoch": 16.92122072391767, + "grad_norm": 0.14914394915103912, + "learning_rate": 8.308601845280341e-05, + "loss": 0.034185728430747984, + "step": 119210 + }, + { + "epoch": 16.92264017033357, + "grad_norm": 0.009660141542553902, + "learning_rate": 8.308459900638751e-05, + "loss": 0.019360674917697905, + "step": 119220 + }, + { + "epoch": 16.924059616749467, + "grad_norm": 1.695876121520996, + "learning_rate": 8.308317955997162e-05, + "loss": 0.0037443704903125765, + "step": 119230 + }, + { + "epoch": 16.925479063165366, + "grad_norm": 1.125134825706482, + "learning_rate": 8.308176011355572e-05, + "loss": 0.04052403867244721, + "step": 119240 + }, + { + "epoch": 16.926898509581264, + "grad_norm": 0.0727100819349289, + "learning_rate": 8.308034066713983e-05, + "loss": 0.030857852101325987, + "step": 119250 + }, + { + "epoch": 16.928317955997162, + "grad_norm": 5.554221153259277, + "learning_rate": 8.307892122072392e-05, + "loss": 0.03768602609634399, + "step": 119260 + }, + { + "epoch": 16.92973740241306, + "grad_norm": 0.8900468349456787, + "learning_rate": 8.307750177430802e-05, + "loss": 0.009594323486089707, + "step": 119270 + }, + { + "epoch": 16.931156848828955, + "grad_norm": 1.195831537246704, + "learning_rate": 8.307608232789212e-05, + "loss": 0.026123252511024476, + "step": 119280 + }, + { + "epoch": 16.932576295244854, + "grad_norm": 0.023000942543148994, + "learning_rate": 8.307466288147623e-05, + "loss": 0.004097868502140045, + "step": 119290 + }, + { + "epoch": 16.933995741660752, + "grad_norm": 0.07576500624418259, + "learning_rate": 8.307324343506033e-05, + "loss": 0.018370094895362853, + "step": 
119300 + }, + { + "epoch": 16.93541518807665, + "grad_norm": 1.6272691488265991, + "learning_rate": 8.307182398864444e-05, + "loss": 0.007767494022846222, + "step": 119310 + }, + { + "epoch": 16.93683463449255, + "grad_norm": 0.062356818467378616, + "learning_rate": 8.307040454222854e-05, + "loss": 0.005623319372534752, + "step": 119320 + }, + { + "epoch": 16.938254080908447, + "grad_norm": 0.7122419476509094, + "learning_rate": 8.306898509581263e-05, + "loss": 0.02900628447532654, + "step": 119330 + }, + { + "epoch": 16.939673527324345, + "grad_norm": 0.7076064944267273, + "learning_rate": 8.306756564939674e-05, + "loss": 0.07354017496109008, + "step": 119340 + }, + { + "epoch": 16.94109297374024, + "grad_norm": 0.0743996724486351, + "learning_rate": 8.306614620298084e-05, + "loss": 0.007228873670101166, + "step": 119350 + }, + { + "epoch": 16.942512420156138, + "grad_norm": 0.05068621039390564, + "learning_rate": 8.306472675656495e-05, + "loss": 0.03541290760040283, + "step": 119360 + }, + { + "epoch": 16.943931866572036, + "grad_norm": 0.4136679768562317, + "learning_rate": 8.306330731014904e-05, + "loss": 0.003943739831447602, + "step": 119370 + }, + { + "epoch": 16.945351312987935, + "grad_norm": 0.21719232201576233, + "learning_rate": 8.306188786373315e-05, + "loss": 0.005219864472746849, + "step": 119380 + }, + { + "epoch": 16.946770759403833, + "grad_norm": 4.703658580780029, + "learning_rate": 8.306046841731724e-05, + "loss": 0.014817482233047486, + "step": 119390 + }, + { + "epoch": 16.94819020581973, + "grad_norm": 0.1651671677827835, + "learning_rate": 8.305904897090135e-05, + "loss": 0.009183306992053986, + "step": 119400 + }, + { + "epoch": 16.94960965223563, + "grad_norm": 3.381152391433716, + "learning_rate": 8.305762952448545e-05, + "loss": 0.02705332338809967, + "step": 119410 + }, + { + "epoch": 16.951029098651524, + "grad_norm": 0.05078170821070671, + "learning_rate": 8.305621007806955e-05, + "loss": 0.0509130597114563, + "step": 119420 + }, + { 
+ "epoch": 16.952448545067423, + "grad_norm": 6.6379313468933105, + "learning_rate": 8.305479063165366e-05, + "loss": 0.028801146149635314, + "step": 119430 + }, + { + "epoch": 16.95386799148332, + "grad_norm": 6.881738662719727, + "learning_rate": 8.305337118523776e-05, + "loss": 0.010584786534309387, + "step": 119440 + }, + { + "epoch": 16.95528743789922, + "grad_norm": 0.21296283602714539, + "learning_rate": 8.305195173882187e-05, + "loss": 0.009670565277338028, + "step": 119450 + }, + { + "epoch": 16.956706884315118, + "grad_norm": 0.04307686537504196, + "learning_rate": 8.305053229240597e-05, + "loss": 0.008084750920534133, + "step": 119460 + }, + { + "epoch": 16.958126330731016, + "grad_norm": 3.313858985900879, + "learning_rate": 8.304911284599006e-05, + "loss": 0.005667193233966828, + "step": 119470 + }, + { + "epoch": 16.959545777146914, + "grad_norm": 0.04059341922402382, + "learning_rate": 8.304769339957416e-05, + "loss": 0.007192540168762207, + "step": 119480 + }, + { + "epoch": 16.96096522356281, + "grad_norm": 0.09984651952981949, + "learning_rate": 8.304627395315827e-05, + "loss": 0.006505205482244492, + "step": 119490 + }, + { + "epoch": 16.962384669978707, + "grad_norm": 0.019779440015554428, + "learning_rate": 8.304485450674237e-05, + "loss": 0.008879543095827103, + "step": 119500 + }, + { + "epoch": 16.962384669978707, + "eval_accuracy": 0.9835950912443568, + "eval_loss": 0.06713375449180603, + "eval_runtime": 33.0669, + "eval_samples_per_second": 475.611, + "eval_steps_per_second": 14.879, + "step": 119500 + }, + { + "epoch": 16.963804116394606, + "grad_norm": 0.07041562348604202, + "learning_rate": 8.304343506032648e-05, + "loss": 0.04628815948963165, + "step": 119510 + }, + { + "epoch": 16.965223562810504, + "grad_norm": 0.5285801887512207, + "learning_rate": 8.304201561391058e-05, + "loss": 0.025107762217521666, + "step": 119520 + }, + { + "epoch": 16.966643009226402, + "grad_norm": 12.144177436828613, + "learning_rate": 
8.304059616749467e-05, + "loss": 0.06175790429115295, + "step": 119530 + }, + { + "epoch": 16.9680624556423, + "grad_norm": 0.12573617696762085, + "learning_rate": 8.303917672107879e-05, + "loss": 0.0293417751789093, + "step": 119540 + }, + { + "epoch": 16.9694819020582, + "grad_norm": 0.13848525285720825, + "learning_rate": 8.303775727466288e-05, + "loss": 0.006557562947273254, + "step": 119550 + }, + { + "epoch": 16.970901348474094, + "grad_norm": 0.01964346133172512, + "learning_rate": 8.3036337828247e-05, + "loss": 0.010669238865375519, + "step": 119560 + }, + { + "epoch": 16.972320794889992, + "grad_norm": 0.2593211829662323, + "learning_rate": 8.303491838183109e-05, + "loss": 0.0015080250799655915, + "step": 119570 + }, + { + "epoch": 16.97374024130589, + "grad_norm": 15.115253448486328, + "learning_rate": 8.303349893541519e-05, + "loss": 0.023484209179878236, + "step": 119580 + }, + { + "epoch": 16.97515968772179, + "grad_norm": 0.018677575513720512, + "learning_rate": 8.303207948899929e-05, + "loss": 0.004597761482000351, + "step": 119590 + }, + { + "epoch": 16.976579134137687, + "grad_norm": 2.686915874481201, + "learning_rate": 8.30306600425834e-05, + "loss": 0.0300980269908905, + "step": 119600 + }, + { + "epoch": 16.977998580553585, + "grad_norm": 0.6160420179367065, + "learning_rate": 8.30292405961675e-05, + "loss": 0.024341486394405365, + "step": 119610 + }, + { + "epoch": 16.979418026969483, + "grad_norm": 0.10150213539600372, + "learning_rate": 8.30278211497516e-05, + "loss": 0.013426159322261811, + "step": 119620 + }, + { + "epoch": 16.980837473385378, + "grad_norm": 0.00996475201100111, + "learning_rate": 8.30264017033357e-05, + "loss": 0.019675298035144805, + "step": 119630 + }, + { + "epoch": 16.982256919801276, + "grad_norm": 0.14315427839756012, + "learning_rate": 8.30249822569198e-05, + "loss": 0.003599084168672562, + "step": 119640 + }, + { + "epoch": 16.983676366217175, + "grad_norm": 0.23158226907253265, + "learning_rate": 
8.302356281050391e-05, + "loss": 0.006026803702116013, + "step": 119650 + }, + { + "epoch": 16.985095812633073, + "grad_norm": 0.622970700263977, + "learning_rate": 8.302214336408801e-05, + "loss": 0.0015147797763347625, + "step": 119660 + }, + { + "epoch": 16.98651525904897, + "grad_norm": 0.9638655781745911, + "learning_rate": 8.302072391767212e-05, + "loss": 0.002849790453910828, + "step": 119670 + }, + { + "epoch": 16.98793470546487, + "grad_norm": 2.5214967727661133, + "learning_rate": 8.30193044712562e-05, + "loss": 0.0074197396636009215, + "step": 119680 + }, + { + "epoch": 16.989354151880768, + "grad_norm": 0.2692882716655731, + "learning_rate": 8.301788502484031e-05, + "loss": 0.005155200883746147, + "step": 119690 + }, + { + "epoch": 16.990773598296663, + "grad_norm": 0.41026079654693604, + "learning_rate": 8.301646557842441e-05, + "loss": 0.0035352624952793123, + "step": 119700 + }, + { + "epoch": 16.99219304471256, + "grad_norm": 3.5877914428710938, + "learning_rate": 8.301504613200852e-05, + "loss": 0.009292619675397873, + "step": 119710 + }, + { + "epoch": 16.99361249112846, + "grad_norm": 0.016373714432120323, + "learning_rate": 8.301362668559263e-05, + "loss": 0.004624640569090843, + "step": 119720 + }, + { + "epoch": 16.995031937544358, + "grad_norm": 0.15189428627490997, + "learning_rate": 8.301220723917672e-05, + "loss": 0.03775623440742493, + "step": 119730 + }, + { + "epoch": 16.996451383960256, + "grad_norm": 0.027784127742052078, + "learning_rate": 8.301078779276083e-05, + "loss": 0.010945260524749756, + "step": 119740 + }, + { + "epoch": 16.997870830376154, + "grad_norm": 0.5772379636764526, + "learning_rate": 8.300936834634493e-05, + "loss": 0.020091539621353148, + "step": 119750 + }, + { + "epoch": 16.999290276792053, + "grad_norm": 12.586730003356934, + "learning_rate": 8.300794889992904e-05, + "loss": 0.03808550238609314, + "step": 119760 + }, + { + "epoch": 17.000709723207947, + "grad_norm": 9.696110725402832, + "learning_rate": 
8.300652945351313e-05, + "loss": 0.044254863262176515, + "step": 119770 + }, + { + "epoch": 17.002129169623846, + "grad_norm": 0.37043657898902893, + "learning_rate": 8.300511000709723e-05, + "loss": 0.0012788783758878707, + "step": 119780 + }, + { + "epoch": 17.003548616039744, + "grad_norm": 8.824217796325684, + "learning_rate": 8.300369056068133e-05, + "loss": 0.017943207919597626, + "step": 119790 + }, + { + "epoch": 17.004968062455642, + "grad_norm": 0.37432098388671875, + "learning_rate": 8.300227111426544e-05, + "loss": 0.0019177347421646118, + "step": 119800 + }, + { + "epoch": 17.00638750887154, + "grad_norm": 0.8555291295051575, + "learning_rate": 8.300085166784955e-05, + "loss": 0.014973253011703491, + "step": 119810 + }, + { + "epoch": 17.00780695528744, + "grad_norm": 4.908260822296143, + "learning_rate": 8.299943222143365e-05, + "loss": 0.047904562950134275, + "step": 119820 + }, + { + "epoch": 17.009226401703337, + "grad_norm": 1.7123541831970215, + "learning_rate": 8.299801277501775e-05, + "loss": 0.05410314798355102, + "step": 119830 + }, + { + "epoch": 17.010645848119232, + "grad_norm": 0.4361095726490021, + "learning_rate": 8.299659332860184e-05, + "loss": 0.027262836694717407, + "step": 119840 + }, + { + "epoch": 17.01206529453513, + "grad_norm": 0.17793868482112885, + "learning_rate": 8.299517388218595e-05, + "loss": 0.04731735289096832, + "step": 119850 + }, + { + "epoch": 17.01348474095103, + "grad_norm": 1.0978947877883911, + "learning_rate": 8.299375443577005e-05, + "loss": 0.022935733199119568, + "step": 119860 + }, + { + "epoch": 17.014904187366927, + "grad_norm": 2.6851255893707275, + "learning_rate": 8.299233498935416e-05, + "loss": 0.014493119716644288, + "step": 119870 + }, + { + "epoch": 17.016323633782825, + "grad_norm": 0.12189056724309921, + "learning_rate": 8.299091554293826e-05, + "loss": 0.008826699852943421, + "step": 119880 + }, + { + "epoch": 17.017743080198724, + "grad_norm": 4.789248943328857, + "learning_rate": 
8.298949609652236e-05, + "loss": 0.04857603013515473, + "step": 119890 + }, + { + "epoch": 17.019162526614622, + "grad_norm": 0.46851640939712524, + "learning_rate": 8.298807665010647e-05, + "loss": 0.01318027526140213, + "step": 119900 + }, + { + "epoch": 17.020581973030517, + "grad_norm": 7.147706031799316, + "learning_rate": 8.298665720369056e-05, + "loss": 0.03265936076641083, + "step": 119910 + }, + { + "epoch": 17.022001419446415, + "grad_norm": 6.2478861808776855, + "learning_rate": 8.298523775727468e-05, + "loss": 0.01385502964258194, + "step": 119920 + }, + { + "epoch": 17.023420865862313, + "grad_norm": 0.4883221983909607, + "learning_rate": 8.298381831085877e-05, + "loss": 0.014128404855728149, + "step": 119930 + }, + { + "epoch": 17.02484031227821, + "grad_norm": 0.1252172440290451, + "learning_rate": 8.298239886444287e-05, + "loss": 0.024860450625419618, + "step": 119940 + }, + { + "epoch": 17.02625975869411, + "grad_norm": 4.0285491943359375, + "learning_rate": 8.298097941802697e-05, + "loss": 0.02474023848772049, + "step": 119950 + }, + { + "epoch": 17.027679205110008, + "grad_norm": 10.738550186157227, + "learning_rate": 8.297955997161108e-05, + "loss": 0.029977038502693176, + "step": 119960 + }, + { + "epoch": 17.029098651525906, + "grad_norm": 1.833249807357788, + "learning_rate": 8.297814052519518e-05, + "loss": 0.03657858073711395, + "step": 119970 + }, + { + "epoch": 17.0305180979418, + "grad_norm": 0.35576480627059937, + "learning_rate": 8.297672107877929e-05, + "loss": 0.012883573770523071, + "step": 119980 + }, + { + "epoch": 17.0319375443577, + "grad_norm": 0.8515356779098511, + "learning_rate": 8.297530163236338e-05, + "loss": 0.0216094046831131, + "step": 119990 + }, + { + "epoch": 17.033356990773598, + "grad_norm": 10.729890823364258, + "learning_rate": 8.297388218594748e-05, + "loss": 0.026598137617111207, + "step": 120000 + }, + { + "epoch": 17.033356990773598, + "eval_accuracy": 0.9808609397850829, + "eval_loss": 0.07485253363847733, 
+ "eval_runtime": 33.4946, + "eval_samples_per_second": 469.539, + "eval_steps_per_second": 14.689, + "step": 120000 + }, + { + "epoch": 17.034776437189496, + "grad_norm": 0.18618199229240417, + "learning_rate": 8.297246273953159e-05, + "loss": 0.043621528148651126, + "step": 120010 + }, + { + "epoch": 17.036195883605394, + "grad_norm": 9.199254989624023, + "learning_rate": 8.297104329311569e-05, + "loss": 0.04960590302944183, + "step": 120020 + }, + { + "epoch": 17.037615330021293, + "grad_norm": 0.9070085287094116, + "learning_rate": 8.29696238466998e-05, + "loss": 0.008391667902469636, + "step": 120030 + }, + { + "epoch": 17.03903477643719, + "grad_norm": 4.971753120422363, + "learning_rate": 8.296820440028389e-05, + "loss": 0.02203463315963745, + "step": 120040 + }, + { + "epoch": 17.040454222853086, + "grad_norm": 0.009288261644542217, + "learning_rate": 8.2966784953868e-05, + "loss": 0.030891206860542298, + "step": 120050 + }, + { + "epoch": 17.041873669268984, + "grad_norm": 0.1559710055589676, + "learning_rate": 8.29653655074521e-05, + "loss": 0.03783779442310333, + "step": 120060 + }, + { + "epoch": 17.043293115684882, + "grad_norm": 0.01857968419790268, + "learning_rate": 8.29639460610362e-05, + "loss": 0.0034990094602108, + "step": 120070 + }, + { + "epoch": 17.04471256210078, + "grad_norm": 0.101988784968853, + "learning_rate": 8.29625266146203e-05, + "loss": 0.020602494478225708, + "step": 120080 + }, + { + "epoch": 17.04613200851668, + "grad_norm": 1.6410483121871948, + "learning_rate": 8.29611071682044e-05, + "loss": 0.005050593242049217, + "step": 120090 + }, + { + "epoch": 17.047551454932577, + "grad_norm": 0.1444510519504547, + "learning_rate": 8.295968772178851e-05, + "loss": 0.029584896564483643, + "step": 120100 + }, + { + "epoch": 17.048970901348476, + "grad_norm": 0.8876551985740662, + "learning_rate": 8.295826827537261e-05, + "loss": 0.0014938555657863617, + "step": 120110 + }, + { + "epoch": 17.05039034776437, + "grad_norm": 
4.955441951751709, + "learning_rate": 8.295684882895672e-05, + "loss": 0.006155164912343025, + "step": 120120 + }, + { + "epoch": 17.05180979418027, + "grad_norm": 0.16779440641403198, + "learning_rate": 8.295542938254082e-05, + "loss": 0.005987958237528801, + "step": 120130 + }, + { + "epoch": 17.053229240596167, + "grad_norm": 0.022262701764702797, + "learning_rate": 8.295400993612491e-05, + "loss": 0.010092838108539582, + "step": 120140 + }, + { + "epoch": 17.054648687012065, + "grad_norm": 0.5307697057723999, + "learning_rate": 8.295259048970901e-05, + "loss": 0.014554566144943238, + "step": 120150 + }, + { + "epoch": 17.056068133427964, + "grad_norm": 4.913189888000488, + "learning_rate": 8.295117104329312e-05, + "loss": 0.0074235409498214725, + "step": 120160 + }, + { + "epoch": 17.057487579843862, + "grad_norm": 0.4308110475540161, + "learning_rate": 8.294975159687722e-05, + "loss": 0.03521883189678192, + "step": 120170 + }, + { + "epoch": 17.05890702625976, + "grad_norm": 0.12361428141593933, + "learning_rate": 8.294833215046133e-05, + "loss": 0.021901145577430725, + "step": 120180 + }, + { + "epoch": 17.060326472675655, + "grad_norm": 0.020824233070015907, + "learning_rate": 8.294691270404543e-05, + "loss": 0.008531014621257781, + "step": 120190 + }, + { + "epoch": 17.061745919091553, + "grad_norm": 0.01912788301706314, + "learning_rate": 8.294549325762952e-05, + "loss": 0.04999278485774994, + "step": 120200 + }, + { + "epoch": 17.06316536550745, + "grad_norm": 10.640271186828613, + "learning_rate": 8.294407381121364e-05, + "loss": 0.028563928604125977, + "step": 120210 + }, + { + "epoch": 17.06458481192335, + "grad_norm": 0.020733371376991272, + "learning_rate": 8.294265436479773e-05, + "loss": 0.016567130386829377, + "step": 120220 + }, + { + "epoch": 17.066004258339248, + "grad_norm": 0.9342880249023438, + "learning_rate": 8.294123491838184e-05, + "loss": 0.004354240372776985, + "step": 120230 + }, + { + "epoch": 17.067423704755146, + "grad_norm": 
10.978273391723633, + "learning_rate": 8.293981547196594e-05, + "loss": 0.020922911167144776, + "step": 120240 + }, + { + "epoch": 17.068843151171045, + "grad_norm": 0.08470244705677032, + "learning_rate": 8.293839602555004e-05, + "loss": 0.0323804646730423, + "step": 120250 + }, + { + "epoch": 17.07026259758694, + "grad_norm": Infinity, + "learning_rate": 8.293697657913414e-05, + "loss": 0.02929849028587341, + "step": 120260 + }, + { + "epoch": 17.071682044002838, + "grad_norm": 0.2951527237892151, + "learning_rate": 8.293569907735983e-05, + "loss": 0.012332384288311005, + "step": 120270 + }, + { + "epoch": 17.073101490418736, + "grad_norm": 0.23411825299263, + "learning_rate": 8.293427963094393e-05, + "loss": 0.002320580929517746, + "step": 120280 + }, + { + "epoch": 17.074520936834634, + "grad_norm": 0.03805660083889961, + "learning_rate": 8.293286018452804e-05, + "loss": 0.0022540684789419175, + "step": 120290 + }, + { + "epoch": 17.075940383250533, + "grad_norm": 0.1722414642572403, + "learning_rate": 8.293144073811214e-05, + "loss": 0.014226651191711426, + "step": 120300 + }, + { + "epoch": 17.07735982966643, + "grad_norm": 0.022016188129782677, + "learning_rate": 8.293002129169625e-05, + "loss": 0.019705028831958772, + "step": 120310 + }, + { + "epoch": 17.07877927608233, + "grad_norm": 2.4051177501678467, + "learning_rate": 8.292860184528033e-05, + "loss": 0.032577145099639895, + "step": 120320 + }, + { + "epoch": 17.080198722498224, + "grad_norm": 0.014880457893013954, + "learning_rate": 8.292718239886445e-05, + "loss": 0.08195329308509827, + "step": 120330 + }, + { + "epoch": 17.081618168914122, + "grad_norm": 0.017962895333766937, + "learning_rate": 8.292576295244854e-05, + "loss": 0.04217566847801209, + "step": 120340 + }, + { + "epoch": 17.08303761533002, + "grad_norm": 0.20503850281238556, + "learning_rate": 8.292434350603265e-05, + "loss": 0.033675742149353025, + "step": 120350 + }, + { + "epoch": 17.08445706174592, + "grad_norm": 0.3100937604904175, 
+ "learning_rate": 8.292292405961675e-05, + "loss": 0.03976408541202545, + "step": 120360 + }, + { + "epoch": 17.085876508161817, + "grad_norm": 1.0643759965896606, + "learning_rate": 8.292150461320085e-05, + "loss": 0.017562437057495116, + "step": 120370 + }, + { + "epoch": 17.087295954577716, + "grad_norm": 2.6324429512023926, + "learning_rate": 8.292008516678496e-05, + "loss": 0.01645803153514862, + "step": 120380 + }, + { + "epoch": 17.088715400993614, + "grad_norm": 0.010898143984377384, + "learning_rate": 8.291866572036906e-05, + "loss": 0.03926792144775391, + "step": 120390 + }, + { + "epoch": 17.09013484740951, + "grad_norm": 0.06077408045530319, + "learning_rate": 8.291724627395317e-05, + "loss": 0.06492968797683715, + "step": 120400 + }, + { + "epoch": 17.091554293825407, + "grad_norm": 0.03656835854053497, + "learning_rate": 8.291582682753727e-05, + "loss": 0.003984153643250466, + "step": 120410 + }, + { + "epoch": 17.092973740241305, + "grad_norm": 0.011007381603121758, + "learning_rate": 8.291440738112136e-05, + "loss": 0.03613340556621551, + "step": 120420 + }, + { + "epoch": 17.094393186657204, + "grad_norm": 36.203392028808594, + "learning_rate": 8.291298793470546e-05, + "loss": 0.052862972021102905, + "step": 120430 + }, + { + "epoch": 17.095812633073102, + "grad_norm": 2.5039913654327393, + "learning_rate": 8.291156848828957e-05, + "loss": 0.038491514325141904, + "step": 120440 + }, + { + "epoch": 17.097232079489, + "grad_norm": 0.05600656941533089, + "learning_rate": 8.291014904187367e-05, + "loss": 0.04204607605934143, + "step": 120450 + }, + { + "epoch": 17.0986515259049, + "grad_norm": 5.478761672973633, + "learning_rate": 8.290872959545778e-05, + "loss": 0.014320333302021027, + "step": 120460 + }, + { + "epoch": 17.100070972320793, + "grad_norm": 0.1664823591709137, + "learning_rate": 8.290731014904188e-05, + "loss": 0.029626739025115967, + "step": 120470 + }, + { + "epoch": 17.10149041873669, + "grad_norm": 0.4680706560611725, + 
"learning_rate": 8.290589070262597e-05, + "loss": 0.016333407163619994, + "step": 120480 + }, + { + "epoch": 17.10290986515259, + "grad_norm": 15.665885925292969, + "learning_rate": 8.290447125621008e-05, + "loss": 0.018834175169467927, + "step": 120490 + }, + { + "epoch": 17.10432931156849, + "grad_norm": 0.8051186203956604, + "learning_rate": 8.290305180979418e-05, + "loss": 0.006639273464679718, + "step": 120500 + }, + { + "epoch": 17.10432931156849, + "eval_accuracy": 0.9870286767978635, + "eval_loss": 0.045041538774967194, + "eval_runtime": 32.0723, + "eval_samples_per_second": 490.361, + "eval_steps_per_second": 15.34, + "step": 120500 + }, + { + "epoch": 17.105748757984387, + "grad_norm": 2.2258312702178955, + "learning_rate": 8.29016323633783e-05, + "loss": 0.04136236906051636, + "step": 120510 + }, + { + "epoch": 17.107168204400285, + "grad_norm": 0.1064390316605568, + "learning_rate": 8.290021291696238e-05, + "loss": 0.010789933800697326, + "step": 120520 + }, + { + "epoch": 17.108587650816183, + "grad_norm": 3.2055423259735107, + "learning_rate": 8.289879347054649e-05, + "loss": 0.003744557872414589, + "step": 120530 + }, + { + "epoch": 17.110007097232078, + "grad_norm": 0.0287025086581707, + "learning_rate": 8.289737402413059e-05, + "loss": 0.020008505880832674, + "step": 120540 + }, + { + "epoch": 17.111426543647976, + "grad_norm": 0.08441568911075592, + "learning_rate": 8.28959545777147e-05, + "loss": 0.05495128631591797, + "step": 120550 + }, + { + "epoch": 17.112845990063875, + "grad_norm": 0.016766395419836044, + "learning_rate": 8.289453513129881e-05, + "loss": 0.019158127903938293, + "step": 120560 + }, + { + "epoch": 17.114265436479773, + "grad_norm": 3.567664384841919, + "learning_rate": 8.28931156848829e-05, + "loss": 0.021029704809188844, + "step": 120570 + }, + { + "epoch": 17.11568488289567, + "grad_norm": 0.0842832401394844, + "learning_rate": 8.2891696238467e-05, + "loss": 0.012277697026729584, + "step": 120580 + }, + { + "epoch": 
17.11710432931157, + "grad_norm": 4.586905479431152, + "learning_rate": 8.28902767920511e-05, + "loss": 0.011499184370040893, + "step": 120590 + }, + { + "epoch": 17.118523775727468, + "grad_norm": 0.027589887380599976, + "learning_rate": 8.288885734563521e-05, + "loss": 0.025917389988899232, + "step": 120600 + }, + { + "epoch": 17.119943222143363, + "grad_norm": 5.147835731506348, + "learning_rate": 8.288743789921931e-05, + "loss": 0.02600862979888916, + "step": 120610 + }, + { + "epoch": 17.12136266855926, + "grad_norm": 0.028256772086024284, + "learning_rate": 8.288601845280342e-05, + "loss": 0.04589993357658386, + "step": 120620 + }, + { + "epoch": 17.12278211497516, + "grad_norm": 10.49030876159668, + "learning_rate": 8.28845990063875e-05, + "loss": 0.02437007427215576, + "step": 120630 + }, + { + "epoch": 17.124201561391057, + "grad_norm": 0.09230612218379974, + "learning_rate": 8.288317955997161e-05, + "loss": 0.028782013058662414, + "step": 120640 + }, + { + "epoch": 17.125621007806956, + "grad_norm": 2.3693878650665283, + "learning_rate": 8.288176011355572e-05, + "loss": 0.023813261091709136, + "step": 120650 + }, + { + "epoch": 17.127040454222854, + "grad_norm": 0.04619796574115753, + "learning_rate": 8.288034066713982e-05, + "loss": 0.051468032598495486, + "step": 120660 + }, + { + "epoch": 17.128459900638752, + "grad_norm": 0.03357632830739021, + "learning_rate": 8.287892122072393e-05, + "loss": 0.0446009486913681, + "step": 120670 + }, + { + "epoch": 17.129879347054647, + "grad_norm": 0.021320484578609467, + "learning_rate": 8.287750177430802e-05, + "loss": 0.028787761926651, + "step": 120680 + }, + { + "epoch": 17.131298793470545, + "grad_norm": 0.3757644593715668, + "learning_rate": 8.287608232789213e-05, + "loss": 0.009390568733215332, + "step": 120690 + }, + { + "epoch": 17.132718239886444, + "grad_norm": 8.768340110778809, + "learning_rate": 8.287466288147622e-05, + "loss": 0.03199634850025177, + "step": 120700 + }, + { + "epoch": 
17.134137686302342, + "grad_norm": 1.7924593687057495, + "learning_rate": 8.287324343506034e-05, + "loss": 0.014015412330627442, + "step": 120710 + }, + { + "epoch": 17.13555713271824, + "grad_norm": 0.03260492905974388, + "learning_rate": 8.287182398864443e-05, + "loss": 0.04740467071533203, + "step": 120720 + }, + { + "epoch": 17.13697657913414, + "grad_norm": 29.415185928344727, + "learning_rate": 8.287040454222853e-05, + "loss": 0.038969749212265016, + "step": 120730 + }, + { + "epoch": 17.138396025550037, + "grad_norm": 0.05544610321521759, + "learning_rate": 8.286898509581264e-05, + "loss": 0.011639602482318878, + "step": 120740 + }, + { + "epoch": 17.13981547196593, + "grad_norm": 0.8772669434547424, + "learning_rate": 8.286756564939674e-05, + "loss": 0.028922209143638612, + "step": 120750 + }, + { + "epoch": 17.14123491838183, + "grad_norm": 0.09601476788520813, + "learning_rate": 8.286614620298085e-05, + "loss": 0.04748598635196686, + "step": 120760 + }, + { + "epoch": 17.14265436479773, + "grad_norm": 0.09443246573209763, + "learning_rate": 8.286472675656495e-05, + "loss": 0.016865630447864533, + "step": 120770 + }, + { + "epoch": 17.144073811213627, + "grad_norm": 1.598710298538208, + "learning_rate": 8.286330731014904e-05, + "loss": 0.029343438148498536, + "step": 120780 + }, + { + "epoch": 17.145493257629525, + "grad_norm": 0.13849106431007385, + "learning_rate": 8.286188786373314e-05, + "loss": 0.008465579897165298, + "step": 120790 + }, + { + "epoch": 17.146912704045423, + "grad_norm": 1.7292816638946533, + "learning_rate": 8.286046841731725e-05, + "loss": 0.01654750108718872, + "step": 120800 + }, + { + "epoch": 17.14833215046132, + "grad_norm": 3.3607993125915527, + "learning_rate": 8.285904897090135e-05, + "loss": 0.008860719203948975, + "step": 120810 + }, + { + "epoch": 17.149751596877216, + "grad_norm": 0.058206669986248016, + "learning_rate": 8.285762952448546e-05, + "loss": 0.04676066637039185, + "step": 120820 + }, + { + "epoch": 
17.151171043293115, + "grad_norm": 1.7293013334274292, + "learning_rate": 8.285621007806956e-05, + "loss": 0.006193574145436287, + "step": 120830 + }, + { + "epoch": 17.152590489709013, + "grad_norm": 0.01077141985297203, + "learning_rate": 8.285479063165366e-05, + "loss": 0.019160452485084533, + "step": 120840 + }, + { + "epoch": 17.15400993612491, + "grad_norm": 7.551402568817139, + "learning_rate": 8.285337118523777e-05, + "loss": 0.04898516833782196, + "step": 120850 + }, + { + "epoch": 17.15542938254081, + "grad_norm": 2.899712324142456, + "learning_rate": 8.285195173882186e-05, + "loss": 0.017336773872375488, + "step": 120860 + }, + { + "epoch": 17.156848828956708, + "grad_norm": 0.03883068636059761, + "learning_rate": 8.285053229240597e-05, + "loss": 0.0031766846776008608, + "step": 120870 + }, + { + "epoch": 17.158268275372606, + "grad_norm": 0.9664191007614136, + "learning_rate": 8.284911284599007e-05, + "loss": 0.012236092239618301, + "step": 120880 + }, + { + "epoch": 17.1596877217885, + "grad_norm": 13.277713775634766, + "learning_rate": 8.284769339957417e-05, + "loss": 0.01570097804069519, + "step": 120890 + }, + { + "epoch": 17.1611071682044, + "grad_norm": 6.466482639312744, + "learning_rate": 8.284627395315827e-05, + "loss": 0.004724283888936043, + "step": 120900 + }, + { + "epoch": 17.162526614620297, + "grad_norm": 0.30006489157676697, + "learning_rate": 8.284485450674238e-05, + "loss": 0.025148922204971315, + "step": 120910 + }, + { + "epoch": 17.163946061036196, + "grad_norm": 0.01525171473622322, + "learning_rate": 8.284343506032648e-05, + "loss": 0.005151396989822388, + "step": 120920 + }, + { + "epoch": 17.165365507452094, + "grad_norm": 0.9813533425331116, + "learning_rate": 8.284201561391059e-05, + "loss": 0.014321206510066986, + "step": 120930 + }, + { + "epoch": 17.166784953867992, + "grad_norm": 0.272886723279953, + "learning_rate": 8.284059616749468e-05, + "loss": 0.0091228649020195, + "step": 120940 + }, + { + "epoch": 
17.16820440028389, + "grad_norm": 12.312128067016602, + "learning_rate": 8.283917672107878e-05, + "loss": 0.04469579458236694, + "step": 120950 + }, + { + "epoch": 17.169623846699785, + "grad_norm": 4.318283557891846, + "learning_rate": 8.283775727466289e-05, + "loss": 0.0037714622914791105, + "step": 120960 + }, + { + "epoch": 17.171043293115684, + "grad_norm": 1.3079272508621216, + "learning_rate": 8.283633782824699e-05, + "loss": 0.012800368666648864, + "step": 120970 + }, + { + "epoch": 17.172462739531582, + "grad_norm": 0.02630901150405407, + "learning_rate": 8.28349183818311e-05, + "loss": 0.007006227970123291, + "step": 120980 + }, + { + "epoch": 17.17388218594748, + "grad_norm": 0.018336854875087738, + "learning_rate": 8.283349893541518e-05, + "loss": 0.035109061002731326, + "step": 120990 + }, + { + "epoch": 17.17530163236338, + "grad_norm": 0.03967367857694626, + "learning_rate": 8.28320794889993e-05, + "loss": 0.00726487934589386, + "step": 121000 + }, + { + "epoch": 17.17530163236338, + "eval_accuracy": 0.9865199974566033, + "eval_loss": 0.05103665217757225, + "eval_runtime": 32.6428, + "eval_samples_per_second": 481.791, + "eval_steps_per_second": 15.072, + "step": 121000 + }, + { + "epoch": 17.176721078779277, + "grad_norm": 0.012679275125265121, + "learning_rate": 8.283066004258339e-05, + "loss": 0.01789276897907257, + "step": 121010 + }, + { + "epoch": 17.178140525195175, + "grad_norm": 0.02514667622745037, + "learning_rate": 8.28292405961675e-05, + "loss": 0.0161560595035553, + "step": 121020 + }, + { + "epoch": 17.17955997161107, + "grad_norm": 0.04653330147266388, + "learning_rate": 8.28278211497516e-05, + "loss": 0.003413556143641472, + "step": 121030 + }, + { + "epoch": 17.18097941802697, + "grad_norm": 2.3200175762176514, + "learning_rate": 8.28264017033357e-05, + "loss": 0.009373904764652252, + "step": 121040 + }, + { + "epoch": 17.182398864442867, + "grad_norm": 0.04067833721637726, + "learning_rate": 8.282498225691981e-05, + "loss": 
0.016110491752624512, + "step": 121050 + }, + { + "epoch": 17.183818310858765, + "grad_norm": 8.260937690734863, + "learning_rate": 8.28235628105039e-05, + "loss": 0.0567485511302948, + "step": 121060 + }, + { + "epoch": 17.185237757274663, + "grad_norm": 3.1584644317626953, + "learning_rate": 8.282214336408802e-05, + "loss": 0.005105612799525261, + "step": 121070 + }, + { + "epoch": 17.18665720369056, + "grad_norm": 1.6670535802841187, + "learning_rate": 8.282072391767211e-05, + "loss": 0.006072807312011719, + "step": 121080 + }, + { + "epoch": 17.18807665010646, + "grad_norm": 0.24677909910678864, + "learning_rate": 8.281930447125621e-05, + "loss": 0.02320306599140167, + "step": 121090 + }, + { + "epoch": 17.189496096522355, + "grad_norm": 0.1326128989458084, + "learning_rate": 8.281788502484031e-05, + "loss": 0.015490972995758056, + "step": 121100 + }, + { + "epoch": 17.190915542938253, + "grad_norm": 0.8754839301109314, + "learning_rate": 8.281646557842442e-05, + "loss": 0.03079564869403839, + "step": 121110 + }, + { + "epoch": 17.19233498935415, + "grad_norm": 0.04686080291867256, + "learning_rate": 8.281504613200852e-05, + "loss": 0.007701759040355682, + "step": 121120 + }, + { + "epoch": 17.19375443577005, + "grad_norm": 9.40129566192627, + "learning_rate": 8.281362668559263e-05, + "loss": 0.008802379667758941, + "step": 121130 + }, + { + "epoch": 17.195173882185948, + "grad_norm": 0.033822499215602875, + "learning_rate": 8.281220723917673e-05, + "loss": 0.01620979458093643, + "step": 121140 + }, + { + "epoch": 17.196593328601846, + "grad_norm": 0.35262858867645264, + "learning_rate": 8.281078779276082e-05, + "loss": 0.01161227524280548, + "step": 121150 + }, + { + "epoch": 17.198012775017745, + "grad_norm": 0.006954657379537821, + "learning_rate": 8.280936834634493e-05, + "loss": 0.004226695373654365, + "step": 121160 + }, + { + "epoch": 17.19943222143364, + "grad_norm": 0.048306047916412354, + "learning_rate": 8.280794889992903e-05, + "loss": 
0.030858996510505676, + "step": 121170 + }, + { + "epoch": 17.200851667849538, + "grad_norm": 6.007036209106445, + "learning_rate": 8.280652945351314e-05, + "loss": 0.010294271260499954, + "step": 121180 + }, + { + "epoch": 17.202271114265436, + "grad_norm": 12.101383209228516, + "learning_rate": 8.280511000709723e-05, + "loss": 0.03878332376480102, + "step": 121190 + }, + { + "epoch": 17.203690560681334, + "grad_norm": 0.39908748865127563, + "learning_rate": 8.280369056068134e-05, + "loss": 0.022022242844104766, + "step": 121200 + }, + { + "epoch": 17.205110007097232, + "grad_norm": 0.17057831585407257, + "learning_rate": 8.280227111426543e-05, + "loss": 0.004962685331702232, + "step": 121210 + }, + { + "epoch": 17.20652945351313, + "grad_norm": 1.1699227094650269, + "learning_rate": 8.280085166784955e-05, + "loss": 0.005730428919196129, + "step": 121220 + }, + { + "epoch": 17.20794889992903, + "grad_norm": 6.226938724517822, + "learning_rate": 8.279943222143364e-05, + "loss": 0.028130099177360535, + "step": 121230 + }, + { + "epoch": 17.209368346344924, + "grad_norm": 13.366537094116211, + "learning_rate": 8.279801277501775e-05, + "loss": 0.020548251271247864, + "step": 121240 + }, + { + "epoch": 17.210787792760822, + "grad_norm": 0.5270779728889465, + "learning_rate": 8.279659332860185e-05, + "loss": 0.01929387152194977, + "step": 121250 + }, + { + "epoch": 17.21220723917672, + "grad_norm": 0.3697451949119568, + "learning_rate": 8.279517388218595e-05, + "loss": 0.006632042676210403, + "step": 121260 + }, + { + "epoch": 17.21362668559262, + "grad_norm": 0.06791370362043381, + "learning_rate": 8.279375443577006e-05, + "loss": 0.00866928994655609, + "step": 121270 + }, + { + "epoch": 17.215046132008517, + "grad_norm": 0.09903264790773392, + "learning_rate": 8.279233498935416e-05, + "loss": 0.008664032816886902, + "step": 121280 + }, + { + "epoch": 17.216465578424415, + "grad_norm": 7.171376705169678, + "learning_rate": 8.279091554293827e-05, + "loss": 
0.03021889626979828, + "step": 121290 + }, + { + "epoch": 17.217885024840314, + "grad_norm": 0.05718240141868591, + "learning_rate": 8.278949609652235e-05, + "loss": 0.0042637944221496586, + "step": 121300 + }, + { + "epoch": 17.21930447125621, + "grad_norm": 1.197914719581604, + "learning_rate": 8.278807665010646e-05, + "loss": 0.0027816496789455415, + "step": 121310 + }, + { + "epoch": 17.220723917672107, + "grad_norm": 0.26062679290771484, + "learning_rate": 8.278665720369056e-05, + "loss": 0.011219573765993118, + "step": 121320 + }, + { + "epoch": 17.222143364088005, + "grad_norm": 0.691809356212616, + "learning_rate": 8.278523775727467e-05, + "loss": 0.018367362022399903, + "step": 121330 + }, + { + "epoch": 17.223562810503903, + "grad_norm": 0.027361121028661728, + "learning_rate": 8.278381831085877e-05, + "loss": 0.0044341754168272015, + "step": 121340 + }, + { + "epoch": 17.2249822569198, + "grad_norm": 0.09908737242221832, + "learning_rate": 8.278239886444287e-05, + "loss": 0.028595495223999023, + "step": 121350 + }, + { + "epoch": 17.2264017033357, + "grad_norm": 0.009565730579197407, + "learning_rate": 8.278097941802698e-05, + "loss": 0.01315288245677948, + "step": 121360 + }, + { + "epoch": 17.2278211497516, + "grad_norm": 0.06306063383817673, + "learning_rate": 8.277955997161107e-05, + "loss": 0.004615992680191994, + "step": 121370 + }, + { + "epoch": 17.229240596167493, + "grad_norm": 0.03943710774183273, + "learning_rate": 8.277814052519519e-05, + "loss": 0.005523869767785072, + "step": 121380 + }, + { + "epoch": 17.23066004258339, + "grad_norm": 0.16066552698612213, + "learning_rate": 8.277672107877928e-05, + "loss": 0.030437877774238585, + "step": 121390 + }, + { + "epoch": 17.23207948899929, + "grad_norm": 0.5343520641326904, + "learning_rate": 8.277530163236338e-05, + "loss": 0.016141740977764128, + "step": 121400 + }, + { + "epoch": 17.233498935415188, + "grad_norm": 0.10204742848873138, + "learning_rate": 8.277388218594748e-05, + "loss": 
0.04754527807235718, + "step": 121410 + }, + { + "epoch": 17.234918381831086, + "grad_norm": 0.04669100418686867, + "learning_rate": 8.277246273953159e-05, + "loss": 0.00649983286857605, + "step": 121420 + }, + { + "epoch": 17.236337828246985, + "grad_norm": 19.72222900390625, + "learning_rate": 8.277104329311569e-05, + "loss": 0.03361916840076447, + "step": 121430 + }, + { + "epoch": 17.237757274662883, + "grad_norm": 1.6603556871414185, + "learning_rate": 8.27696238466998e-05, + "loss": 0.006782519817352295, + "step": 121440 + }, + { + "epoch": 17.239176721078778, + "grad_norm": 2.3674466609954834, + "learning_rate": 8.27682044002839e-05, + "loss": 0.024116578698158263, + "step": 121450 + }, + { + "epoch": 17.240596167494676, + "grad_norm": 2.121873140335083, + "learning_rate": 8.276678495386799e-05, + "loss": 0.007182253897190094, + "step": 121460 + }, + { + "epoch": 17.242015613910574, + "grad_norm": 9.855586051940918, + "learning_rate": 8.27653655074521e-05, + "loss": 0.028914478421211243, + "step": 121470 + }, + { + "epoch": 17.243435060326473, + "grad_norm": 6.052547454833984, + "learning_rate": 8.27639460610362e-05, + "loss": 0.029678156971931456, + "step": 121480 + }, + { + "epoch": 17.24485450674237, + "grad_norm": 0.012733505107462406, + "learning_rate": 8.276252661462031e-05, + "loss": 0.015345364809036255, + "step": 121490 + }, + { + "epoch": 17.24627395315827, + "grad_norm": 8.1347017288208, + "learning_rate": 8.27611071682044e-05, + "loss": 0.021826182305812836, + "step": 121500 + }, + { + "epoch": 17.24627395315827, + "eval_accuracy": 0.9786990525847269, + "eval_loss": 0.07808168977499008, + "eval_runtime": 32.7353, + "eval_samples_per_second": 480.429, + "eval_steps_per_second": 15.03, + "step": 121500 + }, + { + "epoch": 17.247693399574167, + "grad_norm": 1.4311810731887817, + "learning_rate": 8.27596877217885e-05, + "loss": 0.03259517550468445, + "step": 121510 + }, + { + "epoch": 17.249112845990062, + "grad_norm": 0.18520532548427582, + 
"learning_rate": 8.27582682753726e-05, + "loss": 0.029880058765411378, + "step": 121520 + }, + { + "epoch": 17.25053229240596, + "grad_norm": 3.1363275051116943, + "learning_rate": 8.275684882895671e-05, + "loss": 0.01889028251171112, + "step": 121530 + }, + { + "epoch": 17.25195173882186, + "grad_norm": 7.482678413391113, + "learning_rate": 8.275542938254081e-05, + "loss": 0.04509726464748383, + "step": 121540 + }, + { + "epoch": 17.253371185237757, + "grad_norm": 0.14285561442375183, + "learning_rate": 8.275400993612491e-05, + "loss": 0.0074087709188461305, + "step": 121550 + }, + { + "epoch": 17.254790631653655, + "grad_norm": 5.067549228668213, + "learning_rate": 8.275259048970902e-05, + "loss": 0.06535211801528931, + "step": 121560 + }, + { + "epoch": 17.256210078069554, + "grad_norm": 1.6123889684677124, + "learning_rate": 8.275117104329312e-05, + "loss": 0.005470262467861175, + "step": 121570 + }, + { + "epoch": 17.257629524485452, + "grad_norm": 0.4612501561641693, + "learning_rate": 8.274975159687723e-05, + "loss": 0.02524920701980591, + "step": 121580 + }, + { + "epoch": 17.259048970901347, + "grad_norm": 0.20753082633018494, + "learning_rate": 8.274833215046132e-05, + "loss": 0.007443346083164215, + "step": 121590 + }, + { + "epoch": 17.260468417317245, + "grad_norm": 2.3877856731414795, + "learning_rate": 8.274691270404544e-05, + "loss": 0.029876506328582762, + "step": 121600 + }, + { + "epoch": 17.261887863733143, + "grad_norm": 0.1580190658569336, + "learning_rate": 8.274549325762952e-05, + "loss": 0.010098446160554886, + "step": 121610 + }, + { + "epoch": 17.26330731014904, + "grad_norm": 1.5521838665008545, + "learning_rate": 8.274407381121363e-05, + "loss": 0.01123420000076294, + "step": 121620 + }, + { + "epoch": 17.26472675656494, + "grad_norm": 0.2570251226425171, + "learning_rate": 8.274265436479773e-05, + "loss": 0.03267633020877838, + "step": 121630 + }, + { + "epoch": 17.26614620298084, + "grad_norm": 0.011689351871609688, + "learning_rate": 
8.274123491838184e-05, + "loss": 0.022249601781368256, + "step": 121640 + }, + { + "epoch": 17.267565649396737, + "grad_norm": 0.24122844636440277, + "learning_rate": 8.273981547196594e-05, + "loss": 0.08932775855064393, + "step": 121650 + }, + { + "epoch": 17.26898509581263, + "grad_norm": 1.660841703414917, + "learning_rate": 8.273839602555003e-05, + "loss": 0.030342379212379457, + "step": 121660 + }, + { + "epoch": 17.27040454222853, + "grad_norm": 0.09175290167331696, + "learning_rate": 8.273697657913414e-05, + "loss": 0.042768290638923644, + "step": 121670 + }, + { + "epoch": 17.271823988644428, + "grad_norm": 0.018420705571770668, + "learning_rate": 8.273555713271824e-05, + "loss": 0.03903657495975495, + "step": 121680 + }, + { + "epoch": 17.273243435060326, + "grad_norm": 0.12965130805969238, + "learning_rate": 8.273413768630235e-05, + "loss": 0.004619887471199036, + "step": 121690 + }, + { + "epoch": 17.274662881476225, + "grad_norm": 0.061711788177490234, + "learning_rate": 8.273271823988645e-05, + "loss": 0.0060319904237985614, + "step": 121700 + }, + { + "epoch": 17.276082327892123, + "grad_norm": 2.4323067665100098, + "learning_rate": 8.273129879347055e-05, + "loss": 0.007204280793666839, + "step": 121710 + }, + { + "epoch": 17.27750177430802, + "grad_norm": 0.02041490562260151, + "learning_rate": 8.272987934705464e-05, + "loss": 0.018590529263019562, + "step": 121720 + }, + { + "epoch": 17.278921220723916, + "grad_norm": 0.2832045257091522, + "learning_rate": 8.272845990063876e-05, + "loss": 0.03935945630073547, + "step": 121730 + }, + { + "epoch": 17.280340667139814, + "grad_norm": 0.08537329733371735, + "learning_rate": 8.272704045422285e-05, + "loss": 0.0617376446723938, + "step": 121740 + }, + { + "epoch": 17.281760113555713, + "grad_norm": 10.530901908874512, + "learning_rate": 8.272562100780696e-05, + "loss": 0.013979089260101319, + "step": 121750 + }, + { + "epoch": 17.28317955997161, + "grad_norm": 0.13446474075317383, + "learning_rate": 
8.272420156139106e-05, + "loss": 0.038662633299827574, + "step": 121760 + }, + { + "epoch": 17.28459900638751, + "grad_norm": 0.2811603546142578, + "learning_rate": 8.272278211497516e-05, + "loss": 0.028803160786628722, + "step": 121770 + }, + { + "epoch": 17.286018452803408, + "grad_norm": 0.03432750329375267, + "learning_rate": 8.272136266855927e-05, + "loss": 0.010302607715129853, + "step": 121780 + }, + { + "epoch": 17.287437899219306, + "grad_norm": 0.030038023367524147, + "learning_rate": 8.271994322214337e-05, + "loss": 0.024182312190532684, + "step": 121790 + }, + { + "epoch": 17.2888573456352, + "grad_norm": 1.8961012363433838, + "learning_rate": 8.271852377572748e-05, + "loss": 0.018085433542728423, + "step": 121800 + }, + { + "epoch": 17.2902767920511, + "grad_norm": 4.688851356506348, + "learning_rate": 8.271710432931156e-05, + "loss": 0.009723028540611267, + "step": 121810 + }, + { + "epoch": 17.291696238466997, + "grad_norm": 0.5379230976104736, + "learning_rate": 8.271568488289567e-05, + "loss": 0.015512585639953613, + "step": 121820 + }, + { + "epoch": 17.293115684882896, + "grad_norm": 0.060846734791994095, + "learning_rate": 8.271426543647977e-05, + "loss": 0.0035803850740194322, + "step": 121830 + }, + { + "epoch": 17.294535131298794, + "grad_norm": 0.1275729387998581, + "learning_rate": 8.271284599006388e-05, + "loss": 0.02291179299354553, + "step": 121840 + }, + { + "epoch": 17.295954577714692, + "grad_norm": 0.041570551693439484, + "learning_rate": 8.271142654364798e-05, + "loss": 0.03817935287952423, + "step": 121850 + }, + { + "epoch": 17.29737402413059, + "grad_norm": 9.03583812713623, + "learning_rate": 8.271000709723208e-05, + "loss": 0.030699890851974488, + "step": 121860 + }, + { + "epoch": 17.298793470546485, + "grad_norm": 0.2126571238040924, + "learning_rate": 8.270858765081619e-05, + "loss": 0.005072015523910523, + "step": 121870 + }, + { + "epoch": 17.300212916962384, + "grad_norm": 0.5314030051231384, + "learning_rate": 
8.270716820440028e-05, + "loss": 0.03464093804359436, + "step": 121880 + }, + { + "epoch": 17.301632363378282, + "grad_norm": 0.17660662531852722, + "learning_rate": 8.27057487579844e-05, + "loss": 0.02229333221912384, + "step": 121890 + }, + { + "epoch": 17.30305180979418, + "grad_norm": 0.10263410210609436, + "learning_rate": 8.270432931156849e-05, + "loss": 0.03747016191482544, + "step": 121900 + }, + { + "epoch": 17.30447125621008, + "grad_norm": 0.6229006052017212, + "learning_rate": 8.270290986515259e-05, + "loss": 0.010727620124816895, + "step": 121910 + }, + { + "epoch": 17.305890702625977, + "grad_norm": 3.878967523574829, + "learning_rate": 8.270149041873669e-05, + "loss": 0.06718948483467102, + "step": 121920 + }, + { + "epoch": 17.307310149041875, + "grad_norm": 0.12267734855413437, + "learning_rate": 8.27000709723208e-05, + "loss": 0.04643978476524353, + "step": 121930 + }, + { + "epoch": 17.30872959545777, + "grad_norm": 12.556729316711426, + "learning_rate": 8.26986515259049e-05, + "loss": 0.012549221515655518, + "step": 121940 + }, + { + "epoch": 17.310149041873668, + "grad_norm": 1.200626254081726, + "learning_rate": 8.2697232079489e-05, + "loss": 0.0023645937442779543, + "step": 121950 + }, + { + "epoch": 17.311568488289566, + "grad_norm": 1.4385911226272583, + "learning_rate": 8.269581263307312e-05, + "loss": 0.018995174765586854, + "step": 121960 + }, + { + "epoch": 17.312987934705465, + "grad_norm": 1.4057707786560059, + "learning_rate": 8.26943931866572e-05, + "loss": 0.006676866114139557, + "step": 121970 + }, + { + "epoch": 17.314407381121363, + "grad_norm": 0.4110714793205261, + "learning_rate": 8.269297374024131e-05, + "loss": 0.004142989590764046, + "step": 121980 + }, + { + "epoch": 17.31582682753726, + "grad_norm": 2.908630609512329, + "learning_rate": 8.269155429382541e-05, + "loss": 0.003224276378750801, + "step": 121990 + }, + { + "epoch": 17.31724627395316, + "grad_norm": 0.3079705536365509, + "learning_rate": 8.269013484740952e-05, 
+ "loss": 0.009928861260414123, + "step": 122000 + }, + { + "epoch": 17.31724627395316, + "eval_accuracy": 0.9861384879506581, + "eval_loss": 0.04895373061299324, + "eval_runtime": 33.3139, + "eval_samples_per_second": 472.085, + "eval_steps_per_second": 14.769, + "step": 122000 + }, + { + "epoch": 17.318665720369054, + "grad_norm": 0.014612467028200626, + "learning_rate": 8.268871540099362e-05, + "loss": 0.006916648149490357, + "step": 122010 + }, + { + "epoch": 17.320085166784953, + "grad_norm": 11.766655921936035, + "learning_rate": 8.268729595457772e-05, + "loss": 0.016812363266944887, + "step": 122020 + }, + { + "epoch": 17.32150461320085, + "grad_norm": 8.348554611206055, + "learning_rate": 8.268587650816181e-05, + "loss": 0.005784840509295464, + "step": 122030 + }, + { + "epoch": 17.32292405961675, + "grad_norm": 0.24908818304538727, + "learning_rate": 8.268445706174592e-05, + "loss": 0.006596586108207703, + "step": 122040 + }, + { + "epoch": 17.324343506032648, + "grad_norm": 0.5274747014045715, + "learning_rate": 8.268303761533003e-05, + "loss": 0.01484144628047943, + "step": 122050 + }, + { + "epoch": 17.325762952448546, + "grad_norm": 0.1944778710603714, + "learning_rate": 8.268161816891413e-05, + "loss": 0.02620922923088074, + "step": 122060 + }, + { + "epoch": 17.327182398864444, + "grad_norm": 26.002870559692383, + "learning_rate": 8.268019872249823e-05, + "loss": 0.07551113963127136, + "step": 122070 + }, + { + "epoch": 17.32860184528034, + "grad_norm": 9.135618209838867, + "learning_rate": 8.267877927608233e-05, + "loss": 0.03293728232383728, + "step": 122080 + }, + { + "epoch": 17.330021291696237, + "grad_norm": 0.08039802312850952, + "learning_rate": 8.267735982966644e-05, + "loss": 0.013569638133049011, + "step": 122090 + }, + { + "epoch": 17.331440738112136, + "grad_norm": 0.0037199612706899643, + "learning_rate": 8.267594038325053e-05, + "loss": 0.024605175852775572, + "step": 122100 + }, + { + "epoch": 17.332860184528034, + "grad_norm": 
0.17483936250209808, + "learning_rate": 8.267452093683465e-05, + "loss": 0.049089929461479186, + "step": 122110 + }, + { + "epoch": 17.334279630943932, + "grad_norm": 8.319599151611328, + "learning_rate": 8.267310149041873e-05, + "loss": 0.048873132467269896, + "step": 122120 + }, + { + "epoch": 17.33569907735983, + "grad_norm": 0.996322751045227, + "learning_rate": 8.267168204400284e-05, + "loss": 0.020006181299686433, + "step": 122130 + }, + { + "epoch": 17.33711852377573, + "grad_norm": 12.19799518585205, + "learning_rate": 8.267026259758695e-05, + "loss": 0.02444319725036621, + "step": 122140 + }, + { + "epoch": 17.338537970191624, + "grad_norm": 0.5746012330055237, + "learning_rate": 8.266884315117105e-05, + "loss": 0.014555779099464417, + "step": 122150 + }, + { + "epoch": 17.339957416607522, + "grad_norm": 0.1442962884902954, + "learning_rate": 8.266742370475516e-05, + "loss": 0.0333253413438797, + "step": 122160 + }, + { + "epoch": 17.34137686302342, + "grad_norm": 0.23911288380622864, + "learning_rate": 8.266600425833924e-05, + "loss": 0.02843923568725586, + "step": 122170 + }, + { + "epoch": 17.34279630943932, + "grad_norm": 3.6721339225769043, + "learning_rate": 8.266458481192335e-05, + "loss": 0.015358135104179382, + "step": 122180 + }, + { + "epoch": 17.344215755855217, + "grad_norm": 0.6218733191490173, + "learning_rate": 8.266316536550745e-05, + "loss": 0.02243678867816925, + "step": 122190 + }, + { + "epoch": 17.345635202271115, + "grad_norm": 1.7230888605117798, + "learning_rate": 8.266174591909156e-05, + "loss": 0.02495470941066742, + "step": 122200 + }, + { + "epoch": 17.347054648687013, + "grad_norm": 14.371573448181152, + "learning_rate": 8.266032647267566e-05, + "loss": 0.02764323353767395, + "step": 122210 + }, + { + "epoch": 17.348474095102908, + "grad_norm": 2.212644338607788, + "learning_rate": 8.265890702625976e-05, + "loss": 0.0040929153561592106, + "step": 122220 + }, + { + "epoch": 17.349893541518806, + "grad_norm": 4.548835754394531, 
+ "learning_rate": 8.265748757984387e-05, + "loss": 0.08133320808410645, + "step": 122230 + }, + { + "epoch": 17.351312987934705, + "grad_norm": 4.254519939422607, + "learning_rate": 8.265606813342797e-05, + "loss": 0.006079412624239921, + "step": 122240 + }, + { + "epoch": 17.352732434350603, + "grad_norm": 0.9229611158370972, + "learning_rate": 8.265464868701208e-05, + "loss": 0.012695755064487457, + "step": 122250 + }, + { + "epoch": 17.3541518807665, + "grad_norm": 10.626506805419922, + "learning_rate": 8.265322924059617e-05, + "loss": 0.06295732855796814, + "step": 122260 + }, + { + "epoch": 17.3555713271824, + "grad_norm": 0.015296485275030136, + "learning_rate": 8.265180979418027e-05, + "loss": 0.009174713492393493, + "step": 122270 + }, + { + "epoch": 17.356990773598298, + "grad_norm": 6.6800031661987305, + "learning_rate": 8.265039034776437e-05, + "loss": 0.0033336080610752105, + "step": 122280 + }, + { + "epoch": 17.358410220014193, + "grad_norm": 0.12137117236852646, + "learning_rate": 8.264897090134848e-05, + "loss": 0.003195350244641304, + "step": 122290 + }, + { + "epoch": 17.35982966643009, + "grad_norm": 0.057339224964380264, + "learning_rate": 8.264755145493258e-05, + "loss": 0.011133617162704468, + "step": 122300 + }, + { + "epoch": 17.36124911284599, + "grad_norm": 0.023208029568195343, + "learning_rate": 8.264613200851669e-05, + "loss": 0.012216722965240479, + "step": 122310 + }, + { + "epoch": 17.362668559261888, + "grad_norm": 8.354445457458496, + "learning_rate": 8.264471256210079e-05, + "loss": 0.019915086030960084, + "step": 122320 + }, + { + "epoch": 17.364088005677786, + "grad_norm": 10.264193534851074, + "learning_rate": 8.264329311568488e-05, + "loss": 0.060139000415802, + "step": 122330 + }, + { + "epoch": 17.365507452093684, + "grad_norm": 1.1199427843093872, + "learning_rate": 8.2641873669269e-05, + "loss": 0.03597682416439056, + "step": 122340 + }, + { + "epoch": 17.366926898509583, + "grad_norm": 10.73701286315918, + 
"learning_rate": 8.264045422285309e-05, + "loss": 0.01692398190498352, + "step": 122350 + }, + { + "epoch": 17.368346344925477, + "grad_norm": 0.06042497605085373, + "learning_rate": 8.26390347764372e-05, + "loss": 0.007322944700717926, + "step": 122360 + }, + { + "epoch": 17.369765791341376, + "grad_norm": 1.2759391069412231, + "learning_rate": 8.26376153300213e-05, + "loss": 0.016948218643665313, + "step": 122370 + }, + { + "epoch": 17.371185237757274, + "grad_norm": 1.5327088832855225, + "learning_rate": 8.26361958836054e-05, + "loss": 0.040832871198654176, + "step": 122380 + }, + { + "epoch": 17.372604684173172, + "grad_norm": 2.6886208057403564, + "learning_rate": 8.26347764371895e-05, + "loss": 0.006062294170260429, + "step": 122390 + }, + { + "epoch": 17.37402413058907, + "grad_norm": 1.667949914932251, + "learning_rate": 8.26333569907736e-05, + "loss": 0.009034250676631928, + "step": 122400 + }, + { + "epoch": 17.37544357700497, + "grad_norm": 5.088666915893555, + "learning_rate": 8.26319375443577e-05, + "loss": 0.03124779760837555, + "step": 122410 + }, + { + "epoch": 17.376863023420867, + "grad_norm": 1.2055740356445312, + "learning_rate": 8.263051809794181e-05, + "loss": 0.05579102635383606, + "step": 122420 + }, + { + "epoch": 17.378282469836762, + "grad_norm": 11.72123908996582, + "learning_rate": 8.262909865152591e-05, + "loss": 0.05243555903434753, + "step": 122430 + }, + { + "epoch": 17.37970191625266, + "grad_norm": 0.23794768750667572, + "learning_rate": 8.262767920511001e-05, + "loss": 0.034197428822517396, + "step": 122440 + }, + { + "epoch": 17.38112136266856, + "grad_norm": 0.012232398614287376, + "learning_rate": 8.262625975869412e-05, + "loss": 0.008090271055698395, + "step": 122450 + }, + { + "epoch": 17.382540809084457, + "grad_norm": 0.5846762657165527, + "learning_rate": 8.262484031227822e-05, + "loss": 0.020568540692329405, + "step": 122460 + }, + { + "epoch": 17.383960255500355, + "grad_norm": 7.196073055267334, + "learning_rate": 
8.262342086586233e-05, + "loss": 0.052748876810073855, + "step": 122470 + }, + { + "epoch": 17.385379701916253, + "grad_norm": 6.977900981903076, + "learning_rate": 8.262200141944641e-05, + "loss": 0.07267424464225769, + "step": 122480 + }, + { + "epoch": 17.386799148332152, + "grad_norm": 1.2999193668365479, + "learning_rate": 8.262058197303052e-05, + "loss": 0.008215299993753433, + "step": 122490 + }, + { + "epoch": 17.388218594748047, + "grad_norm": 0.12169066816568375, + "learning_rate": 8.261916252661462e-05, + "loss": 0.03030954897403717, + "step": 122500 + }, + { + "epoch": 17.388218594748047, + "eval_accuracy": 0.9837858459973294, + "eval_loss": 0.059243522584438324, + "eval_runtime": 32.5355, + "eval_samples_per_second": 483.379, + "eval_steps_per_second": 15.122, + "step": 122500 + }, + { + "epoch": 17.389638041163945, + "grad_norm": 0.11882514506578445, + "learning_rate": 8.261774308019873e-05, + "loss": 0.011411736905574798, + "step": 122510 + }, + { + "epoch": 17.391057487579843, + "grad_norm": 0.280032217502594, + "learning_rate": 8.261632363378283e-05, + "loss": 0.0065232284367084505, + "step": 122520 + }, + { + "epoch": 17.39247693399574, + "grad_norm": 10.429749488830566, + "learning_rate": 8.261490418736693e-05, + "loss": 0.017095638811588286, + "step": 122530 + }, + { + "epoch": 17.39389638041164, + "grad_norm": 0.5666552186012268, + "learning_rate": 8.261348474095104e-05, + "loss": 0.01274164617061615, + "step": 122540 + }, + { + "epoch": 17.395315826827538, + "grad_norm": 0.42551353573799133, + "learning_rate": 8.261206529453513e-05, + "loss": 0.007419595122337341, + "step": 122550 + }, + { + "epoch": 17.396735273243436, + "grad_norm": 0.09856267273426056, + "learning_rate": 8.261064584811924e-05, + "loss": 0.0018107056617736816, + "step": 122560 + }, + { + "epoch": 17.39815471965933, + "grad_norm": 0.04244295507669449, + "learning_rate": 8.260922640170334e-05, + "loss": 0.04652504920959473, + "step": 122570 + }, + { + "epoch": 
17.39957416607523, + "grad_norm": 0.38564613461494446, + "learning_rate": 8.260780695528744e-05, + "loss": 0.026515766978263855, + "step": 122580 + }, + { + "epoch": 17.400993612491128, + "grad_norm": 4.476937770843506, + "learning_rate": 8.260638750887154e-05, + "loss": 0.0071770638227462765, + "step": 122590 + }, + { + "epoch": 17.402413058907026, + "grad_norm": 0.33714860677719116, + "learning_rate": 8.260496806245565e-05, + "loss": 0.02237725555896759, + "step": 122600 + }, + { + "epoch": 17.403832505322924, + "grad_norm": 4.282869338989258, + "learning_rate": 8.260354861603974e-05, + "loss": 0.04165985584259033, + "step": 122610 + }, + { + "epoch": 17.405251951738823, + "grad_norm": 6.363341808319092, + "learning_rate": 8.260212916962386e-05, + "loss": 0.049985453486442566, + "step": 122620 + }, + { + "epoch": 17.40667139815472, + "grad_norm": 0.8332139849662781, + "learning_rate": 8.260070972320795e-05, + "loss": 0.00713193416595459, + "step": 122630 + }, + { + "epoch": 17.408090844570616, + "grad_norm": 0.33843275904655457, + "learning_rate": 8.259929027679205e-05, + "loss": 0.03059439957141876, + "step": 122640 + }, + { + "epoch": 17.409510290986514, + "grad_norm": 14.195676803588867, + "learning_rate": 8.259787083037616e-05, + "loss": 0.018163122236728668, + "step": 122650 + }, + { + "epoch": 17.410929737402412, + "grad_norm": 9.23517894744873, + "learning_rate": 8.259645138396026e-05, + "loss": 0.026523211598396303, + "step": 122660 + }, + { + "epoch": 17.41234918381831, + "grad_norm": 0.05418802425265312, + "learning_rate": 8.259503193754437e-05, + "loss": 0.02210947871208191, + "step": 122670 + }, + { + "epoch": 17.41376863023421, + "grad_norm": 1.8904386758804321, + "learning_rate": 8.259361249112847e-05, + "loss": 0.004061230272054672, + "step": 122680 + }, + { + "epoch": 17.415188076650107, + "grad_norm": 0.045158129185438156, + "learning_rate": 8.259219304471256e-05, + "loss": 0.004564892128109932, + "step": 122690 + }, + { + "epoch": 
17.416607523066006, + "grad_norm": 3.936619997024536, + "learning_rate": 8.259077359829666e-05, + "loss": 0.020028875768184663, + "step": 122700 + }, + { + "epoch": 17.4180269694819, + "grad_norm": 12.677026748657227, + "learning_rate": 8.258935415188077e-05, + "loss": 0.022690902650356292, + "step": 122710 + }, + { + "epoch": 17.4194464158978, + "grad_norm": 1.4186701774597168, + "learning_rate": 8.258793470546487e-05, + "loss": 0.0031532850116491318, + "step": 122720 + }, + { + "epoch": 17.420865862313697, + "grad_norm": 1.636500358581543, + "learning_rate": 8.258651525904898e-05, + "loss": 0.005997669324278832, + "step": 122730 + }, + { + "epoch": 17.422285308729595, + "grad_norm": 7.789987087249756, + "learning_rate": 8.258509581263308e-05, + "loss": 0.018677373230457307, + "step": 122740 + }, + { + "epoch": 17.423704755145494, + "grad_norm": 0.18241193890571594, + "learning_rate": 8.258367636621718e-05, + "loss": 0.0023742862045764922, + "step": 122750 + }, + { + "epoch": 17.425124201561392, + "grad_norm": 7.324385643005371, + "learning_rate": 8.258225691980129e-05, + "loss": 0.013009896874427796, + "step": 122760 + }, + { + "epoch": 17.42654364797729, + "grad_norm": 0.04243845120072365, + "learning_rate": 8.258083747338538e-05, + "loss": 0.011265130341053009, + "step": 122770 + }, + { + "epoch": 17.427963094393185, + "grad_norm": 6.735060214996338, + "learning_rate": 8.25794180269695e-05, + "loss": 0.049876469373703006, + "step": 122780 + }, + { + "epoch": 17.429382540809083, + "grad_norm": 7.400155544281006, + "learning_rate": 8.257799858055358e-05, + "loss": 0.0073287680745124815, + "step": 122790 + }, + { + "epoch": 17.43080198722498, + "grad_norm": 0.12465141713619232, + "learning_rate": 8.257657913413769e-05, + "loss": 0.027358931303024293, + "step": 122800 + }, + { + "epoch": 17.43222143364088, + "grad_norm": 0.1785537749528885, + "learning_rate": 8.257515968772179e-05, + "loss": 0.032534864544868466, + "step": 122810 + }, + { + "epoch": 
17.433640880056778, + "grad_norm": 11.284260749816895, + "learning_rate": 8.25737402413059e-05, + "loss": 0.064073246717453, + "step": 122820 + }, + { + "epoch": 17.435060326472676, + "grad_norm": 8.10233211517334, + "learning_rate": 8.257232079489e-05, + "loss": 0.016384488344192503, + "step": 122830 + }, + { + "epoch": 17.436479772888575, + "grad_norm": 0.02594900317490101, + "learning_rate": 8.257090134847409e-05, + "loss": 0.06889531016349792, + "step": 122840 + }, + { + "epoch": 17.43789921930447, + "grad_norm": 7.683279514312744, + "learning_rate": 8.25694819020582e-05, + "loss": 0.007571496069431305, + "step": 122850 + }, + { + "epoch": 17.439318665720368, + "grad_norm": 0.5677283406257629, + "learning_rate": 8.25680624556423e-05, + "loss": 0.024730677902698516, + "step": 122860 + }, + { + "epoch": 17.440738112136266, + "grad_norm": 0.08760765194892883, + "learning_rate": 8.256664300922641e-05, + "loss": 0.015320856869220734, + "step": 122870 + }, + { + "epoch": 17.442157558552164, + "grad_norm": 19.979145050048828, + "learning_rate": 8.256522356281051e-05, + "loss": 0.058557575941085814, + "step": 122880 + }, + { + "epoch": 17.443577004968063, + "grad_norm": 0.24306319653987885, + "learning_rate": 8.256380411639461e-05, + "loss": 0.0089756540954113, + "step": 122890 + }, + { + "epoch": 17.44499645138396, + "grad_norm": 0.09230535477399826, + "learning_rate": 8.25623846699787e-05, + "loss": 0.036602020263671875, + "step": 122900 + }, + { + "epoch": 17.44641589779986, + "grad_norm": 10.769634246826172, + "learning_rate": 8.256096522356282e-05, + "loss": 0.03061630725860596, + "step": 122910 + }, + { + "epoch": 17.447835344215754, + "grad_norm": 1.5239731073379517, + "learning_rate": 8.255954577714691e-05, + "loss": 0.00846146047115326, + "step": 122920 + }, + { + "epoch": 17.449254790631652, + "grad_norm": 0.0684681311249733, + "learning_rate": 8.255812633073102e-05, + "loss": 0.005754857137799263, + "step": 122930 + }, + { + "epoch": 17.45067423704755, + 
"grad_norm": 1.251187801361084, + "learning_rate": 8.255670688431512e-05, + "loss": 0.011119232326745988, + "step": 122940 + }, + { + "epoch": 17.45209368346345, + "grad_norm": 7.6524224281311035, + "learning_rate": 8.255528743789922e-05, + "loss": 0.012783028185367584, + "step": 122950 + }, + { + "epoch": 17.453513129879347, + "grad_norm": 1.5091584920883179, + "learning_rate": 8.255386799148333e-05, + "loss": 0.005531028658151626, + "step": 122960 + }, + { + "epoch": 17.454932576295246, + "grad_norm": 10.661508560180664, + "learning_rate": 8.255244854506743e-05, + "loss": 0.029649490118026735, + "step": 122970 + }, + { + "epoch": 17.456352022711144, + "grad_norm": 0.053090885281562805, + "learning_rate": 8.255102909865154e-05, + "loss": 0.021497386693954467, + "step": 122980 + }, + { + "epoch": 17.45777146912704, + "grad_norm": 0.14183126389980316, + "learning_rate": 8.254960965223562e-05, + "loss": 0.016825027763843536, + "step": 122990 + }, + { + "epoch": 17.459190915542937, + "grad_norm": 0.06370903551578522, + "learning_rate": 8.254819020581973e-05, + "loss": 0.02068781554698944, + "step": 123000 + }, + { + "epoch": 17.459190915542937, + "eval_accuracy": 0.9883639600686717, + "eval_loss": 0.04308421537280083, + "eval_runtime": 32.7742, + "eval_samples_per_second": 479.859, + "eval_steps_per_second": 15.012, + "step": 123000 + }, + { + "epoch": 17.460610361958835, + "grad_norm": 0.2073710411787033, + "learning_rate": 8.254677075940383e-05, + "loss": 0.01421993225812912, + "step": 123010 + }, + { + "epoch": 17.462029808374734, + "grad_norm": 0.29374533891677856, + "learning_rate": 8.254535131298794e-05, + "loss": 0.009316784143447877, + "step": 123020 + }, + { + "epoch": 17.463449254790632, + "grad_norm": 1.4187288284301758, + "learning_rate": 8.254393186657204e-05, + "loss": 0.026600950956344606, + "step": 123030 + }, + { + "epoch": 17.46486870120653, + "grad_norm": 1.2814127206802368, + "learning_rate": 8.254251242015615e-05, + "loss": 0.033495780825614926, + 
"step": 123040 + }, + { + "epoch": 17.46628814762243, + "grad_norm": 0.09562135487794876, + "learning_rate": 8.254109297374025e-05, + "loss": 0.029808908700942993, + "step": 123050 + }, + { + "epoch": 17.467707594038323, + "grad_norm": 14.986631393432617, + "learning_rate": 8.253967352732434e-05, + "loss": 0.02389901578426361, + "step": 123060 + }, + { + "epoch": 17.46912704045422, + "grad_norm": 0.3095096945762634, + "learning_rate": 8.253825408090845e-05, + "loss": 0.01468038409948349, + "step": 123070 + }, + { + "epoch": 17.47054648687012, + "grad_norm": 0.12282241135835648, + "learning_rate": 8.253683463449255e-05, + "loss": 0.011646793782711029, + "step": 123080 + }, + { + "epoch": 17.471965933286018, + "grad_norm": 0.03262794017791748, + "learning_rate": 8.253541518807666e-05, + "loss": 0.002748313918709755, + "step": 123090 + }, + { + "epoch": 17.473385379701917, + "grad_norm": 0.7681041359901428, + "learning_rate": 8.253399574166075e-05, + "loss": 0.03596278131008148, + "step": 123100 + }, + { + "epoch": 17.474804826117815, + "grad_norm": 1.0353686809539795, + "learning_rate": 8.253257629524486e-05, + "loss": 0.004134359210729599, + "step": 123110 + }, + { + "epoch": 17.476224272533713, + "grad_norm": 0.7349243760108948, + "learning_rate": 8.253115684882896e-05, + "loss": 0.006130160763859749, + "step": 123120 + }, + { + "epoch": 17.477643718949608, + "grad_norm": 0.018726080656051636, + "learning_rate": 8.252973740241307e-05, + "loss": 0.004772935435175896, + "step": 123130 + }, + { + "epoch": 17.479063165365506, + "grad_norm": 0.2598189413547516, + "learning_rate": 8.252831795599716e-05, + "loss": 0.011500736325979232, + "step": 123140 + }, + { + "epoch": 17.480482611781405, + "grad_norm": 0.9024463295936584, + "learning_rate": 8.252689850958126e-05, + "loss": 0.020366585254669188, + "step": 123150 + }, + { + "epoch": 17.481902058197303, + "grad_norm": 11.113703727722168, + "learning_rate": 8.252547906316537e-05, + "loss": 0.02417703568935394, + "step": 
123160 + }, + { + "epoch": 17.4833215046132, + "grad_norm": 0.005567350424826145, + "learning_rate": 8.252405961674947e-05, + "loss": 0.01901007890701294, + "step": 123170 + }, + { + "epoch": 17.4847409510291, + "grad_norm": 0.3311940133571625, + "learning_rate": 8.252264017033358e-05, + "loss": 0.012172038853168487, + "step": 123180 + }, + { + "epoch": 17.486160397444998, + "grad_norm": 0.018239812925457954, + "learning_rate": 8.252122072391768e-05, + "loss": 0.0036922723054885866, + "step": 123190 + }, + { + "epoch": 17.487579843860892, + "grad_norm": 0.007090285886079073, + "learning_rate": 8.251980127750177e-05, + "loss": 0.0167811781167984, + "step": 123200 + }, + { + "epoch": 17.48899929027679, + "grad_norm": 4.8959269523620605, + "learning_rate": 8.251838183108587e-05, + "loss": 0.06325067281723022, + "step": 123210 + }, + { + "epoch": 17.49041873669269, + "grad_norm": 0.44174933433532715, + "learning_rate": 8.251696238466998e-05, + "loss": 0.008886340260505676, + "step": 123220 + }, + { + "epoch": 17.491838183108587, + "grad_norm": 12.290687561035156, + "learning_rate": 8.251554293825408e-05, + "loss": 0.031110918521881102, + "step": 123230 + }, + { + "epoch": 17.493257629524486, + "grad_norm": 2.3835344314575195, + "learning_rate": 8.251412349183819e-05, + "loss": 0.05299111008644104, + "step": 123240 + }, + { + "epoch": 17.494677075940384, + "grad_norm": 0.015629790723323822, + "learning_rate": 8.251270404542229e-05, + "loss": 0.04958618879318237, + "step": 123250 + }, + { + "epoch": 17.496096522356282, + "grad_norm": 0.18703432381153107, + "learning_rate": 8.251128459900639e-05, + "loss": 0.013501530885696411, + "step": 123260 + }, + { + "epoch": 17.497515968772177, + "grad_norm": 7.4073991775512695, + "learning_rate": 8.25098651525905e-05, + "loss": 0.042862167954444884, + "step": 123270 + }, + { + "epoch": 17.498935415188075, + "grad_norm": 3.2967214584350586, + "learning_rate": 8.25084457061746e-05, + "loss": 0.004912526533007622, + "step": 123280 + 
}, + { + "epoch": 17.500354861603974, + "grad_norm": 2.1429383754730225, + "learning_rate": 8.25070262597587e-05, + "loss": 0.05954912304878235, + "step": 123290 + }, + { + "epoch": 17.501774308019872, + "grad_norm": 0.5956802368164062, + "learning_rate": 8.250560681334279e-05, + "loss": 0.031099620461463928, + "step": 123300 + }, + { + "epoch": 17.50319375443577, + "grad_norm": 0.24786630272865295, + "learning_rate": 8.25041873669269e-05, + "loss": 0.018831300735473632, + "step": 123310 + }, + { + "epoch": 17.50461320085167, + "grad_norm": 0.10456979274749756, + "learning_rate": 8.2502767920511e-05, + "loss": 0.011707174777984618, + "step": 123320 + }, + { + "epoch": 17.506032647267567, + "grad_norm": 0.050170306116342545, + "learning_rate": 8.250134847409511e-05, + "loss": 0.039228835701942445, + "step": 123330 + }, + { + "epoch": 17.50745209368346, + "grad_norm": 2.9133970737457275, + "learning_rate": 8.24999290276792e-05, + "loss": 0.003978077322244644, + "step": 123340 + }, + { + "epoch": 17.50887154009936, + "grad_norm": 2.6007888317108154, + "learning_rate": 8.24985095812633e-05, + "loss": 0.05585094690322876, + "step": 123350 + }, + { + "epoch": 17.51029098651526, + "grad_norm": 0.1047113761305809, + "learning_rate": 8.249709013484741e-05, + "loss": 0.012924128770828247, + "step": 123360 + }, + { + "epoch": 17.511710432931157, + "grad_norm": 0.23884299397468567, + "learning_rate": 8.249567068843151e-05, + "loss": 0.022875207662582397, + "step": 123370 + }, + { + "epoch": 17.513129879347055, + "grad_norm": 0.16158747673034668, + "learning_rate": 8.249425124201562e-05, + "loss": 0.02361272871494293, + "step": 123380 + }, + { + "epoch": 17.514549325762953, + "grad_norm": 3.2364308834075928, + "learning_rate": 8.249283179559972e-05, + "loss": 0.03016907870769501, + "step": 123390 + }, + { + "epoch": 17.51596877217885, + "grad_norm": 0.15410637855529785, + "learning_rate": 8.249141234918383e-05, + "loss": 0.015434008836746217, + "step": 123400 + }, + { + 
"epoch": 17.517388218594746, + "grad_norm": 0.05540246143937111, + "learning_rate": 8.248999290276791e-05, + "loss": 0.009972456097602844, + "step": 123410 + }, + { + "epoch": 17.518807665010645, + "grad_norm": 6.280983924865723, + "learning_rate": 8.248857345635203e-05, + "loss": 0.016720139980316163, + "step": 123420 + }, + { + "epoch": 17.520227111426543, + "grad_norm": 0.0030701293144375086, + "learning_rate": 8.248715400993612e-05, + "loss": 0.04440495073795318, + "step": 123430 + }, + { + "epoch": 17.52164655784244, + "grad_norm": 1.1619906425476074, + "learning_rate": 8.248573456352023e-05, + "loss": 0.0237629234790802, + "step": 123440 + }, + { + "epoch": 17.52306600425834, + "grad_norm": 0.05782632157206535, + "learning_rate": 8.248431511710434e-05, + "loss": 0.025528308749198914, + "step": 123450 + }, + { + "epoch": 17.524485450674238, + "grad_norm": 4.620859622955322, + "learning_rate": 8.248289567068843e-05, + "loss": 0.022039780020713808, + "step": 123460 + }, + { + "epoch": 17.525904897090136, + "grad_norm": 0.013535212725400925, + "learning_rate": 8.248147622427254e-05, + "loss": 0.013678130507469178, + "step": 123470 + }, + { + "epoch": 17.52732434350603, + "grad_norm": 1.571839690208435, + "learning_rate": 8.248005677785664e-05, + "loss": 0.015347103774547576, + "step": 123480 + }, + { + "epoch": 17.52874378992193, + "grad_norm": 12.499358177185059, + "learning_rate": 8.247863733144075e-05, + "loss": 0.09705874919891358, + "step": 123490 + }, + { + "epoch": 17.530163236337827, + "grad_norm": 0.08306962251663208, + "learning_rate": 8.247721788502485e-05, + "loss": 0.007053288817405701, + "step": 123500 + }, + { + "epoch": 17.530163236337827, + "eval_accuracy": 0.9822598079735487, + "eval_loss": 0.07242994010448456, + "eval_runtime": 32.3528, + "eval_samples_per_second": 486.109, + "eval_steps_per_second": 15.207, + "step": 123500 + }, + { + "epoch": 17.531582682753726, + "grad_norm": 0.28261151909828186, + "learning_rate": 8.247579843860894e-05, + 
"loss": 0.020450976490974427, + "step": 123510 + }, + { + "epoch": 17.533002129169624, + "grad_norm": 7.134189128875732, + "learning_rate": 8.247437899219304e-05, + "loss": 0.06149494647979736, + "step": 123520 + }, + { + "epoch": 17.534421575585522, + "grad_norm": 5.049461841583252, + "learning_rate": 8.247295954577715e-05, + "loss": 0.02939991354942322, + "step": 123530 + }, + { + "epoch": 17.53584102200142, + "grad_norm": 1.4308009147644043, + "learning_rate": 8.247154009936126e-05, + "loss": 0.0470628172159195, + "step": 123540 + }, + { + "epoch": 17.537260468417315, + "grad_norm": 0.2950064539909363, + "learning_rate": 8.247012065294536e-05, + "loss": 0.031284031271934507, + "step": 123550 + }, + { + "epoch": 17.538679914833214, + "grad_norm": 6.529641628265381, + "learning_rate": 8.246870120652946e-05, + "loss": 0.025937312841415407, + "step": 123560 + }, + { + "epoch": 17.540099361249112, + "grad_norm": 0.01473158597946167, + "learning_rate": 8.246728176011355e-05, + "loss": 0.06190433502197266, + "step": 123570 + }, + { + "epoch": 17.54151880766501, + "grad_norm": 0.28922995924949646, + "learning_rate": 8.246586231369766e-05, + "loss": 0.008700785040855408, + "step": 123580 + }, + { + "epoch": 17.54293825408091, + "grad_norm": 12.637017250061035, + "learning_rate": 8.246444286728176e-05, + "loss": 0.03645790219306946, + "step": 123590 + }, + { + "epoch": 17.544357700496807, + "grad_norm": 0.027270788326859474, + "learning_rate": 8.246302342086587e-05, + "loss": 0.015550993382930756, + "step": 123600 + }, + { + "epoch": 17.545777146912705, + "grad_norm": 8.379532814025879, + "learning_rate": 8.246160397444996e-05, + "loss": 0.021490223705768585, + "step": 123610 + }, + { + "epoch": 17.5471965933286, + "grad_norm": 7.282254219055176, + "learning_rate": 8.246018452803407e-05, + "loss": 0.01094207763671875, + "step": 123620 + }, + { + "epoch": 17.5486160397445, + "grad_norm": 0.03394869714975357, + "learning_rate": 8.245876508161818e-05, + "loss": 
0.004773364588618279, + "step": 123630 + }, + { + "epoch": 17.550035486160397, + "grad_norm": 0.06286856532096863, + "learning_rate": 8.245734563520228e-05, + "loss": 0.0032698489725589753, + "step": 123640 + }, + { + "epoch": 17.551454932576295, + "grad_norm": 1.2578529119491577, + "learning_rate": 8.245592618878639e-05, + "loss": 0.02867700159549713, + "step": 123650 + }, + { + "epoch": 17.552874378992193, + "grad_norm": 0.13294143974781036, + "learning_rate": 8.245450674237047e-05, + "loss": 0.03565240502357483, + "step": 123660 + }, + { + "epoch": 17.55429382540809, + "grad_norm": 0.19572824239730835, + "learning_rate": 8.245308729595458e-05, + "loss": 0.017168080806732176, + "step": 123670 + }, + { + "epoch": 17.55571327182399, + "grad_norm": 0.611763060092926, + "learning_rate": 8.245166784953868e-05, + "loss": 0.01786961555480957, + "step": 123680 + }, + { + "epoch": 17.557132718239885, + "grad_norm": 0.8012092113494873, + "learning_rate": 8.245024840312279e-05, + "loss": 0.011101428419351578, + "step": 123690 + }, + { + "epoch": 17.558552164655783, + "grad_norm": 0.12960731983184814, + "learning_rate": 8.244882895670689e-05, + "loss": 0.040901082754135135, + "step": 123700 + }, + { + "epoch": 17.55997161107168, + "grad_norm": 1.262016773223877, + "learning_rate": 8.244740951029098e-05, + "loss": 0.028654444217681884, + "step": 123710 + }, + { + "epoch": 17.56139105748758, + "grad_norm": 0.09878029674291611, + "learning_rate": 8.24459900638751e-05, + "loss": 0.005360887199640274, + "step": 123720 + }, + { + "epoch": 17.562810503903478, + "grad_norm": 0.01878870464861393, + "learning_rate": 8.244457061745919e-05, + "loss": 0.06400970816612243, + "step": 123730 + }, + { + "epoch": 17.564229950319376, + "grad_norm": 8.757014274597168, + "learning_rate": 8.24431511710433e-05, + "loss": 0.026718950271606444, + "step": 123740 + }, + { + "epoch": 17.565649396735274, + "grad_norm": 1.1390107870101929, + "learning_rate": 8.24417317246274e-05, + "loss": 
0.013444627821445464, + "step": 123750 + }, + { + "epoch": 17.56706884315117, + "grad_norm": 0.5454521775245667, + "learning_rate": 8.244031227821151e-05, + "loss": 0.023906174302101135, + "step": 123760 + }, + { + "epoch": 17.568488289567068, + "grad_norm": 0.45480877161026, + "learning_rate": 8.24388928317956e-05, + "loss": 0.05484031438827515, + "step": 123770 + }, + { + "epoch": 17.569907735982966, + "grad_norm": 4.2534589767456055, + "learning_rate": 8.243747338537971e-05, + "loss": 0.020327845215797426, + "step": 123780 + }, + { + "epoch": 17.571327182398864, + "grad_norm": 2.1090826988220215, + "learning_rate": 8.24360539389638e-05, + "loss": 0.01915072351694107, + "step": 123790 + }, + { + "epoch": 17.572746628814762, + "grad_norm": 0.3815958499908447, + "learning_rate": 8.243463449254792e-05, + "loss": 0.012741312384605408, + "step": 123800 + }, + { + "epoch": 17.57416607523066, + "grad_norm": 3.8844399452209473, + "learning_rate": 8.243321504613201e-05, + "loss": 0.014030416309833527, + "step": 123810 + }, + { + "epoch": 17.57558552164656, + "grad_norm": 3.2737066745758057, + "learning_rate": 8.243179559971611e-05, + "loss": 0.022963154315948486, + "step": 123820 + }, + { + "epoch": 17.577004968062454, + "grad_norm": 9.672563552856445, + "learning_rate": 8.243037615330022e-05, + "loss": 0.02373957335948944, + "step": 123830 + }, + { + "epoch": 17.578424414478352, + "grad_norm": 0.26230794191360474, + "learning_rate": 8.242895670688432e-05, + "loss": 0.020463331043720244, + "step": 123840 + }, + { + "epoch": 17.57984386089425, + "grad_norm": 1.9679028987884521, + "learning_rate": 8.242753726046843e-05, + "loss": 0.007435081154108047, + "step": 123850 + }, + { + "epoch": 17.58126330731015, + "grad_norm": 0.18290266394615173, + "learning_rate": 8.242611781405253e-05, + "loss": 0.028893864154815672, + "step": 123860 + }, + { + "epoch": 17.582682753726047, + "grad_norm": 5.281581401824951, + "learning_rate": 8.242469836763662e-05, + "loss": 
0.025062206387519836, + "step": 123870 + }, + { + "epoch": 17.584102200141945, + "grad_norm": 0.441976934671402, + "learning_rate": 8.242327892122072e-05, + "loss": 0.009923070669174194, + "step": 123880 + }, + { + "epoch": 17.585521646557844, + "grad_norm": 0.3485376536846161, + "learning_rate": 8.242185947480483e-05, + "loss": 0.023020398616790772, + "step": 123890 + }, + { + "epoch": 17.58694109297374, + "grad_norm": 0.02939898520708084, + "learning_rate": 8.242044002838893e-05, + "loss": 0.015514726936817168, + "step": 123900 + }, + { + "epoch": 17.588360539389637, + "grad_norm": 0.2501057982444763, + "learning_rate": 8.241902058197304e-05, + "loss": 0.021653544902801514, + "step": 123910 + }, + { + "epoch": 17.589779985805535, + "grad_norm": 0.25378942489624023, + "learning_rate": 8.241760113555714e-05, + "loss": 0.029783162474632262, + "step": 123920 + }, + { + "epoch": 17.591199432221433, + "grad_norm": 0.5727351903915405, + "learning_rate": 8.241618168914124e-05, + "loss": 0.01356571912765503, + "step": 123930 + }, + { + "epoch": 17.59261887863733, + "grad_norm": 0.010458029806613922, + "learning_rate": 8.241476224272535e-05, + "loss": 0.039194774627685544, + "step": 123940 + }, + { + "epoch": 17.59403832505323, + "grad_norm": 0.6755303144454956, + "learning_rate": 8.241334279630944e-05, + "loss": 0.013087576627731324, + "step": 123950 + }, + { + "epoch": 17.59545777146913, + "grad_norm": 0.9623459577560425, + "learning_rate": 8.241192334989355e-05, + "loss": 0.0032142918556928636, + "step": 123960 + }, + { + "epoch": 17.596877217885023, + "grad_norm": 1.4890637397766113, + "learning_rate": 8.241050390347764e-05, + "loss": 0.016361470520496368, + "step": 123970 + }, + { + "epoch": 17.59829666430092, + "grad_norm": 1.138136625289917, + "learning_rate": 8.240908445706175e-05, + "loss": 0.01091969683766365, + "step": 123980 + }, + { + "epoch": 17.59971611071682, + "grad_norm": 0.7977745532989502, + "learning_rate": 8.240766501064585e-05, + "loss": 
0.02227955460548401, + "step": 123990 + }, + { + "epoch": 17.601135557132718, + "grad_norm": 11.51099681854248, + "learning_rate": 8.240624556422996e-05, + "loss": 0.06129167079925537, + "step": 124000 + }, + { + "epoch": 17.601135557132718, + "eval_accuracy": 0.9866471672919184, + "eval_loss": 0.04729590564966202, + "eval_runtime": 33.4178, + "eval_samples_per_second": 470.617, + "eval_steps_per_second": 14.723, + "step": 124000 + }, + { + "epoch": 17.602555003548616, + "grad_norm": 1.4898444414138794, + "learning_rate": 8.240482611781406e-05, + "loss": 0.024441052973270417, + "step": 124010 + }, + { + "epoch": 17.603974449964515, + "grad_norm": 0.052741412073373795, + "learning_rate": 8.240340667139815e-05, + "loss": 0.04893164336681366, + "step": 124020 + }, + { + "epoch": 17.605393896380413, + "grad_norm": 0.6417841911315918, + "learning_rate": 8.240198722498226e-05, + "loss": 0.011471222341060638, + "step": 124030 + }, + { + "epoch": 17.606813342796308, + "grad_norm": 0.02762775495648384, + "learning_rate": 8.240056777856636e-05, + "loss": 0.028142285346984864, + "step": 124040 + }, + { + "epoch": 17.608232789212206, + "grad_norm": 1.8497322797775269, + "learning_rate": 8.239914833215047e-05, + "loss": 0.03893795311450958, + "step": 124050 + }, + { + "epoch": 17.609652235628104, + "grad_norm": 0.47043776512145996, + "learning_rate": 8.239772888573457e-05, + "loss": 0.014402249455451965, + "step": 124060 + }, + { + "epoch": 17.611071682044003, + "grad_norm": 1.0286686420440674, + "learning_rate": 8.239630943931868e-05, + "loss": 0.011852450668811798, + "step": 124070 + }, + { + "epoch": 17.6124911284599, + "grad_norm": 0.024061763659119606, + "learning_rate": 8.239488999290276e-05, + "loss": 0.014504016935825348, + "step": 124080 + }, + { + "epoch": 17.6139105748758, + "grad_norm": 1.218518614768982, + "learning_rate": 8.239347054648687e-05, + "loss": 0.023448963463306428, + "step": 124090 + }, + { + "epoch": 17.615330021291697, + "grad_norm": 
5.819756507873535, + "learning_rate": 8.239205110007097e-05, + "loss": 0.017736424505710603, + "step": 124100 + }, + { + "epoch": 17.616749467707596, + "grad_norm": 0.11186391115188599, + "learning_rate": 8.239063165365508e-05, + "loss": 0.012619951367378235, + "step": 124110 + }, + { + "epoch": 17.61816891412349, + "grad_norm": 12.263643264770508, + "learning_rate": 8.238921220723918e-05, + "loss": 0.04501128494739533, + "step": 124120 + }, + { + "epoch": 17.61958836053939, + "grad_norm": 0.25323036313056946, + "learning_rate": 8.238779276082328e-05, + "loss": 0.027339151501655577, + "step": 124130 + }, + { + "epoch": 17.621007806955287, + "grad_norm": 1.4857368469238281, + "learning_rate": 8.238637331440739e-05, + "loss": 0.01260230541229248, + "step": 124140 + }, + { + "epoch": 17.622427253371185, + "grad_norm": 1.9226738214492798, + "learning_rate": 8.238495386799149e-05, + "loss": 0.005465056374669075, + "step": 124150 + }, + { + "epoch": 17.623846699787084, + "grad_norm": 0.01063599530607462, + "learning_rate": 8.23835344215756e-05, + "loss": 0.02559836208820343, + "step": 124160 + }, + { + "epoch": 17.625266146202982, + "grad_norm": 0.022318635135889053, + "learning_rate": 8.23821149751597e-05, + "loss": 0.0031830746680498122, + "step": 124170 + }, + { + "epoch": 17.62668559261888, + "grad_norm": 7.479580879211426, + "learning_rate": 8.238069552874379e-05, + "loss": 0.016616930067539216, + "step": 124180 + }, + { + "epoch": 17.628105039034775, + "grad_norm": 6.861004829406738, + "learning_rate": 8.237927608232789e-05, + "loss": 0.02781272232532501, + "step": 124190 + }, + { + "epoch": 17.629524485450673, + "grad_norm": 0.3056953549385071, + "learning_rate": 8.2377856635912e-05, + "loss": 0.011820276081562043, + "step": 124200 + }, + { + "epoch": 17.63094393186657, + "grad_norm": 0.2387654185295105, + "learning_rate": 8.23764371894961e-05, + "loss": 0.03424981236457825, + "step": 124210 + }, + { + "epoch": 17.63236337828247, + "grad_norm": 13.709823608398438, 
+ "learning_rate": 8.237501774308021e-05, + "loss": 0.03825869858264923, + "step": 124220 + }, + { + "epoch": 17.63378282469837, + "grad_norm": 0.09609783440828323, + "learning_rate": 8.23735982966643e-05, + "loss": 0.034221959114074704, + "step": 124230 + }, + { + "epoch": 17.635202271114267, + "grad_norm": 0.06290843337774277, + "learning_rate": 8.23721788502484e-05, + "loss": 0.019939064979553223, + "step": 124240 + }, + { + "epoch": 17.636621717530165, + "grad_norm": 3.8382863998413086, + "learning_rate": 8.237075940383251e-05, + "loss": 0.021433869004249574, + "step": 124250 + }, + { + "epoch": 17.63804116394606, + "grad_norm": 1.720458984375, + "learning_rate": 8.236933995741661e-05, + "loss": 0.0241766482591629, + "step": 124260 + }, + { + "epoch": 17.639460610361958, + "grad_norm": 8.562639236450195, + "learning_rate": 8.236792051100072e-05, + "loss": 0.02787082493305206, + "step": 124270 + }, + { + "epoch": 17.640880056777856, + "grad_norm": 0.12311892211437225, + "learning_rate": 8.236664300922641e-05, + "loss": 0.0271776020526886, + "step": 124280 + }, + { + "epoch": 17.642299503193755, + "grad_norm": 0.6175635457038879, + "learning_rate": 8.236522356281052e-05, + "loss": 0.03380132913589477, + "step": 124290 + }, + { + "epoch": 17.643718949609653, + "grad_norm": 0.050154250115156174, + "learning_rate": 8.23638041163946e-05, + "loss": 0.023596420884132385, + "step": 124300 + }, + { + "epoch": 17.64513839602555, + "grad_norm": 0.5078491568565369, + "learning_rate": 8.236238466997871e-05, + "loss": 0.012544466555118561, + "step": 124310 + }, + { + "epoch": 17.64655784244145, + "grad_norm": 8.803682327270508, + "learning_rate": 8.236096522356281e-05, + "loss": 0.012344937026500701, + "step": 124320 + }, + { + "epoch": 17.647977288857344, + "grad_norm": 9.935019493103027, + "learning_rate": 8.235954577714692e-05, + "loss": 0.01960514485836029, + "step": 124330 + }, + { + "epoch": 17.649396735273243, + "grad_norm": 9.821824073791504, + "learning_rate": 
8.235812633073102e-05, + "loss": 0.015455111861228943, + "step": 124340 + }, + { + "epoch": 17.65081618168914, + "grad_norm": 0.24194194376468658, + "learning_rate": 8.235684882895672e-05, + "loss": 0.019938093423843384, + "step": 124350 + }, + { + "epoch": 17.65223562810504, + "grad_norm": 7.058005332946777, + "learning_rate": 8.235542938254081e-05, + "loss": 0.018306195735931396, + "step": 124360 + }, + { + "epoch": 17.653655074520938, + "grad_norm": 3.6557610034942627, + "learning_rate": 8.235400993612493e-05, + "loss": 0.008752855658531188, + "step": 124370 + }, + { + "epoch": 17.655074520936836, + "grad_norm": 0.06243341043591499, + "learning_rate": 8.235259048970901e-05, + "loss": 0.012735649943351746, + "step": 124380 + }, + { + "epoch": 17.656493967352734, + "grad_norm": 0.3967527151107788, + "learning_rate": 8.235117104329312e-05, + "loss": 0.03084678053855896, + "step": 124390 + }, + { + "epoch": 17.65791341376863, + "grad_norm": 1.7461168766021729, + "learning_rate": 8.234975159687722e-05, + "loss": 0.013518714904785156, + "step": 124400 + }, + { + "epoch": 17.659332860184527, + "grad_norm": 8.566067695617676, + "learning_rate": 8.234833215046133e-05, + "loss": 0.044089671969413755, + "step": 124410 + }, + { + "epoch": 17.660752306600425, + "grad_norm": 14.566707611083984, + "learning_rate": 8.234691270404543e-05, + "loss": 0.010162675380706787, + "step": 124420 + }, + { + "epoch": 17.662171753016324, + "grad_norm": 1.658950924873352, + "learning_rate": 8.234549325762952e-05, + "loss": 0.005507271736860275, + "step": 124430 + }, + { + "epoch": 17.663591199432222, + "grad_norm": 7.552545070648193, + "learning_rate": 8.234407381121363e-05, + "loss": 0.01709498018026352, + "step": 124440 + }, + { + "epoch": 17.66501064584812, + "grad_norm": 10.414005279541016, + "learning_rate": 8.234265436479773e-05, + "loss": 0.0319903165102005, + "step": 124450 + }, + { + "epoch": 17.66643009226402, + "grad_norm": 0.043101776391267776, + "learning_rate": 
8.234123491838184e-05, + "loss": 0.03780293762683869, + "step": 124460 + }, + { + "epoch": 17.667849538679913, + "grad_norm": 0.3531849682331085, + "learning_rate": 8.233981547196594e-05, + "loss": 0.011514118313789368, + "step": 124470 + }, + { + "epoch": 17.669268985095812, + "grad_norm": 6.060878276824951, + "learning_rate": 8.233839602555004e-05, + "loss": 0.03236663341522217, + "step": 124480 + }, + { + "epoch": 17.67068843151171, + "grad_norm": 0.04426760599017143, + "learning_rate": 8.233697657913413e-05, + "loss": 0.0357169508934021, + "step": 124490 + }, + { + "epoch": 17.67210787792761, + "grad_norm": 4.216787815093994, + "learning_rate": 8.233555713271825e-05, + "loss": 0.027725604176521302, + "step": 124500 + }, + { + "epoch": 17.67210787792761, + "eval_accuracy": 0.9877281108920964, + "eval_loss": 0.04588211327791214, + "eval_runtime": 33.295, + "eval_samples_per_second": 472.353, + "eval_steps_per_second": 14.777, + "step": 124500 + }, + { + "epoch": 17.673527324343507, + "grad_norm": 7.310577869415283, + "learning_rate": 8.233413768630234e-05, + "loss": 0.08030985593795777, + "step": 124510 + }, + { + "epoch": 17.674946770759405, + "grad_norm": 1.1622400283813477, + "learning_rate": 8.233271823988645e-05, + "loss": 0.0325200617313385, + "step": 124520 + }, + { + "epoch": 17.676366217175303, + "grad_norm": 0.07404623925685883, + "learning_rate": 8.233129879347055e-05, + "loss": 0.02496753931045532, + "step": 124530 + }, + { + "epoch": 17.677785663591198, + "grad_norm": 2.0904412269592285, + "learning_rate": 8.232987934705465e-05, + "loss": 0.01579650193452835, + "step": 124540 + }, + { + "epoch": 17.679205110007096, + "grad_norm": 1.241287350654602, + "learning_rate": 8.232845990063876e-05, + "loss": 0.009946402907371522, + "step": 124550 + }, + { + "epoch": 17.680624556422995, + "grad_norm": 0.16404542326927185, + "learning_rate": 8.232704045422286e-05, + "loss": 0.01308448165655136, + "step": 124560 + }, + { + "epoch": 17.682044002838893, + 
"grad_norm": 0.14319021999835968, + "learning_rate": 8.232562100780697e-05, + "loss": 0.009645262360572815, + "step": 124570 + }, + { + "epoch": 17.68346344925479, + "grad_norm": 1.4152799844741821, + "learning_rate": 8.232420156139105e-05, + "loss": 0.010371728241443634, + "step": 124580 + }, + { + "epoch": 17.68488289567069, + "grad_norm": 0.107726089656353, + "learning_rate": 8.232278211497516e-05, + "loss": 0.07219036817550659, + "step": 124590 + }, + { + "epoch": 17.686302342086588, + "grad_norm": 10.603217124938965, + "learning_rate": 8.232136266855926e-05, + "loss": 0.0411306768655777, + "step": 124600 + }, + { + "epoch": 17.687721788502483, + "grad_norm": 0.11992136389017105, + "learning_rate": 8.231994322214337e-05, + "loss": 0.033462563157081605, + "step": 124610 + }, + { + "epoch": 17.68914123491838, + "grad_norm": 0.36357948184013367, + "learning_rate": 8.231852377572747e-05, + "loss": 0.02809482216835022, + "step": 124620 + }, + { + "epoch": 17.69056068133428, + "grad_norm": 0.8509595394134521, + "learning_rate": 8.231710432931157e-05, + "loss": 0.029175931215286256, + "step": 124630 + }, + { + "epoch": 17.691980127750178, + "grad_norm": 3.2803003787994385, + "learning_rate": 8.231568488289568e-05, + "loss": 0.07031551599502564, + "step": 124640 + }, + { + "epoch": 17.693399574166076, + "grad_norm": 12.018957138061523, + "learning_rate": 8.231426543647977e-05, + "loss": 0.04732522368431091, + "step": 124650 + }, + { + "epoch": 17.694819020581974, + "grad_norm": 0.03543302044272423, + "learning_rate": 8.231284599006388e-05, + "loss": 0.02590615451335907, + "step": 124660 + }, + { + "epoch": 17.696238466997873, + "grad_norm": 0.1485099345445633, + "learning_rate": 8.231142654364798e-05, + "loss": 0.037968295812606814, + "step": 124670 + }, + { + "epoch": 17.697657913413767, + "grad_norm": 0.10532703250646591, + "learning_rate": 8.231000709723208e-05, + "loss": 0.02712967395782471, + "step": 124680 + }, + { + "epoch": 17.699077359829666, + "grad_norm": 
0.5705673098564148, + "learning_rate": 8.230858765081618e-05, + "loss": 0.03225035667419433, + "step": 124690 + }, + { + "epoch": 17.700496806245564, + "grad_norm": 0.003143586916849017, + "learning_rate": 8.230716820440029e-05, + "loss": 0.004828961193561554, + "step": 124700 + }, + { + "epoch": 17.701916252661462, + "grad_norm": 0.09341616183519363, + "learning_rate": 8.230574875798439e-05, + "loss": 0.01144537478685379, + "step": 124710 + }, + { + "epoch": 17.70333569907736, + "grad_norm": 0.6089215874671936, + "learning_rate": 8.23043293115685e-05, + "loss": 0.016727571189403535, + "step": 124720 + }, + { + "epoch": 17.70475514549326, + "grad_norm": 0.12068206071853638, + "learning_rate": 8.23029098651526e-05, + "loss": 0.029977282881736754, + "step": 124730 + }, + { + "epoch": 17.706174591909157, + "grad_norm": 0.4399046003818512, + "learning_rate": 8.230149041873669e-05, + "loss": 0.004052478447556495, + "step": 124740 + }, + { + "epoch": 17.707594038325052, + "grad_norm": 7.016625881195068, + "learning_rate": 8.23000709723208e-05, + "loss": 0.01683313101530075, + "step": 124750 + }, + { + "epoch": 17.70901348474095, + "grad_norm": 0.6942397356033325, + "learning_rate": 8.22986515259049e-05, + "loss": 0.0047518197447061535, + "step": 124760 + }, + { + "epoch": 17.71043293115685, + "grad_norm": 0.5672115683555603, + "learning_rate": 8.229723207948901e-05, + "loss": 0.030978906154632568, + "step": 124770 + }, + { + "epoch": 17.711852377572747, + "grad_norm": 0.7284126877784729, + "learning_rate": 8.229581263307311e-05, + "loss": 0.008985263854265213, + "step": 124780 + }, + { + "epoch": 17.713271823988645, + "grad_norm": 0.7015783190727234, + "learning_rate": 8.22943931866572e-05, + "loss": 0.04537283778190613, + "step": 124790 + }, + { + "epoch": 17.714691270404543, + "grad_norm": 0.14790210127830505, + "learning_rate": 8.22929737402413e-05, + "loss": 0.004269610345363617, + "step": 124800 + }, + { + "epoch": 17.71611071682044, + "grad_norm": 
7.4049224853515625, + "learning_rate": 8.229155429382541e-05, + "loss": 0.04315000772476196, + "step": 124810 + }, + { + "epoch": 17.717530163236336, + "grad_norm": 0.7216931581497192, + "learning_rate": 8.229013484740951e-05, + "loss": 0.06089695692062378, + "step": 124820 + }, + { + "epoch": 17.718949609652235, + "grad_norm": 0.060295574367046356, + "learning_rate": 8.228871540099362e-05, + "loss": 0.028363001346588135, + "step": 124830 + }, + { + "epoch": 17.720369056068133, + "grad_norm": 3.4502999782562256, + "learning_rate": 8.228729595457772e-05, + "loss": 0.021394972503185273, + "step": 124840 + }, + { + "epoch": 17.72178850248403, + "grad_norm": 0.5024992823600769, + "learning_rate": 8.228587650816182e-05, + "loss": 0.009638495743274689, + "step": 124850 + }, + { + "epoch": 17.72320794889993, + "grad_norm": 7.753618240356445, + "learning_rate": 8.228445706174593e-05, + "loss": 0.046898412704467776, + "step": 124860 + }, + { + "epoch": 17.724627395315828, + "grad_norm": 0.00778708653524518, + "learning_rate": 8.228303761533002e-05, + "loss": 0.01838609725236893, + "step": 124870 + }, + { + "epoch": 17.726046841731726, + "grad_norm": 0.9254558086395264, + "learning_rate": 8.228161816891414e-05, + "loss": 0.024858033657073973, + "step": 124880 + }, + { + "epoch": 17.72746628814762, + "grad_norm": 0.058187440037727356, + "learning_rate": 8.228019872249822e-05, + "loss": 0.010150325298309327, + "step": 124890 + }, + { + "epoch": 17.72888573456352, + "grad_norm": 5.154980659484863, + "learning_rate": 8.227877927608233e-05, + "loss": 0.06288841962814332, + "step": 124900 + }, + { + "epoch": 17.730305180979418, + "grad_norm": 0.2450360506772995, + "learning_rate": 8.227735982966643e-05, + "loss": 0.025718489289283754, + "step": 124910 + }, + { + "epoch": 17.731724627395316, + "grad_norm": 4.560354709625244, + "learning_rate": 8.227594038325054e-05, + "loss": 0.006730784475803375, + "step": 124920 + }, + { + "epoch": 17.733144073811214, + "grad_norm": 
9.024094581604004, + "learning_rate": 8.227452093683464e-05, + "loss": 0.014889387786388398, + "step": 124930 + }, + { + "epoch": 17.734563520227113, + "grad_norm": 0.312635213136673, + "learning_rate": 8.227310149041873e-05, + "loss": 0.0237678125500679, + "step": 124940 + }, + { + "epoch": 17.73598296664301, + "grad_norm": 0.08515344560146332, + "learning_rate": 8.227168204400284e-05, + "loss": 0.02553623020648956, + "step": 124950 + }, + { + "epoch": 17.737402413058906, + "grad_norm": 1.9939868450164795, + "learning_rate": 8.227026259758694e-05, + "loss": 0.012267166376113891, + "step": 124960 + }, + { + "epoch": 17.738821859474804, + "grad_norm": 0.26859158277511597, + "learning_rate": 8.226884315117105e-05, + "loss": 0.007619164884090424, + "step": 124970 + }, + { + "epoch": 17.740241305890702, + "grad_norm": 0.0797005370259285, + "learning_rate": 8.226742370475515e-05, + "loss": 0.08029309511184693, + "step": 124980 + }, + { + "epoch": 17.7416607523066, + "grad_norm": 15.179434776306152, + "learning_rate": 8.226600425833925e-05, + "loss": 0.024601057171821594, + "step": 124990 + }, + { + "epoch": 17.7430801987225, + "grad_norm": 0.025385675951838493, + "learning_rate": 8.226458481192334e-05, + "loss": 0.005561524629592895, + "step": 125000 + }, + { + "epoch": 17.7430801987225, + "eval_accuracy": 0.9884275449863292, + "eval_loss": 0.043167341500520706, + "eval_runtime": 32.2048, + "eval_samples_per_second": 488.344, + "eval_steps_per_second": 15.277, + "step": 125000 + }, + { + "epoch": 17.744499645138397, + "grad_norm": 0.022722860798239708, + "learning_rate": 8.226316536550746e-05, + "loss": 0.022749267518520355, + "step": 125010 + }, + { + "epoch": 17.745919091554295, + "grad_norm": 9.28839111328125, + "learning_rate": 8.226174591909155e-05, + "loss": 0.043350374698638915, + "step": 125020 + }, + { + "epoch": 17.74733853797019, + "grad_norm": 4.699091911315918, + "learning_rate": 8.226032647267566e-05, + "loss": 0.06598352193832398, + "step": 125030 + }, + 
{ + "epoch": 17.74875798438609, + "grad_norm": 1.6834558248519897, + "learning_rate": 8.225890702625976e-05, + "loss": 0.02005355656147003, + "step": 125040 + }, + { + "epoch": 17.750177430801987, + "grad_norm": 0.021661954000592232, + "learning_rate": 8.225748757984386e-05, + "loss": 0.05298972725868225, + "step": 125050 + }, + { + "epoch": 17.751596877217885, + "grad_norm": 0.35899174213409424, + "learning_rate": 8.225606813342797e-05, + "loss": 0.006761927902698517, + "step": 125060 + }, + { + "epoch": 17.753016323633783, + "grad_norm": 0.041905470192432404, + "learning_rate": 8.225464868701207e-05, + "loss": 0.020069000124931336, + "step": 125070 + }, + { + "epoch": 17.75443577004968, + "grad_norm": 0.7074944376945496, + "learning_rate": 8.225322924059618e-05, + "loss": 0.02227655053138733, + "step": 125080 + }, + { + "epoch": 17.75585521646558, + "grad_norm": 0.05130421370267868, + "learning_rate": 8.225180979418028e-05, + "loss": 0.01859382688999176, + "step": 125090 + }, + { + "epoch": 17.757274662881475, + "grad_norm": 5.374364376068115, + "learning_rate": 8.225039034776437e-05, + "loss": 0.010960782319307328, + "step": 125100 + }, + { + "epoch": 17.758694109297373, + "grad_norm": 0.010262245312333107, + "learning_rate": 8.224897090134847e-05, + "loss": 0.00434894859790802, + "step": 125110 + }, + { + "epoch": 17.76011355571327, + "grad_norm": 1.0669256448745728, + "learning_rate": 8.224755145493258e-05, + "loss": 0.01742391139268875, + "step": 125120 + }, + { + "epoch": 17.76153300212917, + "grad_norm": 0.36459091305732727, + "learning_rate": 8.224613200851669e-05, + "loss": 0.013276790082454682, + "step": 125130 + }, + { + "epoch": 17.762952448545068, + "grad_norm": 0.09000733494758606, + "learning_rate": 8.224471256210079e-05, + "loss": 0.001743781939148903, + "step": 125140 + }, + { + "epoch": 17.764371894960966, + "grad_norm": 0.1272316426038742, + "learning_rate": 8.224329311568489e-05, + "loss": 0.01023392528295517, + "step": 125150 + }, + { + 
"epoch": 17.765791341376865, + "grad_norm": 1.799155354499817, + "learning_rate": 8.224187366926898e-05, + "loss": 0.0067228637635707855, + "step": 125160 + }, + { + "epoch": 17.76721078779276, + "grad_norm": 0.9365578889846802, + "learning_rate": 8.22404542228531e-05, + "loss": 0.008356766402721405, + "step": 125170 + }, + { + "epoch": 17.768630234208658, + "grad_norm": 0.6004683375358582, + "learning_rate": 8.223903477643719e-05, + "loss": 0.014860156178474426, + "step": 125180 + }, + { + "epoch": 17.770049680624556, + "grad_norm": 0.9573814868927002, + "learning_rate": 8.22376153300213e-05, + "loss": 0.01890397071838379, + "step": 125190 + }, + { + "epoch": 17.771469127040454, + "grad_norm": 0.2591208815574646, + "learning_rate": 8.223619588360539e-05, + "loss": 0.007004987448453903, + "step": 125200 + }, + { + "epoch": 17.772888573456353, + "grad_norm": 0.821864664554596, + "learning_rate": 8.22347764371895e-05, + "loss": 0.012261110544204711, + "step": 125210 + }, + { + "epoch": 17.77430801987225, + "grad_norm": 0.10942957550287247, + "learning_rate": 8.223335699077361e-05, + "loss": 0.05162022709846496, + "step": 125220 + }, + { + "epoch": 17.77572746628815, + "grad_norm": 0.10782337933778763, + "learning_rate": 8.22319375443577e-05, + "loss": 0.018732479214668273, + "step": 125230 + }, + { + "epoch": 17.777146912704044, + "grad_norm": 0.04795246571302414, + "learning_rate": 8.223051809794182e-05, + "loss": 0.0030033662915229797, + "step": 125240 + }, + { + "epoch": 17.778566359119942, + "grad_norm": 0.05053180456161499, + "learning_rate": 8.22290986515259e-05, + "loss": 0.04560690820217132, + "step": 125250 + }, + { + "epoch": 17.77998580553584, + "grad_norm": 0.10530559718608856, + "learning_rate": 8.222767920511001e-05, + "loss": 0.005130567401647568, + "step": 125260 + }, + { + "epoch": 17.78140525195174, + "grad_norm": 0.23054805397987366, + "learning_rate": 8.222625975869411e-05, + "loss": 0.012102346122264861, + "step": 125270 + }, + { + "epoch": 
17.782824698367637, + "grad_norm": 0.09214671701192856, + "learning_rate": 8.222484031227822e-05, + "loss": 0.03815768957138062, + "step": 125280 + }, + { + "epoch": 17.784244144783536, + "grad_norm": 0.05258322134613991, + "learning_rate": 8.222342086586232e-05, + "loss": 0.014068126678466797, + "step": 125290 + }, + { + "epoch": 17.785663591199434, + "grad_norm": 7.656113624572754, + "learning_rate": 8.222200141944642e-05, + "loss": 0.019538348913192748, + "step": 125300 + }, + { + "epoch": 17.78708303761533, + "grad_norm": 0.06774721294641495, + "learning_rate": 8.222058197303053e-05, + "loss": 0.027745184302330018, + "step": 125310 + }, + { + "epoch": 17.788502484031227, + "grad_norm": 1.0145679712295532, + "learning_rate": 8.221916252661462e-05, + "loss": 0.030012327432632446, + "step": 125320 + }, + { + "epoch": 17.789921930447125, + "grad_norm": 0.03959951922297478, + "learning_rate": 8.221774308019873e-05, + "loss": 0.01319495439529419, + "step": 125330 + }, + { + "epoch": 17.791341376863024, + "grad_norm": 0.01071132905781269, + "learning_rate": 8.221632363378283e-05, + "loss": 0.02355894446372986, + "step": 125340 + }, + { + "epoch": 17.792760823278922, + "grad_norm": 4.440755844116211, + "learning_rate": 8.221490418736693e-05, + "loss": 0.008524559438228607, + "step": 125350 + }, + { + "epoch": 17.79418026969482, + "grad_norm": 0.028077952563762665, + "learning_rate": 8.221348474095103e-05, + "loss": 0.051354610919952394, + "step": 125360 + }, + { + "epoch": 17.79559971611072, + "grad_norm": 0.19310365617275238, + "learning_rate": 8.221206529453514e-05, + "loss": 0.016239266097545623, + "step": 125370 + }, + { + "epoch": 17.797019162526613, + "grad_norm": 0.8175052404403687, + "learning_rate": 8.221064584811923e-05, + "loss": 0.004264084994792939, + "step": 125380 + }, + { + "epoch": 17.79843860894251, + "grad_norm": 6.119190692901611, + "learning_rate": 8.220922640170335e-05, + "loss": 0.04939507246017456, + "step": 125390 + }, + { + "epoch": 
17.79985805535841, + "grad_norm": 3.670011043548584, + "learning_rate": 8.220780695528744e-05, + "loss": 0.016700688004493713, + "step": 125400 + }, + { + "epoch": 17.801277501774308, + "grad_norm": 10.37806224822998, + "learning_rate": 8.220638750887154e-05, + "loss": 0.06917227506637573, + "step": 125410 + }, + { + "epoch": 17.802696948190206, + "grad_norm": 0.466407835483551, + "learning_rate": 8.220496806245565e-05, + "loss": 0.0451697438955307, + "step": 125420 + }, + { + "epoch": 17.804116394606105, + "grad_norm": 0.18278907239437103, + "learning_rate": 8.220354861603975e-05, + "loss": 0.0026770364493131638, + "step": 125430 + }, + { + "epoch": 17.805535841022003, + "grad_norm": 0.26450440287590027, + "learning_rate": 8.220212916962386e-05, + "loss": 0.02100861966609955, + "step": 125440 + }, + { + "epoch": 17.806955287437898, + "grad_norm": 0.026414619758725166, + "learning_rate": 8.220070972320796e-05, + "loss": 0.018268810212612153, + "step": 125450 + }, + { + "epoch": 17.808374733853796, + "grad_norm": 0.05897356569766998, + "learning_rate": 8.219929027679205e-05, + "loss": 0.003758923336863518, + "step": 125460 + }, + { + "epoch": 17.809794180269694, + "grad_norm": 0.001893079956062138, + "learning_rate": 8.219787083037615e-05, + "loss": 0.005195864289999008, + "step": 125470 + }, + { + "epoch": 17.811213626685593, + "grad_norm": 10.834534645080566, + "learning_rate": 8.219645138396026e-05, + "loss": 0.012825360894203186, + "step": 125480 + }, + { + "epoch": 17.81263307310149, + "grad_norm": 0.10624898970127106, + "learning_rate": 8.219503193754436e-05, + "loss": 0.0045176450163125995, + "step": 125490 + }, + { + "epoch": 17.81405251951739, + "grad_norm": 0.6353622674942017, + "learning_rate": 8.219361249112847e-05, + "loss": 0.04030350148677826, + "step": 125500 + }, + { + "epoch": 17.81405251951739, + "eval_accuracy": 0.9826413174794939, + "eval_loss": 0.06225878372788429, + "eval_runtime": 32.1976, + "eval_samples_per_second": 488.453, + 
"eval_steps_per_second": 15.281, + "step": 125500 + }, + { + "epoch": 17.815471965933288, + "grad_norm": 9.062827110290527, + "learning_rate": 8.219219304471257e-05, + "loss": 0.05064420104026794, + "step": 125510 + }, + { + "epoch": 17.816891412349182, + "grad_norm": 0.1245955228805542, + "learning_rate": 8.219077359829667e-05, + "loss": 0.04497800469398498, + "step": 125520 + }, + { + "epoch": 17.81831085876508, + "grad_norm": 0.023755589500069618, + "learning_rate": 8.218935415188078e-05, + "loss": 0.011510214954614639, + "step": 125530 + }, + { + "epoch": 17.81973030518098, + "grad_norm": 0.008768800646066666, + "learning_rate": 8.218793470546487e-05, + "loss": 0.007446672022342682, + "step": 125540 + }, + { + "epoch": 17.821149751596877, + "grad_norm": 12.563655853271484, + "learning_rate": 8.218651525904899e-05, + "loss": 0.01588403284549713, + "step": 125550 + }, + { + "epoch": 17.822569198012776, + "grad_norm": 1.1750997304916382, + "learning_rate": 8.218509581263307e-05, + "loss": 0.004291301220655441, + "step": 125560 + }, + { + "epoch": 17.823988644428674, + "grad_norm": 0.2507944405078888, + "learning_rate": 8.218367636621718e-05, + "loss": 0.01853466182947159, + "step": 125570 + }, + { + "epoch": 17.825408090844572, + "grad_norm": 1.2327880859375, + "learning_rate": 8.218225691980128e-05, + "loss": 0.05478519797325134, + "step": 125580 + }, + { + "epoch": 17.826827537260467, + "grad_norm": 8.883460998535156, + "learning_rate": 8.218083747338539e-05, + "loss": 0.0659081757068634, + "step": 125590 + }, + { + "epoch": 17.828246983676365, + "grad_norm": 2.973017930984497, + "learning_rate": 8.217941802696949e-05, + "loss": 0.02106604874134064, + "step": 125600 + }, + { + "epoch": 17.829666430092264, + "grad_norm": 0.6707797050476074, + "learning_rate": 8.217799858055358e-05, + "loss": 0.0067960724234580995, + "step": 125610 + }, + { + "epoch": 17.831085876508162, + "grad_norm": 10.60904598236084, + "learning_rate": 8.21765791341377e-05, + "loss": 
0.03403809368610382, + "step": 125620 + }, + { + "epoch": 17.83250532292406, + "grad_norm": 0.2786976993083954, + "learning_rate": 8.217515968772179e-05, + "loss": 0.022289738059043884, + "step": 125630 + }, + { + "epoch": 17.83392476933996, + "grad_norm": 5.2510600090026855, + "learning_rate": 8.21737402413059e-05, + "loss": 0.05297409296035767, + "step": 125640 + }, + { + "epoch": 17.835344215755857, + "grad_norm": 0.7457426190376282, + "learning_rate": 8.217232079489e-05, + "loss": 0.03707561790943146, + "step": 125650 + }, + { + "epoch": 17.83676366217175, + "grad_norm": 0.5260939598083496, + "learning_rate": 8.21709013484741e-05, + "loss": 0.0180939644575119, + "step": 125660 + }, + { + "epoch": 17.83818310858765, + "grad_norm": 2.1039488315582275, + "learning_rate": 8.21694819020582e-05, + "loss": 0.012895692884922028, + "step": 125670 + }, + { + "epoch": 17.839602555003548, + "grad_norm": 0.09327096492052078, + "learning_rate": 8.21680624556423e-05, + "loss": 0.02153548151254654, + "step": 125680 + }, + { + "epoch": 17.841022001419446, + "grad_norm": 0.023013439029455185, + "learning_rate": 8.21666430092264e-05, + "loss": 0.039509478211402896, + "step": 125690 + }, + { + "epoch": 17.842441447835345, + "grad_norm": 0.023209473118185997, + "learning_rate": 8.216522356281051e-05, + "loss": 0.02362530380487442, + "step": 125700 + }, + { + "epoch": 17.843860894251243, + "grad_norm": 17.44260025024414, + "learning_rate": 8.216380411639461e-05, + "loss": 0.05340635180473328, + "step": 125710 + }, + { + "epoch": 17.84528034066714, + "grad_norm": 0.31442582607269287, + "learning_rate": 8.216238466997871e-05, + "loss": 0.007146529853343964, + "step": 125720 + }, + { + "epoch": 17.846699787083036, + "grad_norm": 6.465976715087891, + "learning_rate": 8.216096522356282e-05, + "loss": 0.023551468551158906, + "step": 125730 + }, + { + "epoch": 17.848119233498934, + "grad_norm": 1.7314174175262451, + "learning_rate": 8.215954577714692e-05, + "loss": 0.021711570024490357, + 
"step": 125740 + }, + { + "epoch": 17.849538679914833, + "grad_norm": 0.35447877645492554, + "learning_rate": 8.215812633073103e-05, + "loss": 0.04752359390258789, + "step": 125750 + }, + { + "epoch": 17.85095812633073, + "grad_norm": 0.16713052988052368, + "learning_rate": 8.215670688431511e-05, + "loss": 0.0385821133852005, + "step": 125760 + }, + { + "epoch": 17.85237757274663, + "grad_norm": 2.4941494464874268, + "learning_rate": 8.215528743789922e-05, + "loss": 0.061608928442001346, + "step": 125770 + }, + { + "epoch": 17.853797019162528, + "grad_norm": 3.051335096359253, + "learning_rate": 8.215386799148332e-05, + "loss": 0.04033620953559876, + "step": 125780 + }, + { + "epoch": 17.855216465578426, + "grad_norm": 4.2297210693359375, + "learning_rate": 8.215244854506743e-05, + "loss": 0.02570372223854065, + "step": 125790 + }, + { + "epoch": 17.85663591199432, + "grad_norm": 0.06211649626493454, + "learning_rate": 8.215102909865153e-05, + "loss": 0.022726473212242127, + "step": 125800 + }, + { + "epoch": 17.85805535841022, + "grad_norm": 5.127074241638184, + "learning_rate": 8.214960965223564e-05, + "loss": 0.012352780997753143, + "step": 125810 + }, + { + "epoch": 17.859474804826117, + "grad_norm": 6.0037713050842285, + "learning_rate": 8.214819020581974e-05, + "loss": 0.027094042301177977, + "step": 125820 + }, + { + "epoch": 17.860894251242016, + "grad_norm": 0.09564322978258133, + "learning_rate": 8.214677075940383e-05, + "loss": 0.014927592873573304, + "step": 125830 + }, + { + "epoch": 17.862313697657914, + "grad_norm": 3.6967010498046875, + "learning_rate": 8.214535131298794e-05, + "loss": 0.007258567214012146, + "step": 125840 + }, + { + "epoch": 17.863733144073812, + "grad_norm": 8.384481430053711, + "learning_rate": 8.214393186657204e-05, + "loss": 0.020434218645095825, + "step": 125850 + }, + { + "epoch": 17.86515259048971, + "grad_norm": 0.2185468077659607, + "learning_rate": 8.214251242015615e-05, + "loss": 0.00585351549088955, + "step": 125860 + 
}, + { + "epoch": 17.866572036905605, + "grad_norm": 0.16802102327346802, + "learning_rate": 8.214109297374024e-05, + "loss": 0.04269077479839325, + "step": 125870 + }, + { + "epoch": 17.867991483321504, + "grad_norm": 0.6906107068061829, + "learning_rate": 8.213967352732435e-05, + "loss": 0.010298679769039153, + "step": 125880 + }, + { + "epoch": 17.869410929737402, + "grad_norm": 2.713566541671753, + "learning_rate": 8.213825408090844e-05, + "loss": 0.026750019192695616, + "step": 125890 + }, + { + "epoch": 17.8708303761533, + "grad_norm": 9.485734939575195, + "learning_rate": 8.213683463449256e-05, + "loss": 0.006816279888153076, + "step": 125900 + }, + { + "epoch": 17.8722498225692, + "grad_norm": 0.0062073878943920135, + "learning_rate": 8.213541518807665e-05, + "loss": 0.0393365740776062, + "step": 125910 + }, + { + "epoch": 17.873669268985097, + "grad_norm": 0.17222857475280762, + "learning_rate": 8.213399574166075e-05, + "loss": 0.02054024189710617, + "step": 125920 + }, + { + "epoch": 17.875088715400995, + "grad_norm": 0.05197976902127266, + "learning_rate": 8.213257629524486e-05, + "loss": 0.04310756027698517, + "step": 125930 + }, + { + "epoch": 17.87650816181689, + "grad_norm": 0.09764538705348969, + "learning_rate": 8.213115684882896e-05, + "loss": 0.054064315557479856, + "step": 125940 + }, + { + "epoch": 17.87792760823279, + "grad_norm": 0.07554052025079727, + "learning_rate": 8.212973740241307e-05, + "loss": 0.025239002704620362, + "step": 125950 + }, + { + "epoch": 17.879347054648687, + "grad_norm": 0.349587082862854, + "learning_rate": 8.212831795599717e-05, + "loss": 0.007651855796575546, + "step": 125960 + }, + { + "epoch": 17.880766501064585, + "grad_norm": 13.251055717468262, + "learning_rate": 8.212689850958126e-05, + "loss": 0.06340181827545166, + "step": 125970 + }, + { + "epoch": 17.882185947480483, + "grad_norm": 0.27692946791648865, + "learning_rate": 8.212547906316536e-05, + "loss": 0.00708906427025795, + "step": 125980 + }, + { + 
"epoch": 17.88360539389638, + "grad_norm": 0.04449354112148285, + "learning_rate": 8.212405961674947e-05, + "loss": 0.01091306209564209, + "step": 125990 + }, + { + "epoch": 17.88502484031228, + "grad_norm": 0.03929013013839722, + "learning_rate": 8.212264017033357e-05, + "loss": 0.01091454029083252, + "step": 126000 + }, + { + "epoch": 17.88502484031228, + "eval_accuracy": 0.9870922617155211, + "eval_loss": 0.05081784725189209, + "eval_runtime": 31.9763, + "eval_samples_per_second": 491.833, + "eval_steps_per_second": 15.386, + "step": 126000 + }, + { + "epoch": 17.886444286728175, + "grad_norm": 0.03523946925997734, + "learning_rate": 8.212122072391768e-05, + "loss": 0.01761469542980194, + "step": 126010 + }, + { + "epoch": 17.887863733144073, + "grad_norm": 0.3471955955028534, + "learning_rate": 8.211980127750178e-05, + "loss": 0.02797209918498993, + "step": 126020 + }, + { + "epoch": 17.88928317955997, + "grad_norm": 0.18042099475860596, + "learning_rate": 8.211838183108588e-05, + "loss": 0.024269339442253113, + "step": 126030 + }, + { + "epoch": 17.89070262597587, + "grad_norm": 3.176307201385498, + "learning_rate": 8.211696238466999e-05, + "loss": 0.018799322843551635, + "step": 126040 + }, + { + "epoch": 17.892122072391768, + "grad_norm": 0.6055776476860046, + "learning_rate": 8.211554293825408e-05, + "loss": 0.048582276701927184, + "step": 126050 + }, + { + "epoch": 17.893541518807666, + "grad_norm": 8.118757247924805, + "learning_rate": 8.21141234918382e-05, + "loss": 0.045041635632514954, + "step": 126060 + }, + { + "epoch": 17.894960965223564, + "grad_norm": 0.2706262767314911, + "learning_rate": 8.211270404542228e-05, + "loss": 0.011531689763069152, + "step": 126070 + }, + { + "epoch": 17.89638041163946, + "grad_norm": 0.02398722618818283, + "learning_rate": 8.211128459900639e-05, + "loss": 0.011123070120811462, + "step": 126080 + }, + { + "epoch": 17.897799858055357, + "grad_norm": 0.4285498261451721, + "learning_rate": 8.210986515259049e-05, + "loss": 
0.011589570343494416, + "step": 126090 + }, + { + "epoch": 17.899219304471256, + "grad_norm": 0.03318289667367935, + "learning_rate": 8.21084457061746e-05, + "loss": 0.02185286730527878, + "step": 126100 + }, + { + "epoch": 17.900638750887154, + "grad_norm": 2.2753231525421143, + "learning_rate": 8.21070262597587e-05, + "loss": 0.04003996551036835, + "step": 126110 + }, + { + "epoch": 17.902058197303052, + "grad_norm": 1.4221254587173462, + "learning_rate": 8.210560681334279e-05, + "loss": 0.04174538254737854, + "step": 126120 + }, + { + "epoch": 17.90347764371895, + "grad_norm": 0.8516020178794861, + "learning_rate": 8.21041873669269e-05, + "loss": 0.03194279670715332, + "step": 126130 + }, + { + "epoch": 17.90489709013485, + "grad_norm": 0.3246018588542938, + "learning_rate": 8.2102767920511e-05, + "loss": 0.05795959234237671, + "step": 126140 + }, + { + "epoch": 17.906316536550744, + "grad_norm": 1.9304618835449219, + "learning_rate": 8.210134847409511e-05, + "loss": 0.06401010155677796, + "step": 126150 + }, + { + "epoch": 17.907735982966642, + "grad_norm": 0.311942994594574, + "learning_rate": 8.209992902767921e-05, + "loss": 0.005649371817708015, + "step": 126160 + }, + { + "epoch": 17.90915542938254, + "grad_norm": 0.02588566765189171, + "learning_rate": 8.209850958126332e-05, + "loss": 0.0437458336353302, + "step": 126170 + }, + { + "epoch": 17.91057487579844, + "grad_norm": 0.28646552562713623, + "learning_rate": 8.20970901348474e-05, + "loss": 0.003959463909268379, + "step": 126180 + }, + { + "epoch": 17.911994322214337, + "grad_norm": 0.2520774304866791, + "learning_rate": 8.209567068843152e-05, + "loss": 0.00396527536213398, + "step": 126190 + }, + { + "epoch": 17.913413768630235, + "grad_norm": 0.05746285244822502, + "learning_rate": 8.209425124201561e-05, + "loss": 0.020936407148838043, + "step": 126200 + }, + { + "epoch": 17.914833215046134, + "grad_norm": 1.556191325187683, + "learning_rate": 8.209283179559972e-05, + "loss": 0.0039587758481502535, + 
"step": 126210 + }, + { + "epoch": 17.91625266146203, + "grad_norm": 11.09601879119873, + "learning_rate": 8.209141234918382e-05, + "loss": 0.0388120174407959, + "step": 126220 + }, + { + "epoch": 17.917672107877927, + "grad_norm": 12.754619598388672, + "learning_rate": 8.208999290276792e-05, + "loss": 0.031015089154243468, + "step": 126230 + }, + { + "epoch": 17.919091554293825, + "grad_norm": 0.07646643370389938, + "learning_rate": 8.208857345635203e-05, + "loss": 0.029643809795379637, + "step": 126240 + }, + { + "epoch": 17.920511000709723, + "grad_norm": 0.5458995699882507, + "learning_rate": 8.208715400993613e-05, + "loss": 0.01911151260137558, + "step": 126250 + }, + { + "epoch": 17.92193044712562, + "grad_norm": 0.9907100796699524, + "learning_rate": 8.208573456352024e-05, + "loss": 0.006705816090106964, + "step": 126260 + }, + { + "epoch": 17.92334989354152, + "grad_norm": 0.5216385722160339, + "learning_rate": 8.208431511710433e-05, + "loss": 0.013103863596916199, + "step": 126270 + }, + { + "epoch": 17.924769339957418, + "grad_norm": 2.8902833461761475, + "learning_rate": 8.208289567068843e-05, + "loss": 0.03189074695110321, + "step": 126280 + }, + { + "epoch": 17.926188786373313, + "grad_norm": 12.639440536499023, + "learning_rate": 8.208147622427253e-05, + "loss": 0.02160928547382355, + "step": 126290 + }, + { + "epoch": 17.92760823278921, + "grad_norm": 0.10358481109142303, + "learning_rate": 8.208005677785664e-05, + "loss": 0.015329189598560333, + "step": 126300 + }, + { + "epoch": 17.92902767920511, + "grad_norm": 1.279746413230896, + "learning_rate": 8.207863733144074e-05, + "loss": 0.030715879797935487, + "step": 126310 + }, + { + "epoch": 17.930447125621008, + "grad_norm": 0.05002136528491974, + "learning_rate": 8.207721788502485e-05, + "loss": 0.051266276836395265, + "step": 126320 + }, + { + "epoch": 17.931866572036906, + "grad_norm": 6.9989776611328125, + "learning_rate": 8.207579843860895e-05, + "loss": 0.050137877464294434, + "step": 126330 + 
}, + { + "epoch": 17.933286018452804, + "grad_norm": 0.1464066207408905, + "learning_rate": 8.207437899219304e-05, + "loss": 0.03081599771976471, + "step": 126340 + }, + { + "epoch": 17.934705464868703, + "grad_norm": 0.022758029401302338, + "learning_rate": 8.207295954577715e-05, + "loss": 0.0385101467370987, + "step": 126350 + }, + { + "epoch": 17.936124911284598, + "grad_norm": 2.994476079940796, + "learning_rate": 8.207154009936125e-05, + "loss": 0.04679540097713471, + "step": 126360 + }, + { + "epoch": 17.937544357700496, + "grad_norm": 0.03078029491007328, + "learning_rate": 8.207012065294536e-05, + "loss": 0.01109546199440956, + "step": 126370 + }, + { + "epoch": 17.938963804116394, + "grad_norm": 3.0478312969207764, + "learning_rate": 8.206870120652945e-05, + "loss": 0.019247525930404664, + "step": 126380 + }, + { + "epoch": 17.940383250532292, + "grad_norm": 0.9252429604530334, + "learning_rate": 8.206728176011356e-05, + "loss": 0.08226449489593506, + "step": 126390 + }, + { + "epoch": 17.94180269694819, + "grad_norm": 0.025419319048523903, + "learning_rate": 8.206586231369766e-05, + "loss": 0.007851970195770264, + "step": 126400 + }, + { + "epoch": 17.94322214336409, + "grad_norm": 7.943525314331055, + "learning_rate": 8.206444286728177e-05, + "loss": 0.037611573934555054, + "step": 126410 + }, + { + "epoch": 17.944641589779987, + "grad_norm": 0.060102060437202454, + "learning_rate": 8.206302342086586e-05, + "loss": 0.027545714378356935, + "step": 126420 + }, + { + "epoch": 17.946061036195882, + "grad_norm": 12.149624824523926, + "learning_rate": 8.206160397444996e-05, + "loss": 0.029793751239776612, + "step": 126430 + }, + { + "epoch": 17.94748048261178, + "grad_norm": 0.3006502687931061, + "learning_rate": 8.206018452803407e-05, + "loss": 0.012186054885387421, + "step": 126440 + }, + { + "epoch": 17.94889992902768, + "grad_norm": 0.08010541647672653, + "learning_rate": 8.205876508161817e-05, + "loss": 0.002343623712658882, + "step": 126450 + }, + { + 
"epoch": 17.950319375443577, + "grad_norm": 0.19521616399288177, + "learning_rate": 8.205734563520228e-05, + "loss": 0.011480587720870971, + "step": 126460 + }, + { + "epoch": 17.951738821859475, + "grad_norm": 0.05199567973613739, + "learning_rate": 8.205592618878638e-05, + "loss": 0.01906854808330536, + "step": 126470 + }, + { + "epoch": 17.953158268275374, + "grad_norm": 0.6597695350646973, + "learning_rate": 8.205450674237049e-05, + "loss": 0.0264710009098053, + "step": 126480 + }, + { + "epoch": 17.954577714691272, + "grad_norm": 0.0455927774310112, + "learning_rate": 8.205308729595457e-05, + "loss": 0.006426760554313659, + "step": 126490 + }, + { + "epoch": 17.955997161107167, + "grad_norm": 2.061488389968872, + "learning_rate": 8.205166784953868e-05, + "loss": 0.02843399941921234, + "step": 126500 + }, + { + "epoch": 17.955997161107167, + "eval_accuracy": 0.9857569784447129, + "eval_loss": 0.05199264734983444, + "eval_runtime": 32.8178, + "eval_samples_per_second": 479.221, + "eval_steps_per_second": 14.992, + "step": 126500 + }, + { + "epoch": 17.957416607523065, + "grad_norm": 0.0125628262758255, + "learning_rate": 8.205024840312278e-05, + "loss": 0.012261651456356049, + "step": 126510 + }, + { + "epoch": 17.958836053938963, + "grad_norm": 19.391925811767578, + "learning_rate": 8.204882895670689e-05, + "loss": 0.014076711237430572, + "step": 126520 + }, + { + "epoch": 17.96025550035486, + "grad_norm": 0.007833714596927166, + "learning_rate": 8.2047409510291e-05, + "loss": 0.039005106687545775, + "step": 126530 + }, + { + "epoch": 17.96167494677076, + "grad_norm": 14.843253135681152, + "learning_rate": 8.204599006387509e-05, + "loss": 0.03159629106521607, + "step": 126540 + }, + { + "epoch": 17.96309439318666, + "grad_norm": 0.6994040012359619, + "learning_rate": 8.20445706174592e-05, + "loss": 0.034837445616722106, + "step": 126550 + }, + { + "epoch": 17.964513839602557, + "grad_norm": 2.879476308822632, + "learning_rate": 8.20431511710433e-05, + "loss": 
0.034412276744842527, + "step": 126560 + }, + { + "epoch": 17.96593328601845, + "grad_norm": 10.790650367736816, + "learning_rate": 8.20417317246274e-05, + "loss": 0.02943170666694641, + "step": 126570 + }, + { + "epoch": 17.96735273243435, + "grad_norm": 0.146264910697937, + "learning_rate": 8.20403122782115e-05, + "loss": 0.02726702392101288, + "step": 126580 + }, + { + "epoch": 17.968772178850248, + "grad_norm": 0.09430106729269028, + "learning_rate": 8.20388928317956e-05, + "loss": 0.07335036993026733, + "step": 126590 + }, + { + "epoch": 17.970191625266146, + "grad_norm": 3.3713955879211426, + "learning_rate": 8.20374733853797e-05, + "loss": 0.013988673686981201, + "step": 126600 + }, + { + "epoch": 17.971611071682045, + "grad_norm": 0.6408278346061707, + "learning_rate": 8.203605393896381e-05, + "loss": 0.028031525015830994, + "step": 126610 + }, + { + "epoch": 17.973030518097943, + "grad_norm": 0.8432971239089966, + "learning_rate": 8.203463449254792e-05, + "loss": 0.01225607916712761, + "step": 126620 + }, + { + "epoch": 17.97444996451384, + "grad_norm": 0.672948956489563, + "learning_rate": 8.203321504613202e-05, + "loss": 0.04240490198135376, + "step": 126630 + }, + { + "epoch": 17.975869410929736, + "grad_norm": 0.4233643412590027, + "learning_rate": 8.203179559971611e-05, + "loss": 0.012204398214817048, + "step": 126640 + }, + { + "epoch": 17.977288857345634, + "grad_norm": 0.7830970883369446, + "learning_rate": 8.203037615330021e-05, + "loss": 0.004717732965946198, + "step": 126650 + }, + { + "epoch": 17.978708303761533, + "grad_norm": 1.6771160364151, + "learning_rate": 8.202895670688432e-05, + "loss": 0.02526704967021942, + "step": 126660 + }, + { + "epoch": 17.98012775017743, + "grad_norm": 1.994340181350708, + "learning_rate": 8.202753726046842e-05, + "loss": 0.019393594563007356, + "step": 126670 + }, + { + "epoch": 17.98154719659333, + "grad_norm": 0.24979780614376068, + "learning_rate": 8.202611781405253e-05, + "loss": 0.026185110211372375, + 
"step": 126680 + }, + { + "epoch": 17.982966643009227, + "grad_norm": 5.780867099761963, + "learning_rate": 8.202469836763661e-05, + "loss": 0.03918190896511078, + "step": 126690 + }, + { + "epoch": 17.984386089425126, + "grad_norm": 0.01048432756215334, + "learning_rate": 8.202327892122073e-05, + "loss": 0.015203535556793213, + "step": 126700 + }, + { + "epoch": 17.98580553584102, + "grad_norm": 2.775602102279663, + "learning_rate": 8.202185947480484e-05, + "loss": 0.0074320010840892795, + "step": 126710 + }, + { + "epoch": 17.98722498225692, + "grad_norm": 7.469266414642334, + "learning_rate": 8.202044002838893e-05, + "loss": 0.012325166165828705, + "step": 126720 + }, + { + "epoch": 17.988644428672817, + "grad_norm": 0.059806808829307556, + "learning_rate": 8.201902058197304e-05, + "loss": 0.02976747751235962, + "step": 126730 + }, + { + "epoch": 17.990063875088715, + "grad_norm": 0.03276080638170242, + "learning_rate": 8.201760113555713e-05, + "loss": 0.060982465744018555, + "step": 126740 + }, + { + "epoch": 17.991483321504614, + "grad_norm": 4.326698303222656, + "learning_rate": 8.201618168914124e-05, + "loss": 0.011280091106891632, + "step": 126750 + }, + { + "epoch": 17.992902767920512, + "grad_norm": 1.296022891998291, + "learning_rate": 8.201476224272534e-05, + "loss": 0.018512937426567077, + "step": 126760 + }, + { + "epoch": 17.99432221433641, + "grad_norm": 0.13187599182128906, + "learning_rate": 8.201348474095104e-05, + "loss": 0.036490410566329956, + "step": 126770 + }, + { + "epoch": 17.995741660752305, + "grad_norm": 0.8004917502403259, + "learning_rate": 8.201206529453513e-05, + "loss": 0.05194540023803711, + "step": 126780 + }, + { + "epoch": 17.997161107168203, + "grad_norm": 2.241412401199341, + "learning_rate": 8.201064584811924e-05, + "loss": 0.042962107062339785, + "step": 126790 + }, + { + "epoch": 17.9985805535841, + "grad_norm": 3.015472650527954, + "learning_rate": 8.200922640170334e-05, + "loss": 0.01860196590423584, + "step": 126800 + 
}, + { + "epoch": 18.0, + "grad_norm": 6.105637073516846, + "learning_rate": 8.200780695528745e-05, + "loss": 0.045799678564071654, + "step": 126810 + }, + { + "epoch": 18.0014194464159, + "grad_norm": 0.3411359488964081, + "learning_rate": 8.200638750887154e-05, + "loss": 0.022154295444488527, + "step": 126820 + }, + { + "epoch": 18.002838892831797, + "grad_norm": 3.1426784992218018, + "learning_rate": 8.200496806245565e-05, + "loss": 0.008062352985143661, + "step": 126830 + }, + { + "epoch": 18.004258339247695, + "grad_norm": 0.09681227803230286, + "learning_rate": 8.200354861603974e-05, + "loss": 0.050596457719802854, + "step": 126840 + }, + { + "epoch": 18.00567778566359, + "grad_norm": 0.666832685470581, + "learning_rate": 8.200212916962385e-05, + "loss": 0.013695698976516724, + "step": 126850 + }, + { + "epoch": 18.007097232079488, + "grad_norm": 0.31903573870658875, + "learning_rate": 8.200070972320795e-05, + "loss": 0.0022612977772951126, + "step": 126860 + }, + { + "epoch": 18.008516678495386, + "grad_norm": 0.0052805026061832905, + "learning_rate": 8.199929027679205e-05, + "loss": 0.027579629421234132, + "step": 126870 + }, + { + "epoch": 18.009936124911285, + "grad_norm": 0.006886809598654509, + "learning_rate": 8.199787083037616e-05, + "loss": 0.041148635745048526, + "step": 126880 + }, + { + "epoch": 18.011355571327183, + "grad_norm": 0.40197938680648804, + "learning_rate": 8.199645138396026e-05, + "loss": 0.023601478338241576, + "step": 126890 + }, + { + "epoch": 18.01277501774308, + "grad_norm": 0.012682395055890083, + "learning_rate": 8.199503193754437e-05, + "loss": 0.002660195529460907, + "step": 126900 + }, + { + "epoch": 18.01419446415898, + "grad_norm": 0.38907021284103394, + "learning_rate": 8.199361249112847e-05, + "loss": 0.005682775005698204, + "step": 126910 + }, + { + "epoch": 18.015613910574874, + "grad_norm": 2.3589015007019043, + "learning_rate": 8.199219304471256e-05, + "loss": 0.016606913506984712, + "step": 126920 + }, + { + 
"epoch": 18.017033356990773, + "grad_norm": 0.516564667224884, + "learning_rate": 8.199077359829666e-05, + "loss": 0.0015501592308282853, + "step": 126930 + }, + { + "epoch": 18.01845280340667, + "grad_norm": 0.02881469391286373, + "learning_rate": 8.198935415188077e-05, + "loss": 0.01067756861448288, + "step": 126940 + }, + { + "epoch": 18.01987224982257, + "grad_norm": 0.00568376574665308, + "learning_rate": 8.198793470546487e-05, + "loss": 0.041530221700668335, + "step": 126950 + }, + { + "epoch": 18.021291696238467, + "grad_norm": 0.17403747141361237, + "learning_rate": 8.198651525904898e-05, + "loss": 0.008302728086709977, + "step": 126960 + }, + { + "epoch": 18.022711142654366, + "grad_norm": 7.299103260040283, + "learning_rate": 8.198509581263308e-05, + "loss": 0.01957940012216568, + "step": 126970 + }, + { + "epoch": 18.024130589070264, + "grad_norm": 0.09948822855949402, + "learning_rate": 8.198367636621717e-05, + "loss": 0.011512156575918198, + "step": 126980 + }, + { + "epoch": 18.02555003548616, + "grad_norm": 0.20196844637393951, + "learning_rate": 8.198225691980129e-05, + "loss": 0.005794602632522583, + "step": 126990 + }, + { + "epoch": 18.026969481902057, + "grad_norm": 0.057056453078985214, + "learning_rate": 8.198083747338538e-05, + "loss": 0.009519323706626892, + "step": 127000 + }, + { + "epoch": 18.026969481902057, + "eval_accuracy": 0.9840401856679596, + "eval_loss": 0.05799579620361328, + "eval_runtime": 33.0898, + "eval_samples_per_second": 475.283, + "eval_steps_per_second": 14.869, + "step": 127000 + }, + { + "epoch": 18.028388928317955, + "grad_norm": 5.088464736938477, + "learning_rate": 8.19794180269695e-05, + "loss": 0.031551048159599304, + "step": 127010 + }, + { + "epoch": 18.029808374733854, + "grad_norm": 18.893657684326172, + "learning_rate": 8.197799858055358e-05, + "loss": 0.0429678350687027, + "step": 127020 + }, + { + "epoch": 18.031227821149752, + "grad_norm": 9.892513275146484, + "learning_rate": 8.197657913413769e-05, + 
"loss": 0.03231886029243469, + "step": 127030 + }, + { + "epoch": 18.03264726756565, + "grad_norm": 3.9869258403778076, + "learning_rate": 8.197515968772179e-05, + "loss": 0.04821803271770477, + "step": 127040 + }, + { + "epoch": 18.03406671398155, + "grad_norm": 0.049460362643003464, + "learning_rate": 8.19737402413059e-05, + "loss": 0.04642572402954102, + "step": 127050 + }, + { + "epoch": 18.035486160397443, + "grad_norm": 1.9470139741897583, + "learning_rate": 8.197232079489e-05, + "loss": 0.04240458309650421, + "step": 127060 + }, + { + "epoch": 18.03690560681334, + "grad_norm": 0.018669215962290764, + "learning_rate": 8.197090134847409e-05, + "loss": 0.011113067716360092, + "step": 127070 + }, + { + "epoch": 18.03832505322924, + "grad_norm": 2.685368299484253, + "learning_rate": 8.19694819020582e-05, + "loss": 0.010963048040866851, + "step": 127080 + }, + { + "epoch": 18.03974449964514, + "grad_norm": 0.46036213636398315, + "learning_rate": 8.19680624556423e-05, + "loss": 0.02715737819671631, + "step": 127090 + }, + { + "epoch": 18.041163946061037, + "grad_norm": 0.6678931713104248, + "learning_rate": 8.196664300922641e-05, + "loss": 0.025738224387168884, + "step": 127100 + }, + { + "epoch": 18.042583392476935, + "grad_norm": 0.012589601799845695, + "learning_rate": 8.196522356281051e-05, + "loss": 0.002760981023311615, + "step": 127110 + }, + { + "epoch": 18.044002838892833, + "grad_norm": 4.038692951202393, + "learning_rate": 8.19638041163946e-05, + "loss": 0.025073921680450438, + "step": 127120 + }, + { + "epoch": 18.045422285308728, + "grad_norm": 1.019339680671692, + "learning_rate": 8.19623846699787e-05, + "loss": 0.004967309162020683, + "step": 127130 + }, + { + "epoch": 18.046841731724626, + "grad_norm": 0.03080017864704132, + "learning_rate": 8.196096522356281e-05, + "loss": 0.012518167495727539, + "step": 127140 + }, + { + "epoch": 18.048261178140525, + "grad_norm": 24.186779022216797, + "learning_rate": 8.195954577714691e-05, + "loss": 
0.03561782538890838, + "step": 127150 + }, + { + "epoch": 18.049680624556423, + "grad_norm": 1.739776611328125, + "learning_rate": 8.195812633073102e-05, + "loss": 0.02224576622247696, + "step": 127160 + }, + { + "epoch": 18.05110007097232, + "grad_norm": 0.572066605091095, + "learning_rate": 8.195670688431512e-05, + "loss": 0.012349732220172882, + "step": 127170 + }, + { + "epoch": 18.05251951738822, + "grad_norm": 0.03597768768668175, + "learning_rate": 8.195528743789922e-05, + "loss": 0.04076186418533325, + "step": 127180 + }, + { + "epoch": 18.053938963804118, + "grad_norm": 25.167530059814453, + "learning_rate": 8.195386799148333e-05, + "loss": 0.0753936767578125, + "step": 127190 + }, + { + "epoch": 18.055358410220013, + "grad_norm": 1.2025281190872192, + "learning_rate": 8.195244854506743e-05, + "loss": 0.020939262211322786, + "step": 127200 + }, + { + "epoch": 18.05677785663591, + "grad_norm": 0.0326387844979763, + "learning_rate": 8.195102909865154e-05, + "loss": 0.05523158311843872, + "step": 127210 + }, + { + "epoch": 18.05819730305181, + "grad_norm": 0.41500118374824524, + "learning_rate": 8.194960965223563e-05, + "loss": 0.008329737931489944, + "step": 127220 + }, + { + "epoch": 18.059616749467708, + "grad_norm": 3.731995105743408, + "learning_rate": 8.194819020581973e-05, + "loss": 0.01609371155500412, + "step": 127230 + }, + { + "epoch": 18.061036195883606, + "grad_norm": 0.08213970810174942, + "learning_rate": 8.194677075940383e-05, + "loss": 0.05572362542152405, + "step": 127240 + }, + { + "epoch": 18.062455642299504, + "grad_norm": 0.02728845551609993, + "learning_rate": 8.194535131298794e-05, + "loss": 0.009065951406955718, + "step": 127250 + }, + { + "epoch": 18.063875088715402, + "grad_norm": 3.4261679649353027, + "learning_rate": 8.194393186657204e-05, + "loss": 0.021069920063018797, + "step": 127260 + }, + { + "epoch": 18.065294535131297, + "grad_norm": 0.04758109897375107, + "learning_rate": 8.194251242015615e-05, + "loss": 
0.028346240520477295, + "step": 127270 + }, + { + "epoch": 18.066713981547196, + "grad_norm": 0.24030058085918427, + "learning_rate": 8.194109297374025e-05, + "loss": 0.02941884696483612, + "step": 127280 + }, + { + "epoch": 18.068133427963094, + "grad_norm": 0.5719775557518005, + "learning_rate": 8.193967352732434e-05, + "loss": 0.011897590756416321, + "step": 127290 + }, + { + "epoch": 18.069552874378992, + "grad_norm": 0.09043441712856293, + "learning_rate": 8.193825408090845e-05, + "loss": 0.020763903856277466, + "step": 127300 + }, + { + "epoch": 18.07097232079489, + "grad_norm": 0.005356388632208109, + "learning_rate": 8.193683463449255e-05, + "loss": 0.014215975999832153, + "step": 127310 + }, + { + "epoch": 18.07239176721079, + "grad_norm": 0.012542439624667168, + "learning_rate": 8.193541518807666e-05, + "loss": 0.00543864406645298, + "step": 127320 + }, + { + "epoch": 18.073811213626687, + "grad_norm": 1.8537201881408691, + "learning_rate": 8.193399574166075e-05, + "loss": 0.004514655843377113, + "step": 127330 + }, + { + "epoch": 18.075230660042582, + "grad_norm": 1.0644135475158691, + "learning_rate": 8.193257629524486e-05, + "loss": 0.031022971868515013, + "step": 127340 + }, + { + "epoch": 18.07665010645848, + "grad_norm": 0.22495824098587036, + "learning_rate": 8.193115684882895e-05, + "loss": 0.025671708583831786, + "step": 127350 + }, + { + "epoch": 18.07806955287438, + "grad_norm": 4.438430309295654, + "learning_rate": 8.192973740241306e-05, + "loss": 0.010372009128332138, + "step": 127360 + }, + { + "epoch": 18.079488999290277, + "grad_norm": 8.326705932617188, + "learning_rate": 8.192831795599718e-05, + "loss": 0.03074938654899597, + "step": 127370 + }, + { + "epoch": 18.080908445706175, + "grad_norm": 5.019266605377197, + "learning_rate": 8.192689850958126e-05, + "loss": 0.006851650774478912, + "step": 127380 + }, + { + "epoch": 18.082327892122073, + "grad_norm": 0.056960105895996094, + "learning_rate": 8.192547906316537e-05, + "loss": 
0.027861076593399047, + "step": 127390 + }, + { + "epoch": 18.08374733853797, + "grad_norm": 0.039441630244255066, + "learning_rate": 8.192405961674947e-05, + "loss": 0.004483645036816597, + "step": 127400 + }, + { + "epoch": 18.085166784953866, + "grad_norm": 0.44658857583999634, + "learning_rate": 8.192264017033358e-05, + "loss": 0.01149245649576187, + "step": 127410 + }, + { + "epoch": 18.086586231369765, + "grad_norm": 1.7577241659164429, + "learning_rate": 8.192122072391768e-05, + "loss": 0.012128777801990509, + "step": 127420 + }, + { + "epoch": 18.088005677785663, + "grad_norm": 1.1049087047576904, + "learning_rate": 8.191980127750177e-05, + "loss": 0.024096983671188354, + "step": 127430 + }, + { + "epoch": 18.08942512420156, + "grad_norm": 3.239858388900757, + "learning_rate": 8.191838183108587e-05, + "loss": 0.059730923175811766, + "step": 127440 + }, + { + "epoch": 18.09084457061746, + "grad_norm": 0.11400025337934494, + "learning_rate": 8.191696238466998e-05, + "loss": 0.0018292196094989777, + "step": 127450 + }, + { + "epoch": 18.092264017033358, + "grad_norm": 0.0311578419059515, + "learning_rate": 8.191554293825409e-05, + "loss": 0.005093959718942642, + "step": 127460 + }, + { + "epoch": 18.093683463449256, + "grad_norm": 0.03095116652548313, + "learning_rate": 8.191412349183819e-05, + "loss": 0.0027309712022542953, + "step": 127470 + }, + { + "epoch": 18.09510290986515, + "grad_norm": 11.3790283203125, + "learning_rate": 8.191270404542229e-05, + "loss": 0.03980101048946381, + "step": 127480 + }, + { + "epoch": 18.09652235628105, + "grad_norm": 6.759974002838135, + "learning_rate": 8.191128459900639e-05, + "loss": 0.033737432956695554, + "step": 127490 + }, + { + "epoch": 18.097941802696948, + "grad_norm": 4.424436092376709, + "learning_rate": 8.19098651525905e-05, + "loss": 0.0036382827907800674, + "step": 127500 + }, + { + "epoch": 18.097941802696948, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.04337623715400696, + "eval_runtime": 
31.9534, + "eval_samples_per_second": 492.186, + "eval_steps_per_second": 15.397, + "step": 127500 + }, + { + "epoch": 18.099361249112846, + "grad_norm": 0.4387366771697998, + "learning_rate": 8.19084457061746e-05, + "loss": 0.049114978313446044, + "step": 127510 + }, + { + "epoch": 18.100780695528744, + "grad_norm": 0.765316367149353, + "learning_rate": 8.19070262597587e-05, + "loss": 0.013087628781795502, + "step": 127520 + }, + { + "epoch": 18.102200141944643, + "grad_norm": 0.1502961665391922, + "learning_rate": 8.19056068133428e-05, + "loss": 0.0034311629831790926, + "step": 127530 + }, + { + "epoch": 18.10361958836054, + "grad_norm": 1.7437764406204224, + "learning_rate": 8.19041873669269e-05, + "loss": 0.05554571747779846, + "step": 127540 + }, + { + "epoch": 18.105039034776436, + "grad_norm": 0.5341615080833435, + "learning_rate": 8.190276792051101e-05, + "loss": 0.03598266541957855, + "step": 127550 + }, + { + "epoch": 18.106458481192334, + "grad_norm": 0.07560352236032486, + "learning_rate": 8.190134847409511e-05, + "loss": 0.007796591520309449, + "step": 127560 + }, + { + "epoch": 18.107877927608232, + "grad_norm": 4.293848037719727, + "learning_rate": 8.189992902767922e-05, + "loss": 0.006240556016564369, + "step": 127570 + }, + { + "epoch": 18.10929737402413, + "grad_norm": 0.7019795775413513, + "learning_rate": 8.189850958126332e-05, + "loss": 0.012034883350133896, + "step": 127580 + }, + { + "epoch": 18.11071682044003, + "grad_norm": 0.013099047355353832, + "learning_rate": 8.189709013484741e-05, + "loss": 0.021384745836257935, + "step": 127590 + }, + { + "epoch": 18.112136266855927, + "grad_norm": 8.4081392288208, + "learning_rate": 8.189567068843151e-05, + "loss": 0.008062791824340821, + "step": 127600 + }, + { + "epoch": 18.113555713271825, + "grad_norm": 0.26725485920906067, + "learning_rate": 8.189425124201562e-05, + "loss": 0.006530357897281647, + "step": 127610 + }, + { + "epoch": 18.11497515968772, + "grad_norm": 0.20434775948524475, + 
"learning_rate": 8.189283179559972e-05, + "loss": 0.011984960734844207, + "step": 127620 + }, + { + "epoch": 18.11639460610362, + "grad_norm": 6.735098838806152, + "learning_rate": 8.189141234918383e-05, + "loss": 0.009172295033931733, + "step": 127630 + }, + { + "epoch": 18.117814052519517, + "grad_norm": 0.07924386858940125, + "learning_rate": 8.188999290276791e-05, + "loss": 0.0017124176025390624, + "step": 127640 + }, + { + "epoch": 18.119233498935415, + "grad_norm": 0.20373296737670898, + "learning_rate": 8.188857345635202e-05, + "loss": 0.004818763583898544, + "step": 127650 + }, + { + "epoch": 18.120652945351313, + "grad_norm": 0.3803417384624481, + "learning_rate": 8.188715400993614e-05, + "loss": 0.0035828322172164915, + "step": 127660 + }, + { + "epoch": 18.12207239176721, + "grad_norm": 14.244536399841309, + "learning_rate": 8.188573456352023e-05, + "loss": 0.02630659341812134, + "step": 127670 + }, + { + "epoch": 18.12349183818311, + "grad_norm": 9.004671096801758, + "learning_rate": 8.188431511710434e-05, + "loss": 0.010734491050243378, + "step": 127680 + }, + { + "epoch": 18.124911284599005, + "grad_norm": 8.941965103149414, + "learning_rate": 8.188289567068843e-05, + "loss": 0.0061449378728866575, + "step": 127690 + }, + { + "epoch": 18.126330731014903, + "grad_norm": 0.6067605018615723, + "learning_rate": 8.188147622427254e-05, + "loss": 0.003517572209239006, + "step": 127700 + }, + { + "epoch": 18.1277501774308, + "grad_norm": 0.03921455517411232, + "learning_rate": 8.188005677785664e-05, + "loss": 0.00920310840010643, + "step": 127710 + }, + { + "epoch": 18.1291696238467, + "grad_norm": 0.08954808861017227, + "learning_rate": 8.187863733144075e-05, + "loss": 0.020823955535888672, + "step": 127720 + }, + { + "epoch": 18.130589070262598, + "grad_norm": 0.1396220326423645, + "learning_rate": 8.187721788502484e-05, + "loss": 0.023295214772224425, + "step": 127730 + }, + { + "epoch": 18.132008516678496, + "grad_norm": 0.06990166008472443, + 
"learning_rate": 8.187579843860894e-05, + "loss": 0.00844774693250656, + "step": 127740 + }, + { + "epoch": 18.133427963094395, + "grad_norm": 0.69849693775177, + "learning_rate": 8.187437899219305e-05, + "loss": 0.030935484170913696, + "step": 127750 + }, + { + "epoch": 18.13484740951029, + "grad_norm": 0.022997183725237846, + "learning_rate": 8.187295954577715e-05, + "loss": 0.010376222431659698, + "step": 127760 + }, + { + "epoch": 18.136266855926188, + "grad_norm": 0.551956295967102, + "learning_rate": 8.187154009936126e-05, + "loss": 0.0124617338180542, + "step": 127770 + }, + { + "epoch": 18.137686302342086, + "grad_norm": 0.15997475385665894, + "learning_rate": 8.187012065294536e-05, + "loss": 0.00329936146736145, + "step": 127780 + }, + { + "epoch": 18.139105748757984, + "grad_norm": 2.299191951751709, + "learning_rate": 8.186870120652946e-05, + "loss": 0.004490911960601807, + "step": 127790 + }, + { + "epoch": 18.140525195173883, + "grad_norm": 0.010329111479222775, + "learning_rate": 8.186728176011355e-05, + "loss": 0.03573310077190399, + "step": 127800 + }, + { + "epoch": 18.14194464158978, + "grad_norm": 0.03730699419975281, + "learning_rate": 8.186586231369766e-05, + "loss": 0.008077595382928848, + "step": 127810 + }, + { + "epoch": 18.14336408800568, + "grad_norm": 0.1495765596628189, + "learning_rate": 8.186444286728176e-05, + "loss": 0.026599448919296265, + "step": 127820 + }, + { + "epoch": 18.144783534421574, + "grad_norm": 0.8765677809715271, + "learning_rate": 8.186302342086587e-05, + "loss": 0.006034733355045318, + "step": 127830 + }, + { + "epoch": 18.146202980837472, + "grad_norm": 0.0686149150133133, + "learning_rate": 8.186160397444997e-05, + "loss": 0.0200776606798172, + "step": 127840 + }, + { + "epoch": 18.14762242725337, + "grad_norm": 0.061525650322437286, + "learning_rate": 8.186018452803407e-05, + "loss": 0.06339784860610961, + "step": 127850 + }, + { + "epoch": 18.14904187366927, + "grad_norm": 0.015170774422585964, + 
"learning_rate": 8.185876508161818e-05, + "loss": 0.048648470640182497, + "step": 127860 + }, + { + "epoch": 18.150461320085167, + "grad_norm": 2.982309579849243, + "learning_rate": 8.185734563520228e-05, + "loss": 0.020402514934539796, + "step": 127870 + }, + { + "epoch": 18.151880766501066, + "grad_norm": 11.487321853637695, + "learning_rate": 8.185592618878639e-05, + "loss": 0.011919765919446944, + "step": 127880 + }, + { + "epoch": 18.153300212916964, + "grad_norm": 0.02453085035085678, + "learning_rate": 8.185450674237048e-05, + "loss": 0.0481383353471756, + "step": 127890 + }, + { + "epoch": 18.15471965933286, + "grad_norm": 4.876072883605957, + "learning_rate": 8.185308729595458e-05, + "loss": 0.006667395681142807, + "step": 127900 + }, + { + "epoch": 18.156139105748757, + "grad_norm": 1.0623502731323242, + "learning_rate": 8.185166784953868e-05, + "loss": 0.0022208701819181444, + "step": 127910 + }, + { + "epoch": 18.157558552164655, + "grad_norm": 0.1288275569677353, + "learning_rate": 8.185024840312279e-05, + "loss": 0.009044210612773895, + "step": 127920 + }, + { + "epoch": 18.158977998580554, + "grad_norm": 9.580000877380371, + "learning_rate": 8.184882895670689e-05, + "loss": 0.0466098964214325, + "step": 127930 + }, + { + "epoch": 18.160397444996452, + "grad_norm": 2.10084867477417, + "learning_rate": 8.1847409510291e-05, + "loss": 0.006572528183460236, + "step": 127940 + }, + { + "epoch": 18.16181689141235, + "grad_norm": 0.019893677905201912, + "learning_rate": 8.18459900638751e-05, + "loss": 0.03291449844837189, + "step": 127950 + }, + { + "epoch": 18.16323633782825, + "grad_norm": 6.739614486694336, + "learning_rate": 8.184457061745919e-05, + "loss": 0.02328267991542816, + "step": 127960 + }, + { + "epoch": 18.164655784244143, + "grad_norm": 9.85806941986084, + "learning_rate": 8.18431511710433e-05, + "loss": 0.029339200258255003, + "step": 127970 + }, + { + "epoch": 18.16607523066004, + "grad_norm": 1.8335438966751099, + "learning_rate": 
8.18417317246274e-05, + "loss": 0.009429128468036651, + "step": 127980 + }, + { + "epoch": 18.16749467707594, + "grad_norm": 0.15850071609020233, + "learning_rate": 8.184031227821151e-05, + "loss": 0.0174192413687706, + "step": 127990 + }, + { + "epoch": 18.168914123491838, + "grad_norm": 10.395427703857422, + "learning_rate": 8.18388928317956e-05, + "loss": 0.08283510208129882, + "step": 128000 + }, + { + "epoch": 18.168914123491838, + "eval_accuracy": 0.976791505055001, + "eval_loss": 0.09529868513345718, + "eval_runtime": 32.2522, + "eval_samples_per_second": 487.625, + "eval_steps_per_second": 15.255, + "step": 128000 + }, + { + "epoch": 18.170333569907736, + "grad_norm": 0.5031605958938599, + "learning_rate": 8.18374733853797e-05, + "loss": 0.04021282494068146, + "step": 128010 + }, + { + "epoch": 18.171753016323635, + "grad_norm": 0.05247000977396965, + "learning_rate": 8.18360539389638e-05, + "loss": 0.029411664605140685, + "step": 128020 + }, + { + "epoch": 18.173172462739533, + "grad_norm": 0.3838053047657013, + "learning_rate": 8.183463449254791e-05, + "loss": 0.004196440801024437, + "step": 128030 + }, + { + "epoch": 18.174591909155428, + "grad_norm": 1.891951084136963, + "learning_rate": 8.183321504613201e-05, + "loss": 0.009262125194072723, + "step": 128040 + }, + { + "epoch": 18.176011355571326, + "grad_norm": 13.83259105682373, + "learning_rate": 8.183179559971611e-05, + "loss": 0.043445545434951785, + "step": 128050 + }, + { + "epoch": 18.177430801987224, + "grad_norm": 4.659943580627441, + "learning_rate": 8.183037615330022e-05, + "loss": 0.032861173152923584, + "step": 128060 + }, + { + "epoch": 18.178850248403123, + "grad_norm": 0.14418882131576538, + "learning_rate": 8.182895670688432e-05, + "loss": 0.005234483629465103, + "step": 128070 + }, + { + "epoch": 18.18026969481902, + "grad_norm": 0.6910912394523621, + "learning_rate": 8.182753726046843e-05, + "loss": 0.05057108402252197, + "step": 128080 + }, + { + "epoch": 18.18168914123492, + 
"grad_norm": 10.517608642578125, + "learning_rate": 8.182611781405253e-05, + "loss": 0.010358494520187379, + "step": 128090 + }, + { + "epoch": 18.183108587650818, + "grad_norm": 0.281340628862381, + "learning_rate": 8.182469836763662e-05, + "loss": 0.03498583734035492, + "step": 128100 + }, + { + "epoch": 18.184528034066712, + "grad_norm": 0.248043954372406, + "learning_rate": 8.182327892122072e-05, + "loss": 0.05613068342208862, + "step": 128110 + }, + { + "epoch": 18.18594748048261, + "grad_norm": 2.2355258464813232, + "learning_rate": 8.182185947480483e-05, + "loss": 0.041114142537117, + "step": 128120 + }, + { + "epoch": 18.18736692689851, + "grad_norm": 4.95033073425293, + "learning_rate": 8.182044002838893e-05, + "loss": 0.016805705428123475, + "step": 128130 + }, + { + "epoch": 18.188786373314407, + "grad_norm": 0.05342892184853554, + "learning_rate": 8.181902058197304e-05, + "loss": 0.03471288681030273, + "step": 128140 + }, + { + "epoch": 18.190205819730306, + "grad_norm": 10.261602401733398, + "learning_rate": 8.181760113555714e-05, + "loss": 0.047914010286331174, + "step": 128150 + }, + { + "epoch": 18.191625266146204, + "grad_norm": 0.45671752095222473, + "learning_rate": 8.181618168914123e-05, + "loss": 0.0416656494140625, + "step": 128160 + }, + { + "epoch": 18.193044712562102, + "grad_norm": 0.16699133813381195, + "learning_rate": 8.181476224272535e-05, + "loss": 0.037295478582382205, + "step": 128170 + }, + { + "epoch": 18.194464158977997, + "grad_norm": 0.020635658875107765, + "learning_rate": 8.181334279630944e-05, + "loss": 0.0075481578707695006, + "step": 128180 + }, + { + "epoch": 18.195883605393895, + "grad_norm": 1.0739357471466064, + "learning_rate": 8.181192334989355e-05, + "loss": 0.014541852474212646, + "step": 128190 + }, + { + "epoch": 18.197303051809794, + "grad_norm": 0.07027911394834518, + "learning_rate": 8.181050390347764e-05, + "loss": 0.0018665395677089692, + "step": 128200 + }, + { + "epoch": 18.198722498225692, + "grad_norm": 
0.017957065254449844, + "learning_rate": 8.180908445706175e-05, + "loss": 0.008435264229774475, + "step": 128210 + }, + { + "epoch": 18.20014194464159, + "grad_norm": 0.46577832102775574, + "learning_rate": 8.180766501064585e-05, + "loss": 0.06814204454421997, + "step": 128220 + }, + { + "epoch": 18.20156139105749, + "grad_norm": 14.87221908569336, + "learning_rate": 8.180624556422996e-05, + "loss": 0.03074239194393158, + "step": 128230 + }, + { + "epoch": 18.202980837473387, + "grad_norm": 9.295565605163574, + "learning_rate": 8.180482611781405e-05, + "loss": 0.04867202043533325, + "step": 128240 + }, + { + "epoch": 18.20440028388928, + "grad_norm": 4.98245096206665, + "learning_rate": 8.180340667139817e-05, + "loss": 0.025966173410415648, + "step": 128250 + }, + { + "epoch": 18.20581973030518, + "grad_norm": 12.073112487792969, + "learning_rate": 8.180198722498226e-05, + "loss": 0.028777456283569335, + "step": 128260 + }, + { + "epoch": 18.207239176721078, + "grad_norm": 0.19803652167320251, + "learning_rate": 8.180056777856636e-05, + "loss": 0.0357023686170578, + "step": 128270 + }, + { + "epoch": 18.208658623136976, + "grad_norm": 0.04275793582201004, + "learning_rate": 8.179914833215047e-05, + "loss": 0.010345979034900666, + "step": 128280 + }, + { + "epoch": 18.210078069552875, + "grad_norm": 3.300391435623169, + "learning_rate": 8.179772888573457e-05, + "loss": 0.00928306058049202, + "step": 128290 + }, + { + "epoch": 18.211497515968773, + "grad_norm": 0.040431976318359375, + "learning_rate": 8.179630943931868e-05, + "loss": 0.001940181478857994, + "step": 128300 + }, + { + "epoch": 18.21291696238467, + "grad_norm": 2.5737407207489014, + "learning_rate": 8.179488999290276e-05, + "loss": 0.005912529304623604, + "step": 128310 + }, + { + "epoch": 18.214336408800566, + "grad_norm": 0.1068541407585144, + "learning_rate": 8.179347054648687e-05, + "loss": 0.013379514217376709, + "step": 128320 + }, + { + "epoch": 18.215755855216464, + "grad_norm": 
1.6886368989944458, + "learning_rate": 8.179205110007097e-05, + "loss": 0.006964053213596344, + "step": 128330 + }, + { + "epoch": 18.217175301632363, + "grad_norm": 0.038944900035858154, + "learning_rate": 8.179063165365508e-05, + "loss": 0.022647054493427278, + "step": 128340 + }, + { + "epoch": 18.21859474804826, + "grad_norm": 0.4597911834716797, + "learning_rate": 8.178921220723918e-05, + "loss": 0.02247808873653412, + "step": 128350 + }, + { + "epoch": 18.22001419446416, + "grad_norm": 9.54317569732666, + "learning_rate": 8.178779276082328e-05, + "loss": 0.011771997064352035, + "step": 128360 + }, + { + "epoch": 18.221433640880058, + "grad_norm": 0.015923280268907547, + "learning_rate": 8.178637331440739e-05, + "loss": 0.005302245542407036, + "step": 128370 + }, + { + "epoch": 18.222853087295956, + "grad_norm": 4.213832855224609, + "learning_rate": 8.178495386799149e-05, + "loss": 0.007035575807094574, + "step": 128380 + }, + { + "epoch": 18.22427253371185, + "grad_norm": 0.07770871371030807, + "learning_rate": 8.17835344215756e-05, + "loss": 0.031133627891540526, + "step": 128390 + }, + { + "epoch": 18.22569198012775, + "grad_norm": 8.806855201721191, + "learning_rate": 8.17821149751597e-05, + "loss": 0.01694534718990326, + "step": 128400 + }, + { + "epoch": 18.227111426543647, + "grad_norm": 6.428601264953613, + "learning_rate": 8.178069552874379e-05, + "loss": 0.016386696696281434, + "step": 128410 + }, + { + "epoch": 18.228530872959546, + "grad_norm": 0.42039939761161804, + "learning_rate": 8.177927608232789e-05, + "loss": 0.018320520222187043, + "step": 128420 + }, + { + "epoch": 18.229950319375444, + "grad_norm": 0.013496828265488148, + "learning_rate": 8.1777856635912e-05, + "loss": 0.005241439118981362, + "step": 128430 + }, + { + "epoch": 18.231369765791342, + "grad_norm": 0.2199200540781021, + "learning_rate": 8.17764371894961e-05, + "loss": 0.006305563449859619, + "step": 128440 + }, + { + "epoch": 18.23278921220724, + "grad_norm": 
0.1683681160211563, + "learning_rate": 8.177501774308021e-05, + "loss": 0.050753450393676756, + "step": 128450 + }, + { + "epoch": 18.234208658623135, + "grad_norm": 0.1312318742275238, + "learning_rate": 8.17735982966643e-05, + "loss": 0.030958676338195802, + "step": 128460 + }, + { + "epoch": 18.235628105039034, + "grad_norm": 0.24649330973625183, + "learning_rate": 8.17721788502484e-05, + "loss": 0.02686150074005127, + "step": 128470 + }, + { + "epoch": 18.237047551454932, + "grad_norm": 0.52034592628479, + "learning_rate": 8.177075940383251e-05, + "loss": 0.020387536287307738, + "step": 128480 + }, + { + "epoch": 18.23846699787083, + "grad_norm": 4.3982648849487305, + "learning_rate": 8.176933995741661e-05, + "loss": 0.017361976206302643, + "step": 128490 + }, + { + "epoch": 18.23988644428673, + "grad_norm": 0.3818369507789612, + "learning_rate": 8.176792051100072e-05, + "loss": 0.03366567492485047, + "step": 128500 + }, + { + "epoch": 18.23988644428673, + "eval_accuracy": 0.9851847141857951, + "eval_loss": 0.053453847765922546, + "eval_runtime": 32.9058, + "eval_samples_per_second": 477.94, + "eval_steps_per_second": 14.952, + "step": 128500 + }, + { + "epoch": 18.241305890702627, + "grad_norm": 12.275089263916016, + "learning_rate": 8.17665010645848e-05, + "loss": 0.07093088626861573, + "step": 128510 + }, + { + "epoch": 18.242725337118525, + "grad_norm": 0.02158868871629238, + "learning_rate": 8.176508161816892e-05, + "loss": 0.0045996904373168945, + "step": 128520 + }, + { + "epoch": 18.24414478353442, + "grad_norm": 0.19157670438289642, + "learning_rate": 8.176366217175301e-05, + "loss": 0.014573585987091065, + "step": 128530 + }, + { + "epoch": 18.24556422995032, + "grad_norm": 4.176166534423828, + "learning_rate": 8.176224272533712e-05, + "loss": 0.022593997418880463, + "step": 128540 + }, + { + "epoch": 18.246983676366217, + "grad_norm": 0.0027891797944903374, + "learning_rate": 8.176082327892122e-05, + "loss": 0.0024897228926420213, + "step": 128550 + 
}, + { + "epoch": 18.248403122782115, + "grad_norm": 0.1233685314655304, + "learning_rate": 8.175940383250532e-05, + "loss": 0.025276607275009154, + "step": 128560 + }, + { + "epoch": 18.249822569198013, + "grad_norm": 2.294764995574951, + "learning_rate": 8.175798438608943e-05, + "loss": 0.018644881248474122, + "step": 128570 + }, + { + "epoch": 18.25124201561391, + "grad_norm": 14.123501777648926, + "learning_rate": 8.175656493967353e-05, + "loss": 0.026422369480133056, + "step": 128580 + }, + { + "epoch": 18.25266146202981, + "grad_norm": 0.04175082966685295, + "learning_rate": 8.175514549325764e-05, + "loss": 0.021905991435050964, + "step": 128590 + }, + { + "epoch": 18.254080908445705, + "grad_norm": 0.9239396452903748, + "learning_rate": 8.175372604684174e-05, + "loss": 0.013399584591388703, + "step": 128600 + }, + { + "epoch": 18.255500354861603, + "grad_norm": 2.069931745529175, + "learning_rate": 8.175230660042585e-05, + "loss": 0.05277642607688904, + "step": 128610 + }, + { + "epoch": 18.2569198012775, + "grad_norm": 1.028808355331421, + "learning_rate": 8.175088715400993e-05, + "loss": 0.015741121768951417, + "step": 128620 + }, + { + "epoch": 18.2583392476934, + "grad_norm": 1.025839924812317, + "learning_rate": 8.174946770759404e-05, + "loss": 0.009937211871147156, + "step": 128630 + }, + { + "epoch": 18.259758694109298, + "grad_norm": 0.1733449101448059, + "learning_rate": 8.174804826117814e-05, + "loss": 0.0087957963347435, + "step": 128640 + }, + { + "epoch": 18.261178140525196, + "grad_norm": 5.729976654052734, + "learning_rate": 8.174662881476225e-05, + "loss": 0.052794671058654784, + "step": 128650 + }, + { + "epoch": 18.262597586941094, + "grad_norm": 1.5335612297058105, + "learning_rate": 8.174520936834635e-05, + "loss": 0.04668539762496948, + "step": 128660 + }, + { + "epoch": 18.26401703335699, + "grad_norm": 0.021340427920222282, + "learning_rate": 8.174378992193044e-05, + "loss": 0.029316258430480958, + "step": 128670 + }, + { + "epoch": 
18.265436479772887, + "grad_norm": 1.2407418489456177, + "learning_rate": 8.174237047551456e-05, + "loss": 0.0165558785200119, + "step": 128680 + }, + { + "epoch": 18.266855926188786, + "grad_norm": 0.15023832023143768, + "learning_rate": 8.174095102909865e-05, + "loss": 0.004716331884264946, + "step": 128690 + }, + { + "epoch": 18.268275372604684, + "grad_norm": 3.0022218227386475, + "learning_rate": 8.173953158268276e-05, + "loss": 0.031516680121421815, + "step": 128700 + }, + { + "epoch": 18.269694819020582, + "grad_norm": 0.06068778783082962, + "learning_rate": 8.173811213626686e-05, + "loss": 0.011225759983062744, + "step": 128710 + }, + { + "epoch": 18.27111426543648, + "grad_norm": 14.479620933532715, + "learning_rate": 8.173669268985096e-05, + "loss": 0.0523698627948761, + "step": 128720 + }, + { + "epoch": 18.27253371185238, + "grad_norm": 0.26089462637901306, + "learning_rate": 8.173527324343506e-05, + "loss": 0.03181962370872497, + "step": 128730 + }, + { + "epoch": 18.273953158268274, + "grad_norm": 0.012346764095127583, + "learning_rate": 8.173385379701917e-05, + "loss": 0.012769991159439087, + "step": 128740 + }, + { + "epoch": 18.275372604684172, + "grad_norm": 0.034206002950668335, + "learning_rate": 8.173243435060326e-05, + "loss": 0.004806910455226898, + "step": 128750 + }, + { + "epoch": 18.27679205110007, + "grad_norm": 0.015452612191438675, + "learning_rate": 8.173101490418738e-05, + "loss": 0.0019456423819065095, + "step": 128760 + }, + { + "epoch": 18.27821149751597, + "grad_norm": 0.08761896938085556, + "learning_rate": 8.172959545777147e-05, + "loss": 0.005956121534109115, + "step": 128770 + }, + { + "epoch": 18.279630943931867, + "grad_norm": 0.0656166523694992, + "learning_rate": 8.172817601135557e-05, + "loss": 0.013135121762752533, + "step": 128780 + }, + { + "epoch": 18.281050390347765, + "grad_norm": 0.09261200577020645, + "learning_rate": 8.172675656493968e-05, + "loss": 0.01091306060552597, + "step": 128790 + }, + { + "epoch": 
18.282469836763664, + "grad_norm": 3.3840973377227783, + "learning_rate": 8.172533711852378e-05, + "loss": 0.03014102280139923, + "step": 128800 + }, + { + "epoch": 18.28388928317956, + "grad_norm": 3.299243450164795, + "learning_rate": 8.172391767210789e-05, + "loss": 0.053286343812942505, + "step": 128810 + }, + { + "epoch": 18.285308729595457, + "grad_norm": 5.125652313232422, + "learning_rate": 8.172249822569197e-05, + "loss": 0.036524662375450136, + "step": 128820 + }, + { + "epoch": 18.286728176011355, + "grad_norm": 0.01079608965665102, + "learning_rate": 8.172107877927608e-05, + "loss": 0.012189614027738572, + "step": 128830 + }, + { + "epoch": 18.288147622427253, + "grad_norm": 5.404969215393066, + "learning_rate": 8.171965933286018e-05, + "loss": 0.06643599271774292, + "step": 128840 + }, + { + "epoch": 18.28956706884315, + "grad_norm": 0.21422381699085236, + "learning_rate": 8.171823988644429e-05, + "loss": 0.017375747859477996, + "step": 128850 + }, + { + "epoch": 18.29098651525905, + "grad_norm": 0.20465363562107086, + "learning_rate": 8.17168204400284e-05, + "loss": 0.031096231937408448, + "step": 128860 + }, + { + "epoch": 18.292405961674948, + "grad_norm": 1.680165410041809, + "learning_rate": 8.171540099361249e-05, + "loss": 0.024290363490581512, + "step": 128870 + }, + { + "epoch": 18.293825408090843, + "grad_norm": 0.2074621170759201, + "learning_rate": 8.17139815471966e-05, + "loss": 0.018671299517154693, + "step": 128880 + }, + { + "epoch": 18.29524485450674, + "grad_norm": 0.21123144030570984, + "learning_rate": 8.17125621007807e-05, + "loss": 0.060861396789550784, + "step": 128890 + }, + { + "epoch": 18.29666430092264, + "grad_norm": 11.028310775756836, + "learning_rate": 8.17111426543648e-05, + "loss": 0.07039762139320374, + "step": 128900 + }, + { + "epoch": 18.298083747338538, + "grad_norm": 0.025449946522712708, + "learning_rate": 8.17097232079489e-05, + "loss": 0.010196681320667266, + "step": 128910 + }, + { + "epoch": 
18.299503193754436, + "grad_norm": 0.018262367695569992, + "learning_rate": 8.170830376153301e-05, + "loss": 0.018941739201545717, + "step": 128920 + }, + { + "epoch": 18.300922640170334, + "grad_norm": 3.6312954425811768, + "learning_rate": 8.17068843151171e-05, + "loss": 0.04633817672729492, + "step": 128930 + }, + { + "epoch": 18.302342086586233, + "grad_norm": 3.760207176208496, + "learning_rate": 8.170546486870121e-05, + "loss": 0.028416919708251952, + "step": 128940 + }, + { + "epoch": 18.303761533002127, + "grad_norm": 2.070991277694702, + "learning_rate": 8.170404542228532e-05, + "loss": 0.004716591536998748, + "step": 128950 + }, + { + "epoch": 18.305180979418026, + "grad_norm": 0.1530076414346695, + "learning_rate": 8.170262597586942e-05, + "loss": 0.012267137318849564, + "step": 128960 + }, + { + "epoch": 18.306600425833924, + "grad_norm": 0.007661824580281973, + "learning_rate": 8.170120652945353e-05, + "loss": 0.005905486643314362, + "step": 128970 + }, + { + "epoch": 18.308019872249822, + "grad_norm": 4.314486026763916, + "learning_rate": 8.169978708303761e-05, + "loss": 0.031454536318778994, + "step": 128980 + }, + { + "epoch": 18.30943931866572, + "grad_norm": 0.3191210627555847, + "learning_rate": 8.169836763662172e-05, + "loss": 0.01454719752073288, + "step": 128990 + }, + { + "epoch": 18.31085876508162, + "grad_norm": 0.23693464696407318, + "learning_rate": 8.169694819020582e-05, + "loss": 0.02261695861816406, + "step": 129000 + }, + { + "epoch": 18.31085876508162, + "eval_accuracy": 0.9855662236917403, + "eval_loss": 0.05503528565168381, + "eval_runtime": 32.4819, + "eval_samples_per_second": 484.177, + "eval_steps_per_second": 15.147, + "step": 129000 + }, + { + "epoch": 18.312278211497517, + "grad_norm": 0.4022933542728424, + "learning_rate": 8.169552874378993e-05, + "loss": 0.05248420238494873, + "step": 129010 + }, + { + "epoch": 18.313697657913412, + "grad_norm": 13.0270414352417, + "learning_rate": 8.169410929737403e-05, + "loss": 
0.04522181451320648, + "step": 129020 + }, + { + "epoch": 18.31511710432931, + "grad_norm": 0.7602053284645081, + "learning_rate": 8.169268985095813e-05, + "loss": 0.023917488753795624, + "step": 129030 + }, + { + "epoch": 18.31653655074521, + "grad_norm": 4.949351787567139, + "learning_rate": 8.169127040454224e-05, + "loss": 0.021494348347187043, + "step": 129040 + }, + { + "epoch": 18.317955997161107, + "grad_norm": 0.1981838345527649, + "learning_rate": 8.168985095812633e-05, + "loss": 0.01364341527223587, + "step": 129050 + }, + { + "epoch": 18.319375443577005, + "grad_norm": 0.21024064719676971, + "learning_rate": 8.168843151171045e-05, + "loss": 0.00963202938437462, + "step": 129060 + }, + { + "epoch": 18.320794889992904, + "grad_norm": 0.08986670523881912, + "learning_rate": 8.168701206529454e-05, + "loss": 0.004476574808359146, + "step": 129070 + }, + { + "epoch": 18.322214336408802, + "grad_norm": null, + "learning_rate": 8.168559261887864e-05, + "loss": 0.029144853353500366, + "step": 129080 + }, + { + "epoch": 18.323633782824697, + "grad_norm": 1.82876718044281, + "learning_rate": 8.168431511710434e-05, + "loss": 0.026475942134857176, + "step": 129090 + }, + { + "epoch": 18.325053229240595, + "grad_norm": 10.667966842651367, + "learning_rate": 8.168289567068844e-05, + "loss": 0.021684975922107698, + "step": 129100 + }, + { + "epoch": 18.326472675656493, + "grad_norm": 0.19591830670833588, + "learning_rate": 8.168147622427253e-05, + "loss": 0.005786832049489021, + "step": 129110 + }, + { + "epoch": 18.32789212207239, + "grad_norm": 0.5080521106719971, + "learning_rate": 8.168005677785664e-05, + "loss": 0.02582077980041504, + "step": 129120 + }, + { + "epoch": 18.32931156848829, + "grad_norm": 3.638640880584717, + "learning_rate": 8.167863733144074e-05, + "loss": 0.050702786445617674, + "step": 129130 + }, + { + "epoch": 18.330731014904188, + "grad_norm": 0.13650347292423248, + "learning_rate": 8.167721788502485e-05, + "loss": 0.0599769115447998, +
"step": 129140 + }, + { + "epoch": 18.332150461320087, + "grad_norm": 0.37721243500709534, + "learning_rate": 8.167579843860894e-05, + "loss": 0.04912948906421662, + "step": 129150 + }, + { + "epoch": 18.33356990773598, + "grad_norm": 6.161162853240967, + "learning_rate": 8.167437899219305e-05, + "loss": 0.03150567710399628, + "step": 129160 + }, + { + "epoch": 18.33498935415188, + "grad_norm": 0.027903534471988678, + "learning_rate": 8.167295954577714e-05, + "loss": 0.011949583142995834, + "step": 129170 + }, + { + "epoch": 18.336408800567778, + "grad_norm": 0.12960362434387207, + "learning_rate": 8.167154009936126e-05, + "loss": 0.0031189464032649996, + "step": 129180 + }, + { + "epoch": 18.337828246983676, + "grad_norm": 1.6316189765930176, + "learning_rate": 8.167012065294535e-05, + "loss": 0.018705233931541443, + "step": 129190 + }, + { + "epoch": 18.339247693399575, + "grad_norm": 0.2289106547832489, + "learning_rate": 8.166870120652945e-05, + "loss": 0.00795489102602005, + "step": 129200 + }, + { + "epoch": 18.340667139815473, + "grad_norm": 10.622598648071289, + "learning_rate": 8.166728176011356e-05, + "loss": 0.015995678305625916, + "step": 129210 + }, + { + "epoch": 18.34208658623137, + "grad_norm": 0.01911492832005024, + "learning_rate": 8.166586231369766e-05, + "loss": 0.006051765382289886, + "step": 129220 + }, + { + "epoch": 18.343506032647266, + "grad_norm": 0.06445548683404922, + "learning_rate": 8.166444286728177e-05, + "loss": 0.07028995752334595, + "step": 129230 + }, + { + "epoch": 18.344925479063164, + "grad_norm": 0.06971293687820435, + "learning_rate": 8.166302342086587e-05, + "loss": 0.01684674471616745, + "step": 129240 + }, + { + "epoch": 18.346344925479062, + "grad_norm": 0.19688984751701355, + "learning_rate": 8.166160397444998e-05, + "loss": 0.0024049151688814163, + "step": 129250 + }, + { + "epoch": 18.34776437189496, + "grad_norm": 0.0021098207216709852, + "learning_rate": 8.166018452803406e-05, + "loss": 0.024603772163391113, + 
"step": 129260 + }, + { + "epoch": 18.34918381831086, + "grad_norm": 0.620466411113739, + "learning_rate": 8.165876508161817e-05, + "loss": 0.01730831414461136, + "step": 129270 + }, + { + "epoch": 18.350603264726757, + "grad_norm": 0.025463026016950607, + "learning_rate": 8.165734563520227e-05, + "loss": 0.01627231538295746, + "step": 129280 + }, + { + "epoch": 18.352022711142656, + "grad_norm": 0.13226747512817383, + "learning_rate": 8.165592618878638e-05, + "loss": 0.006072504445910454, + "step": 129290 + }, + { + "epoch": 18.35344215755855, + "grad_norm": 9.42383098602295, + "learning_rate": 8.165450674237048e-05, + "loss": 0.020982658863067626, + "step": 129300 + }, + { + "epoch": 18.35486160397445, + "grad_norm": 0.4462938904762268, + "learning_rate": 8.165308729595458e-05, + "loss": 0.03400824964046478, + "step": 129310 + }, + { + "epoch": 18.356281050390347, + "grad_norm": 2.0822784900665283, + "learning_rate": 8.165166784953869e-05, + "loss": 0.005411965027451515, + "step": 129320 + }, + { + "epoch": 18.357700496806245, + "grad_norm": 0.01747988536953926, + "learning_rate": 8.165024840312278e-05, + "loss": 0.012718930840492249, + "step": 129330 + }, + { + "epoch": 18.359119943222144, + "grad_norm": 0.06952252238988876, + "learning_rate": 8.16488289567069e-05, + "loss": 0.01600598692893982, + "step": 129340 + }, + { + "epoch": 18.360539389638042, + "grad_norm": 3.6428823471069336, + "learning_rate": 8.164740951029099e-05, + "loss": 0.005556048080325127, + "step": 129350 + }, + { + "epoch": 18.36195883605394, + "grad_norm": 2.1990156173706055, + "learning_rate": 8.164599006387509e-05, + "loss": 0.034307444095611574, + "step": 129360 + }, + { + "epoch": 18.363378282469835, + "grad_norm": 0.49816688895225525, + "learning_rate": 8.164457061745919e-05, + "loss": 0.047325742244720456, + "step": 129370 + }, + { + "epoch": 18.364797728885733, + "grad_norm": 6.322769641876221, + "learning_rate": 8.16431511710433e-05, + "loss": 0.015552473068237305, + "step": 129380 
+ }, + { + "epoch": 18.36621717530163, + "grad_norm": 0.07665533572435379, + "learning_rate": 8.16417317246274e-05, + "loss": 0.03991127610206604, + "step": 129390 + }, + { + "epoch": 18.36763662171753, + "grad_norm": 0.08783033490180969, + "learning_rate": 8.16403122782115e-05, + "loss": 0.008310206234455109, + "step": 129400 + }, + { + "epoch": 18.36905606813343, + "grad_norm": 0.8235506415367126, + "learning_rate": 8.16388928317956e-05, + "loss": 0.02230387181043625, + "step": 129410 + }, + { + "epoch": 18.370475514549327, + "grad_norm": 0.016514234244823456, + "learning_rate": 8.16374733853797e-05, + "loss": 0.004676712676882744, + "step": 129420 + }, + { + "epoch": 18.371894960965225, + "grad_norm": 6.880260944366455, + "learning_rate": 8.163605393896381e-05, + "loss": 0.04326528310775757, + "step": 129430 + }, + { + "epoch": 18.37331440738112, + "grad_norm": 15.991277694702148, + "learning_rate": 8.163463449254791e-05, + "loss": 0.03879193067550659, + "step": 129440 + }, + { + "epoch": 18.374733853797018, + "grad_norm": 0.3291440010070801, + "learning_rate": 8.163321504613202e-05, + "loss": 0.006866324692964554, + "step": 129450 + }, + { + "epoch": 18.376153300212916, + "grad_norm": 0.1651817411184311, + "learning_rate": 8.16317955997161e-05, + "loss": 0.047083538770675656, + "step": 129460 + }, + { + "epoch": 18.377572746628815, + "grad_norm": 1.0862044095993042, + "learning_rate": 8.163037615330022e-05, + "loss": 0.012643066048622132, + "step": 129470 + }, + { + "epoch": 18.378992193044713, + "grad_norm": 0.4422168433666229, + "learning_rate": 8.162895670688431e-05, + "loss": 0.00818169042468071, + "step": 129480 + }, + { + "epoch": 18.38041163946061, + "grad_norm": 5.287008762359619, + "learning_rate": 8.162753726046842e-05, + "loss": 0.012424495071172714, + "step": 129490 + }, + { + "epoch": 18.38183108587651, + "grad_norm": 0.3110944926738739, + "learning_rate": 8.162611781405252e-05, + "loss": 0.003440593555569649, + "step": 129500 + }, + { + "epoch": 
18.38183108587651, + "eval_accuracy": 0.9833407515737267, + "eval_loss": 0.061810024082660675, + "eval_runtime": 31.9536, + "eval_samples_per_second": 492.183, + "eval_steps_per_second": 15.397, + "step": 129500 + }, + { + "epoch": 18.383250532292404, + "grad_norm": 5.51362943649292, + "learning_rate": 8.162469836763662e-05, + "loss": 0.045882344245910645, + "step": 129510 + }, + { + "epoch": 18.384669978708303, + "grad_norm": 1.5791926383972168, + "learning_rate": 8.162327892122073e-05, + "loss": 0.06756555438041686, + "step": 129520 + }, + { + "epoch": 18.3860894251242, + "grad_norm": 3.9684646129608154, + "learning_rate": 8.162185947480483e-05, + "loss": 0.02608048915863037, + "step": 129530 + }, + { + "epoch": 18.3875088715401, + "grad_norm": 0.2998262643814087, + "learning_rate": 8.162044002838894e-05, + "loss": 0.02499275803565979, + "step": 129540 + }, + { + "epoch": 18.388928317955997, + "grad_norm": 4.6388020515441895, + "learning_rate": 8.161902058197303e-05, + "loss": 0.010197123885154724, + "step": 129550 + }, + { + "epoch": 18.390347764371896, + "grad_norm": 0.12832055985927582, + "learning_rate": 8.161760113555713e-05, + "loss": 0.014759251475334167, + "step": 129560 + }, + { + "epoch": 18.391767210787794, + "grad_norm": 1.8525927066802979, + "learning_rate": 8.161618168914123e-05, + "loss": 0.012668058276176453, + "step": 129570 + }, + { + "epoch": 18.39318665720369, + "grad_norm": 0.3240136504173279, + "learning_rate": 8.161476224272534e-05, + "loss": 0.024926677346229553, + "step": 129580 + }, + { + "epoch": 18.394606103619587, + "grad_norm": 5.895405292510986, + "learning_rate": 8.161334279630944e-05, + "loss": 0.05909621119499207, + "step": 129590 + }, + { + "epoch": 18.396025550035485, + "grad_norm": 6.2054948806762695, + "learning_rate": 8.161192334989355e-05, + "loss": 0.026874464750289918, + "step": 129600 + }, + { + "epoch": 18.397444996451384, + "grad_norm": 7.646547794342041, + "learning_rate": 8.161050390347766e-05, + "loss": 
0.009939579665660859, + "step": 129610 + }, + { + "epoch": 18.398864442867282, + "grad_norm": 0.11051219701766968, + "learning_rate": 8.160908445706174e-05, + "loss": 0.006405261158943176, + "step": 129620 + }, + { + "epoch": 18.40028388928318, + "grad_norm": 7.410157203674316, + "learning_rate": 8.160766501064585e-05, + "loss": 0.04566127061843872, + "step": 129630 + }, + { + "epoch": 18.40170333569908, + "grad_norm": 0.6017818450927734, + "learning_rate": 8.160624556422995e-05, + "loss": 0.041333383321762084, + "step": 129640 + }, + { + "epoch": 18.403122782114973, + "grad_norm": 7.644882678985596, + "learning_rate": 8.160482611781406e-05, + "loss": 0.008430808037519454, + "step": 129650 + }, + { + "epoch": 18.40454222853087, + "grad_norm": 0.5640460848808289, + "learning_rate": 8.160340667139816e-05, + "loss": 0.008810888230800628, + "step": 129660 + }, + { + "epoch": 18.40596167494677, + "grad_norm": 0.032049115747213364, + "learning_rate": 8.160198722498226e-05, + "loss": 0.03185172080993652, + "step": 129670 + }, + { + "epoch": 18.40738112136267, + "grad_norm": 4.317996978759766, + "learning_rate": 8.160056777856635e-05, + "loss": 0.03531225621700287, + "step": 129680 + }, + { + "epoch": 18.408800567778567, + "grad_norm": 0.04317004978656769, + "learning_rate": 8.159914833215047e-05, + "loss": 0.044870421290397644, + "step": 129690 + }, + { + "epoch": 18.410220014194465, + "grad_norm": 2.8784120082855225, + "learning_rate": 8.159772888573458e-05, + "loss": 0.028561246395111085, + "step": 129700 + }, + { + "epoch": 18.411639460610363, + "grad_norm": 0.2848926782608032, + "learning_rate": 8.159630943931867e-05, + "loss": 0.01058817058801651, + "step": 129710 + }, + { + "epoch": 18.413058907026258, + "grad_norm": 0.2673834562301636, + "learning_rate": 8.159488999290277e-05, + "loss": 0.0068480148911476135, + "step": 129720 + }, + { + "epoch": 18.414478353442156, + "grad_norm": 0.07594344019889832, + "learning_rate": 8.159347054648687e-05, + "loss": 
0.005010564252734185, + "step": 129730 + }, + { + "epoch": 18.415897799858055, + "grad_norm": 0.028032490983605385, + "learning_rate": 8.159205110007098e-05, + "loss": 0.0075709976255893706, + "step": 129740 + }, + { + "epoch": 18.417317246273953, + "grad_norm": 0.05056433379650116, + "learning_rate": 8.159063165365508e-05, + "loss": 0.021717457473278044, + "step": 129750 + }, + { + "epoch": 18.41873669268985, + "grad_norm": 0.35659167170524597, + "learning_rate": 8.158921220723919e-05, + "loss": 0.0025276631116867066, + "step": 129760 + }, + { + "epoch": 18.42015613910575, + "grad_norm": 7.861473083496094, + "learning_rate": 8.158779276082327e-05, + "loss": 0.011818940937519073, + "step": 129770 + }, + { + "epoch": 18.421575585521648, + "grad_norm": 0.0440661758184433, + "learning_rate": 8.158637331440738e-05, + "loss": 0.025470623373985292, + "step": 129780 + }, + { + "epoch": 18.422995031937543, + "grad_norm": 0.26259395480155945, + "learning_rate": 8.158495386799148e-05, + "loss": 0.03641534149646759, + "step": 129790 + }, + { + "epoch": 18.42441447835344, + "grad_norm": 0.06135106831789017, + "learning_rate": 8.158353442157559e-05, + "loss": 0.014743022620677948, + "step": 129800 + }, + { + "epoch": 18.42583392476934, + "grad_norm": 0.306878000497818, + "learning_rate": 8.15821149751597e-05, + "loss": 0.04522762894630432, + "step": 129810 + }, + { + "epoch": 18.427253371185238, + "grad_norm": 0.21729759871959686, + "learning_rate": 8.158069552874379e-05, + "loss": 0.057735615968704225, + "step": 129820 + }, + { + "epoch": 18.428672817601136, + "grad_norm": 0.7416278719902039, + "learning_rate": 8.15792760823279e-05, + "loss": 0.016473343968391417, + "step": 129830 + }, + { + "epoch": 18.430092264017034, + "grad_norm": 1.3812849521636963, + "learning_rate": 8.1577856635912e-05, + "loss": 0.007637700438499451, + "step": 129840 + }, + { + "epoch": 18.431511710432932, + "grad_norm": 8.001340866088867, + "learning_rate": 8.15764371894961e-05, + "loss": 
0.03369604349136353, + "step": 129850 + }, + { + "epoch": 18.432931156848827, + "grad_norm": 0.0657612532377243, + "learning_rate": 8.15750177430802e-05, + "loss": 0.0019177131354808808, + "step": 129860 + }, + { + "epoch": 18.434350603264726, + "grad_norm": 0.5097655653953552, + "learning_rate": 8.15735982966643e-05, + "loss": 0.023853468894958495, + "step": 129870 + }, + { + "epoch": 18.435770049680624, + "grad_norm": 0.03282688185572624, + "learning_rate": 8.15721788502484e-05, + "loss": 0.0029218826442956925, + "step": 129880 + }, + { + "epoch": 18.437189496096522, + "grad_norm": 1.0238633155822754, + "learning_rate": 8.157075940383251e-05, + "loss": 0.0808078944683075, + "step": 129890 + }, + { + "epoch": 18.43860894251242, + "grad_norm": 0.7021051645278931, + "learning_rate": 8.156933995741662e-05, + "loss": 0.011131210625171662, + "step": 129900 + }, + { + "epoch": 18.44002838892832, + "grad_norm": 0.019014930352568626, + "learning_rate": 8.156792051100072e-05, + "loss": 0.02635202705860138, + "step": 129910 + }, + { + "epoch": 18.441447835344217, + "grad_norm": 0.11540888994932175, + "learning_rate": 8.156650106458481e-05, + "loss": 0.0037258245050907136, + "step": 129920 + }, + { + "epoch": 18.442867281760112, + "grad_norm": 11.679533958435059, + "learning_rate": 8.156508161816891e-05, + "loss": 0.052882683277130124, + "step": 129930 + }, + { + "epoch": 18.44428672817601, + "grad_norm": 0.2892465889453888, + "learning_rate": 8.156366217175302e-05, + "loss": 0.029611861705780028, + "step": 129940 + }, + { + "epoch": 18.44570617459191, + "grad_norm": 0.1754387468099594, + "learning_rate": 8.156224272533712e-05, + "loss": 0.008679266273975372, + "step": 129950 + }, + { + "epoch": 18.447125621007807, + "grad_norm": 0.4253217875957489, + "learning_rate": 8.156082327892123e-05, + "loss": 0.025545185804367064, + "step": 129960 + }, + { + "epoch": 18.448545067423705, + "grad_norm": 1.0162055492401123, + "learning_rate": 8.155940383250533e-05, + "loss": 
0.0023859657347202303, + "step": 129970 + }, + { + "epoch": 18.449964513839603, + "grad_norm": 0.161495178937912, + "learning_rate": 8.155798438608943e-05, + "loss": 0.009250961244106293, + "step": 129980 + }, + { + "epoch": 18.4513839602555, + "grad_norm": 0.13088825345039368, + "learning_rate": 8.155656493967354e-05, + "loss": 0.010349231958389282, + "step": 129990 + }, + { + "epoch": 18.4528034066714, + "grad_norm": 11.125255584716797, + "learning_rate": 8.155514549325763e-05, + "loss": 0.012452618777751922, + "step": 130000 + }, + { + "epoch": 18.4528034066714, + "eval_accuracy": 0.9861384879506581, + "eval_loss": 0.04782792553305626, + "eval_runtime": 31.6384, + "eval_samples_per_second": 497.086, + "eval_steps_per_second": 15.551, + "step": 130000 + }, + { + "epoch": 18.454222853087295, + "grad_norm": 8.363743782043457, + "learning_rate": 8.155372604684174e-05, + "loss": 0.04421942234039307, + "step": 130010 + }, + { + "epoch": 18.455642299503193, + "grad_norm": 0.00922936387360096, + "learning_rate": 8.155230660042584e-05, + "loss": 0.005544114112854004, + "step": 130020 + }, + { + "epoch": 18.45706174591909, + "grad_norm": 0.04554932191967964, + "learning_rate": 8.155088715400994e-05, + "loss": 0.04884060621261597, + "step": 130030 + }, + { + "epoch": 18.45848119233499, + "grad_norm": 0.05452294275164604, + "learning_rate": 8.154946770759404e-05, + "loss": 0.01580309122800827, + "step": 130040 + }, + { + "epoch": 18.459900638750888, + "grad_norm": 0.23617449402809143, + "learning_rate": 8.154804826117815e-05, + "loss": 0.02258615642786026, + "step": 130050 + }, + { + "epoch": 18.461320085166786, + "grad_norm": 0.11298935860395432, + "learning_rate": 8.154662881476224e-05, + "loss": 0.014024610817432403, + "step": 130060 + }, + { + "epoch": 18.462739531582685, + "grad_norm": 0.07385504245758057, + "learning_rate": 8.154520936834636e-05, + "loss": 0.004758263379335404, + "step": 130070 + }, + { + "epoch": 18.46415897799858, + "grad_norm": 0.18449749052524567, 
+ "learning_rate": 8.154378992193045e-05, + "loss": 0.013836902379989625, + "step": 130080 + }, + { + "epoch": 18.465578424414478, + "grad_norm": 0.08751697838306427, + "learning_rate": 8.154237047551455e-05, + "loss": 0.04114283621311188, + "step": 130090 + }, + { + "epoch": 18.466997870830376, + "grad_norm": 0.015224000439047813, + "learning_rate": 8.154095102909866e-05, + "loss": 0.01883476823568344, + "step": 130100 + }, + { + "epoch": 18.468417317246274, + "grad_norm": 0.01533492747694254, + "learning_rate": 8.153953158268276e-05, + "loss": 0.012298651039600372, + "step": 130110 + }, + { + "epoch": 18.469836763662173, + "grad_norm": 1.5610331296920776, + "learning_rate": 8.153811213626687e-05, + "loss": 0.00967889130115509, + "step": 130120 + }, + { + "epoch": 18.47125621007807, + "grad_norm": 7.301101207733154, + "learning_rate": 8.153669268985095e-05, + "loss": 0.057591044902801515, + "step": 130130 + }, + { + "epoch": 18.47267565649397, + "grad_norm": 6.648977279663086, + "learning_rate": 8.153527324343506e-05, + "loss": 0.023354032635688783, + "step": 130140 + }, + { + "epoch": 18.474095102909864, + "grad_norm": 0.012980490922927856, + "learning_rate": 8.153385379701916e-05, + "loss": 0.02406987249851227, + "step": 130150 + }, + { + "epoch": 18.475514549325762, + "grad_norm": 0.22776415944099426, + "learning_rate": 8.153243435060327e-05, + "loss": 0.007413412630558014, + "step": 130160 + }, + { + "epoch": 18.47693399574166, + "grad_norm": 1.5166367292404175, + "learning_rate": 8.153101490418737e-05, + "loss": 0.06717569828033447, + "step": 130170 + }, + { + "epoch": 18.47835344215756, + "grad_norm": 2.784849166870117, + "learning_rate": 8.152959545777147e-05, + "loss": 0.031895536184310916, + "step": 130180 + }, + { + "epoch": 18.479772888573457, + "grad_norm": 0.029781892895698547, + "learning_rate": 8.152817601135558e-05, + "loss": 0.04838870763778687, + "step": 130190 + }, + { + "epoch": 18.481192334989355, + "grad_norm": 6.059544086456299, + 
"learning_rate": 8.152675656493968e-05, + "loss": 0.12346867322921753, + "step": 130200 + }, + { + "epoch": 18.482611781405254, + "grad_norm": 2.957576274871826, + "learning_rate": 8.152533711852379e-05, + "loss": 0.02838844358921051, + "step": 130210 + }, + { + "epoch": 18.48403122782115, + "grad_norm": 0.07354004681110382, + "learning_rate": 8.152391767210788e-05, + "loss": 0.06742051839828492, + "step": 130220 + }, + { + "epoch": 18.485450674237047, + "grad_norm": 0.6618748307228088, + "learning_rate": 8.152249822569198e-05, + "loss": 0.02543102204799652, + "step": 130230 + }, + { + "epoch": 18.486870120652945, + "grad_norm": 1.7090263366699219, + "learning_rate": 8.152107877927608e-05, + "loss": 0.02140275239944458, + "step": 130240 + }, + { + "epoch": 18.488289567068843, + "grad_norm": 1.1480499505996704, + "learning_rate": 8.151965933286019e-05, + "loss": 0.031545788049697876, + "step": 130250 + }, + { + "epoch": 18.48970901348474, + "grad_norm": 4.303988933563232, + "learning_rate": 8.151823988644429e-05, + "loss": 0.004767316952347755, + "step": 130260 + }, + { + "epoch": 18.49112845990064, + "grad_norm": 0.045829661190509796, + "learning_rate": 8.15168204400284e-05, + "loss": 0.015709532797336577, + "step": 130270 + }, + { + "epoch": 18.49254790631654, + "grad_norm": 0.04191277548670769, + "learning_rate": 8.15154009936125e-05, + "loss": 0.003898696228861809, + "step": 130280 + }, + { + "epoch": 18.493967352732433, + "grad_norm": 0.24794617295265198, + "learning_rate": 8.151398154719659e-05, + "loss": 0.005687400698661804, + "step": 130290 + }, + { + "epoch": 18.49538679914833, + "grad_norm": 9.907116889953613, + "learning_rate": 8.15125621007807e-05, + "loss": 0.028389474749565123, + "step": 130300 + }, + { + "epoch": 18.49680624556423, + "grad_norm": 0.5462149381637573, + "learning_rate": 8.15111426543648e-05, + "loss": 0.0044613339006900786, + "step": 130310 + }, + { + "epoch": 18.498225691980128, + "grad_norm": 0.2734536826610565, + "learning_rate": 
8.150972320794891e-05, + "loss": 0.003214791417121887, + "step": 130320 + }, + { + "epoch": 18.499645138396026, + "grad_norm": 0.014535377733409405, + "learning_rate": 8.150830376153301e-05, + "loss": 0.031720873713493344, + "step": 130330 + }, + { + "epoch": 18.501064584811925, + "grad_norm": 0.18706893920898438, + "learning_rate": 8.150688431511711e-05, + "loss": 0.009772472828626633, + "step": 130340 + }, + { + "epoch": 18.502484031227823, + "grad_norm": 3.6424851417541504, + "learning_rate": 8.15054648687012e-05, + "loss": 0.04417179822921753, + "step": 130350 + }, + { + "epoch": 18.503903477643718, + "grad_norm": 0.017430992797017097, + "learning_rate": 8.150404542228532e-05, + "loss": 0.015689238905906677, + "step": 130360 + }, + { + "epoch": 18.505322924059616, + "grad_norm": 0.010013514198362827, + "learning_rate": 8.150262597586941e-05, + "loss": 0.014602565765380859, + "step": 130370 + }, + { + "epoch": 18.506742370475514, + "grad_norm": 0.896862268447876, + "learning_rate": 8.150120652945352e-05, + "loss": 0.0020655494183301924, + "step": 130380 + }, + { + "epoch": 18.508161816891413, + "grad_norm": 3.901364326477051, + "learning_rate": 8.149978708303762e-05, + "loss": 0.013213767111301422, + "step": 130390 + }, + { + "epoch": 18.50958126330731, + "grad_norm": 2.040121078491211, + "learning_rate": 8.149836763662172e-05, + "loss": 0.018503423035144805, + "step": 130400 + }, + { + "epoch": 18.51100070972321, + "grad_norm": 0.017969397827982903, + "learning_rate": 8.149694819020583e-05, + "loss": 0.009773757308721542, + "step": 130410 + }, + { + "epoch": 18.512420156139108, + "grad_norm": 0.004456004127860069, + "learning_rate": 8.149552874378993e-05, + "loss": 0.00311664380133152, + "step": 130420 + }, + { + "epoch": 18.513839602555002, + "grad_norm": 2.9926350116729736, + "learning_rate": 8.149410929737404e-05, + "loss": 0.02173994779586792, + "step": 130430 + }, + { + "epoch": 18.5152590489709, + "grad_norm": 0.16839352250099182, + "learning_rate": 
8.149268985095812e-05, + "loss": 0.016191045939922332, + "step": 130440 + }, + { + "epoch": 18.5166784953868, + "grad_norm": 0.08863834291696548, + "learning_rate": 8.149127040454223e-05, + "loss": 0.022671495378017426, + "step": 130450 + }, + { + "epoch": 18.518097941802697, + "grad_norm": 1.1707170009613037, + "learning_rate": 8.148985095812633e-05, + "loss": 0.004408159479498863, + "step": 130460 + }, + { + "epoch": 18.519517388218595, + "grad_norm": 0.09221091866493225, + "learning_rate": 8.148843151171044e-05, + "loss": 0.01547957956790924, + "step": 130470 + }, + { + "epoch": 18.520936834634494, + "grad_norm": 0.13105377554893494, + "learning_rate": 8.148701206529454e-05, + "loss": 0.018665242195129394, + "step": 130480 + }, + { + "epoch": 18.522356281050392, + "grad_norm": 1.4174994230270386, + "learning_rate": 8.148559261887864e-05, + "loss": 0.027663955092430116, + "step": 130490 + }, + { + "epoch": 18.523775727466287, + "grad_norm": 8.597201347351074, + "learning_rate": 8.148417317246275e-05, + "loss": 0.0659090518951416, + "step": 130500 + }, + { + "epoch": 18.523775727466287, + "eval_accuracy": 0.9857569784447129, + "eval_loss": 0.05306238681077957, + "eval_runtime": 32.6311, + "eval_samples_per_second": 481.963, + "eval_steps_per_second": 15.078, + "step": 130500 + }, + { + "epoch": 18.525195173882185, + "grad_norm": 3.4279048442840576, + "learning_rate": 8.148275372604684e-05, + "loss": 0.030299320816993713, + "step": 130510 + }, + { + "epoch": 18.526614620298083, + "grad_norm": 2.21703839302063, + "learning_rate": 8.148133427963095e-05, + "loss": 0.015195395052433013, + "step": 130520 + }, + { + "epoch": 18.528034066713982, + "grad_norm": 0.005461210384964943, + "learning_rate": 8.147991483321505e-05, + "loss": 0.006234246119856834, + "step": 130530 + }, + { + "epoch": 18.52945351312988, + "grad_norm": 6.675775051116943, + "learning_rate": 8.147849538679915e-05, + "loss": 0.023515474796295167, + "step": 130540 + }, + { + "epoch": 18.53087295954578, + 
"grad_norm": 2.0605430603027344, + "learning_rate": 8.147707594038325e-05, + "loss": 0.028863516449928284, + "step": 130550 + }, + { + "epoch": 18.532292405961677, + "grad_norm": 0.08040369302034378, + "learning_rate": 8.147565649396736e-05, + "loss": 0.015750017762184144, + "step": 130560 + }, + { + "epoch": 18.53371185237757, + "grad_norm": 10.392151832580566, + "learning_rate": 8.147423704755146e-05, + "loss": 0.051103293895721436, + "step": 130570 + }, + { + "epoch": 18.53513129879347, + "grad_norm": 0.018073299899697304, + "learning_rate": 8.147281760113557e-05, + "loss": 0.01106690764427185, + "step": 130580 + }, + { + "epoch": 18.536550745209368, + "grad_norm": 7.361319065093994, + "learning_rate": 8.147139815471966e-05, + "loss": 0.016362231969833375, + "step": 130590 + }, + { + "epoch": 18.537970191625266, + "grad_norm": 0.07301932573318481, + "learning_rate": 8.146997870830376e-05, + "loss": 0.03403286039829254, + "step": 130600 + }, + { + "epoch": 18.539389638041165, + "grad_norm": 2.030935764312744, + "learning_rate": 8.146855926188787e-05, + "loss": 0.04147332310676575, + "step": 130610 + }, + { + "epoch": 18.540809084457063, + "grad_norm": 0.02401145175099373, + "learning_rate": 8.146713981547197e-05, + "loss": 0.060898661613464355, + "step": 130620 + }, + { + "epoch": 18.54222853087296, + "grad_norm": 0.40541765093803406, + "learning_rate": 8.146572036905608e-05, + "loss": 0.0188491553068161, + "step": 130630 + }, + { + "epoch": 18.543647977288856, + "grad_norm": 0.05343012139201164, + "learning_rate": 8.146430092264016e-05, + "loss": 0.015254667401313782, + "step": 130640 + }, + { + "epoch": 18.545067423704754, + "grad_norm": 2.4271240234375, + "learning_rate": 8.146288147622427e-05, + "loss": 0.004261807724833489, + "step": 130650 + }, + { + "epoch": 18.546486870120653, + "grad_norm": 0.08628285676240921, + "learning_rate": 8.146146202980837e-05, + "loss": 0.0023907829076051713, + "step": 130660 + }, + { + "epoch": 18.54790631653655, + "grad_norm": 
0.017023751512169838, + "learning_rate": 8.146004258339248e-05, + "loss": 0.026684251427650452, + "step": 130670 + }, + { + "epoch": 18.54932576295245, + "grad_norm": 0.0902213305234909, + "learning_rate": 8.145862313697658e-05, + "loss": 0.01770784556865692, + "step": 130680 + }, + { + "epoch": 18.550745209368348, + "grad_norm": 0.03622569516301155, + "learning_rate": 8.145720369056069e-05, + "loss": 0.01470385491847992, + "step": 130690 + }, + { + "epoch": 18.552164655784246, + "grad_norm": 0.4535423517227173, + "learning_rate": 8.145578424414479e-05, + "loss": 0.007574498653411865, + "step": 130700 + }, + { + "epoch": 18.55358410220014, + "grad_norm": 0.06368441134691238, + "learning_rate": 8.145436479772889e-05, + "loss": 0.05138496160507202, + "step": 130710 + }, + { + "epoch": 18.55500354861604, + "grad_norm": 0.03241115063428879, + "learning_rate": 8.1452945351313e-05, + "loss": 0.04685293734073639, + "step": 130720 + }, + { + "epoch": 18.556422995031937, + "grad_norm": 2.8476345539093018, + "learning_rate": 8.14515259048971e-05, + "loss": 0.024661506712436675, + "step": 130730 + }, + { + "epoch": 18.557842441447836, + "grad_norm": 1.1679760217666626, + "learning_rate": 8.14501064584812e-05, + "loss": 0.006464455276727676, + "step": 130740 + }, + { + "epoch": 18.559261887863734, + "grad_norm": 0.24937348067760468, + "learning_rate": 8.144868701206529e-05, + "loss": 0.026166808605194092, + "step": 130750 + }, + { + "epoch": 18.560681334279632, + "grad_norm": 2.66959285736084, + "learning_rate": 8.14472675656494e-05, + "loss": 0.0420440137386322, + "step": 130760 + }, + { + "epoch": 18.56210078069553, + "grad_norm": 13.330845832824707, + "learning_rate": 8.14458481192335e-05, + "loss": 0.08298577070236206, + "step": 130770 + }, + { + "epoch": 18.563520227111425, + "grad_norm": 0.13030719757080078, + "learning_rate": 8.144442867281761e-05, + "loss": 0.0045199781656265255, + "step": 130780 + }, + { + "epoch": 18.564939673527324, + "grad_norm": 
0.3060011863708496, + "learning_rate": 8.14430092264017e-05, + "loss": 0.022911277413368226, + "step": 130790 + }, + { + "epoch": 18.566359119943222, + "grad_norm": 7.349360466003418, + "learning_rate": 8.14415897799858e-05, + "loss": 0.05471324324607849, + "step": 130800 + }, + { + "epoch": 18.56777856635912, + "grad_norm": 4.4315714836120605, + "learning_rate": 8.144017033356991e-05, + "loss": 0.013483393192291259, + "step": 130810 + }, + { + "epoch": 18.56919801277502, + "grad_norm": 1.7409733533859253, + "learning_rate": 8.143875088715401e-05, + "loss": 0.01805341839790344, + "step": 130820 + }, + { + "epoch": 18.570617459190917, + "grad_norm": 0.010257486253976822, + "learning_rate": 8.143733144073812e-05, + "loss": 0.016329763829708098, + "step": 130830 + }, + { + "epoch": 18.572036905606815, + "grad_norm": 11.223624229431152, + "learning_rate": 8.143591199432222e-05, + "loss": 0.07738075852394104, + "step": 130840 + }, + { + "epoch": 18.57345635202271, + "grad_norm": 0.13674846291542053, + "learning_rate": 8.143449254790632e-05, + "loss": 0.01924886107444763, + "step": 130850 + }, + { + "epoch": 18.574875798438608, + "grad_norm": 0.4946594834327698, + "learning_rate": 8.143307310149041e-05, + "loss": 0.029023009538650512, + "step": 130860 + }, + { + "epoch": 18.576295244854506, + "grad_norm": 0.5943418145179749, + "learning_rate": 8.143165365507453e-05, + "loss": 0.02415831983089447, + "step": 130870 + }, + { + "epoch": 18.577714691270405, + "grad_norm": 0.08107198774814606, + "learning_rate": 8.143023420865862e-05, + "loss": 0.003249601274728775, + "step": 130880 + }, + { + "epoch": 18.579134137686303, + "grad_norm": 0.24566154181957245, + "learning_rate": 8.142881476224273e-05, + "loss": 0.0384760707616806, + "step": 130890 + }, + { + "epoch": 18.5805535841022, + "grad_norm": 4.109685897827148, + "learning_rate": 8.142739531582683e-05, + "loss": 0.006746883690357208, + "step": 130900 + }, + { + "epoch": 18.5819730305181, + "grad_norm": 0.12463272362947464, 
+ "learning_rate": 8.142597586941093e-05, + "loss": 0.021722891926765443, + "step": 130910 + }, + { + "epoch": 18.583392476933994, + "grad_norm": 0.04137643799185753, + "learning_rate": 8.142455642299504e-05, + "loss": 0.0485242635011673, + "step": 130920 + }, + { + "epoch": 18.584811923349893, + "grad_norm": 0.5120498538017273, + "learning_rate": 8.142313697657914e-05, + "loss": 0.01771296113729477, + "step": 130930 + }, + { + "epoch": 18.58623136976579, + "grad_norm": 0.3289685845375061, + "learning_rate": 8.142171753016325e-05, + "loss": 0.010862819850444794, + "step": 130940 + }, + { + "epoch": 18.58765081618169, + "grad_norm": 0.8856240510940552, + "learning_rate": 8.142029808374733e-05, + "loss": 0.01859651505947113, + "step": 130950 + }, + { + "epoch": 18.589070262597588, + "grad_norm": 11.222086906433105, + "learning_rate": 8.141887863733144e-05, + "loss": 0.038225024938583374, + "step": 130960 + }, + { + "epoch": 18.590489709013486, + "grad_norm": 9.186075210571289, + "learning_rate": 8.141745919091554e-05, + "loss": 0.027800050377845765, + "step": 130970 + }, + { + "epoch": 18.591909155429384, + "grad_norm": 2.4053432941436768, + "learning_rate": 8.141603974449965e-05, + "loss": 0.009478311240673064, + "step": 130980 + }, + { + "epoch": 18.59332860184528, + "grad_norm": 0.3215068578720093, + "learning_rate": 8.141462029808375e-05, + "loss": 0.050688672065734866, + "step": 130990 + }, + { + "epoch": 18.594748048261177, + "grad_norm": 1.0421249866485596, + "learning_rate": 8.141320085166785e-05, + "loss": 0.013289576768875122, + "step": 131000 + }, + { + "epoch": 18.594748048261177, + "eval_accuracy": 0.9817511286322884, + "eval_loss": 0.07066646963357925, + "eval_runtime": 33.9117, + "eval_samples_per_second": 463.763, + "eval_steps_per_second": 14.508, + "step": 131000 + }, + { + "epoch": 18.596167494677076, + "grad_norm": 0.13267114758491516, + "learning_rate": 8.141178140525196e-05, + "loss": 0.009116743505001069, + "step": 131010 + }, + { + "epoch": 
18.597586941092974, + "grad_norm": 9.238930702209473, + "learning_rate": 8.141036195883605e-05, + "loss": 0.03145955801010132, + "step": 131020 + }, + { + "epoch": 18.599006387508872, + "grad_norm": 0.05779772996902466, + "learning_rate": 8.140894251242016e-05, + "loss": 0.011149019002914429, + "step": 131030 + }, + { + "epoch": 18.60042583392477, + "grad_norm": 0.08412010222673416, + "learning_rate": 8.140752306600426e-05, + "loss": 0.04695343375205994, + "step": 131040 + }, + { + "epoch": 18.60184528034067, + "grad_norm": 1.983355164527893, + "learning_rate": 8.140610361958837e-05, + "loss": 0.0149237260222435, + "step": 131050 + }, + { + "epoch": 18.603264726756564, + "grad_norm": 9.849051475524902, + "learning_rate": 8.140468417317246e-05, + "loss": 0.06015897989273071, + "step": 131060 + }, + { + "epoch": 18.604684173172462, + "grad_norm": 3.288647413253784, + "learning_rate": 8.140326472675657e-05, + "loss": 0.020872029662132262, + "step": 131070 + }, + { + "epoch": 18.60610361958836, + "grad_norm": 0.6425996422767639, + "learning_rate": 8.140184528034067e-05, + "loss": 0.02088252305984497, + "step": 131080 + }, + { + "epoch": 18.60752306600426, + "grad_norm": 1.5062687397003174, + "learning_rate": 8.140042583392478e-05, + "loss": 0.02488507628440857, + "step": 131090 + }, + { + "epoch": 18.608942512420157, + "grad_norm": 12.218483924865723, + "learning_rate": 8.139900638750889e-05, + "loss": 0.04160102307796478, + "step": 131100 + }, + { + "epoch": 18.610361958836055, + "grad_norm": 3.886253595352173, + "learning_rate": 8.139758694109297e-05, + "loss": 0.012016545236110687, + "step": 131110 + }, + { + "epoch": 18.611781405251953, + "grad_norm": 6.686439037322998, + "learning_rate": 8.139616749467708e-05, + "loss": 0.012572765350341797, + "step": 131120 + }, + { + "epoch": 18.613200851667848, + "grad_norm": 0.027732079848647118, + "learning_rate": 8.139474804826118e-05, + "loss": 0.0567524254322052, + "step": 131130 + }, + { + "epoch": 18.614620298083747, + 
"grad_norm": 0.08063772320747375, + "learning_rate": 8.139332860184529e-05, + "loss": 0.0034560371190309525, + "step": 131140 + }, + { + "epoch": 18.616039744499645, + "grad_norm": 0.16642643511295319, + "learning_rate": 8.139190915542939e-05, + "loss": 0.024695418775081635, + "step": 131150 + }, + { + "epoch": 18.617459190915543, + "grad_norm": 1.7551262378692627, + "learning_rate": 8.139048970901348e-05, + "loss": 0.026375973224639894, + "step": 131160 + }, + { + "epoch": 18.61887863733144, + "grad_norm": 7.997278213500977, + "learning_rate": 8.138907026259758e-05, + "loss": 0.01700562834739685, + "step": 131170 + }, + { + "epoch": 18.62029808374734, + "grad_norm": 1.091167688369751, + "learning_rate": 8.138765081618169e-05, + "loss": 0.009547965228557586, + "step": 131180 + }, + { + "epoch": 18.621717530163238, + "grad_norm": 11.500609397888184, + "learning_rate": 8.13862313697658e-05, + "loss": 0.07945277094841004, + "step": 131190 + }, + { + "epoch": 18.623136976579133, + "grad_norm": 6.0473432540893555, + "learning_rate": 8.13848119233499e-05, + "loss": 0.014636990427970887, + "step": 131200 + }, + { + "epoch": 18.62455642299503, + "grad_norm": 0.04876257851719856, + "learning_rate": 8.1383392476934e-05, + "loss": 0.039440539479255673, + "step": 131210 + }, + { + "epoch": 18.62597586941093, + "grad_norm": 1.9998284578323364, + "learning_rate": 8.13819730305181e-05, + "loss": 0.0699110209941864, + "step": 131220 + }, + { + "epoch": 18.627395315826828, + "grad_norm": 0.19066566228866577, + "learning_rate": 8.138055358410221e-05, + "loss": 0.017888715863227843, + "step": 131230 + }, + { + "epoch": 18.628814762242726, + "grad_norm": 0.1404818296432495, + "learning_rate": 8.13791341376863e-05, + "loss": 0.02370113581418991, + "step": 131240 + }, + { + "epoch": 18.630234208658624, + "grad_norm": 4.469192028045654, + "learning_rate": 8.137771469127042e-05, + "loss": 0.012822465598583221, + "step": 131250 + }, + { + "epoch": 18.631653655074523, + "grad_norm": 
4.36820650100708, + "learning_rate": 8.13762952448545e-05, + "loss": 0.01055520549416542, + "step": 131260 + }, + { + "epoch": 18.633073101490417, + "grad_norm": 0.11531079560518265, + "learning_rate": 8.137487579843861e-05, + "loss": 0.005679406225681305, + "step": 131270 + }, + { + "epoch": 18.634492547906316, + "grad_norm": 0.1494099348783493, + "learning_rate": 8.137345635202272e-05, + "loss": 0.011889305710792542, + "step": 131280 + }, + { + "epoch": 18.635911994322214, + "grad_norm": 1.323119044303894, + "learning_rate": 8.137203690560682e-05, + "loss": 0.027772819995880126, + "step": 131290 + }, + { + "epoch": 18.637331440738112, + "grad_norm": 0.19452044367790222, + "learning_rate": 8.137061745919093e-05, + "loss": 0.022980180382728577, + "step": 131300 + }, + { + "epoch": 18.63875088715401, + "grad_norm": 0.009213218465447426, + "learning_rate": 8.136919801277501e-05, + "loss": 0.011797212064266205, + "step": 131310 + }, + { + "epoch": 18.64017033356991, + "grad_norm": 12.117408752441406, + "learning_rate": 8.136777856635912e-05, + "loss": 0.01006176769733429, + "step": 131320 + }, + { + "epoch": 18.641589779985807, + "grad_norm": 0.7869730591773987, + "learning_rate": 8.136635911994322e-05, + "loss": 0.014416629076004028, + "step": 131330 + }, + { + "epoch": 18.643009226401702, + "grad_norm": 0.23337319493293762, + "learning_rate": 8.136493967352733e-05, + "loss": 0.008844228088855743, + "step": 131340 + }, + { + "epoch": 18.6444286728176, + "grad_norm": 1.930612564086914, + "learning_rate": 8.136352022711143e-05, + "loss": 0.014108307659626007, + "step": 131350 + }, + { + "epoch": 18.6458481192335, + "grad_norm": 9.3480224609375, + "learning_rate": 8.136210078069553e-05, + "loss": 0.04676141738891602, + "step": 131360 + }, + { + "epoch": 18.647267565649397, + "grad_norm": 12.945405960083008, + "learning_rate": 8.136068133427964e-05, + "loss": 0.04466235339641571, + "step": 131370 + }, + { + "epoch": 18.648687012065295, + "grad_norm": 3.8032307624816895, 
+ "learning_rate": 8.135926188786374e-05, + "loss": 0.013353703916072846, + "step": 131380 + }, + { + "epoch": 18.650106458481194, + "grad_norm": 0.9862574338912964, + "learning_rate": 8.135784244144785e-05, + "loss": 0.04951807558536529, + "step": 131390 + }, + { + "epoch": 18.651525904897092, + "grad_norm": 6.689640522003174, + "learning_rate": 8.135642299503194e-05, + "loss": 0.019863298535346983, + "step": 131400 + }, + { + "epoch": 18.652945351312987, + "grad_norm": 0.01660931296646595, + "learning_rate": 8.135500354861605e-05, + "loss": 0.005930447950959206, + "step": 131410 + }, + { + "epoch": 18.654364797728885, + "grad_norm": 0.058415528386831284, + "learning_rate": 8.135358410220014e-05, + "loss": 0.06661216616630554, + "step": 131420 + }, + { + "epoch": 18.655784244144783, + "grad_norm": 18.33035659790039, + "learning_rate": 8.135216465578425e-05, + "loss": 0.03377198576927185, + "step": 131430 + }, + { + "epoch": 18.65720369056068, + "grad_norm": 1.1209580898284912, + "learning_rate": 8.135074520936835e-05, + "loss": 0.019080647826194765, + "step": 131440 + }, + { + "epoch": 18.65862313697658, + "grad_norm": 0.12373685091733932, + "learning_rate": 8.134932576295246e-05, + "loss": 0.02793894410133362, + "step": 131450 + }, + { + "epoch": 18.660042583392478, + "grad_norm": 1.2706899642944336, + "learning_rate": 8.134790631653656e-05, + "loss": 0.0763306200504303, + "step": 131460 + }, + { + "epoch": 18.661462029808376, + "grad_norm": 0.7054558992385864, + "learning_rate": 8.134648687012065e-05, + "loss": 0.009234672784805298, + "step": 131470 + }, + { + "epoch": 18.66288147622427, + "grad_norm": 1.8768150806427002, + "learning_rate": 8.134506742370476e-05, + "loss": 0.04116539061069489, + "step": 131480 + }, + { + "epoch": 18.66430092264017, + "grad_norm": 2.3796298503875732, + "learning_rate": 8.134364797728886e-05, + "loss": 0.004192651808261871, + "step": 131490 + }, + { + "epoch": 18.665720369056068, + "grad_norm": 0.31088367104530334, + 
"learning_rate": 8.134222853087297e-05, + "loss": 0.020734292268753052, + "step": 131500 + }, + { + "epoch": 18.665720369056068, + "eval_accuracy": 0.9800979207731926, + "eval_loss": 0.07913683354854584, + "eval_runtime": 32.8344, + "eval_samples_per_second": 478.98, + "eval_steps_per_second": 14.984, + "step": 131500 + }, + { + "epoch": 18.667139815471966, + "grad_norm": 0.008138231933116913, + "learning_rate": 8.134080908445707e-05, + "loss": 0.02487410753965378, + "step": 131510 + }, + { + "epoch": 18.668559261887864, + "grad_norm": 0.1465875208377838, + "learning_rate": 8.133938963804117e-05, + "loss": 0.009092245995998383, + "step": 131520 + }, + { + "epoch": 18.669978708303763, + "grad_norm": 7.406838893890381, + "learning_rate": 8.133797019162526e-05, + "loss": 0.01487957090139389, + "step": 131530 + }, + { + "epoch": 18.67139815471966, + "grad_norm": 3.811093807220459, + "learning_rate": 8.133655074520937e-05, + "loss": 0.05009844303131104, + "step": 131540 + }, + { + "epoch": 18.672817601135556, + "grad_norm": 6.526002407073975, + "learning_rate": 8.133513129879347e-05, + "loss": 0.053089505434036253, + "step": 131550 + }, + { + "epoch": 18.674237047551454, + "grad_norm": 11.075752258300781, + "learning_rate": 8.133371185237758e-05, + "loss": 0.06426833271980285, + "step": 131560 + }, + { + "epoch": 18.675656493967352, + "grad_norm": 2.904355764389038, + "learning_rate": 8.133229240596168e-05, + "loss": 0.057389688491821286, + "step": 131570 + }, + { + "epoch": 18.67707594038325, + "grad_norm": 0.01689119264483452, + "learning_rate": 8.133087295954578e-05, + "loss": 0.01457415074110031, + "step": 131580 + }, + { + "epoch": 18.67849538679915, + "grad_norm": 2.3161025047302246, + "learning_rate": 8.132945351312989e-05, + "loss": 0.022415342926979064, + "step": 131590 + }, + { + "epoch": 18.679914833215047, + "grad_norm": 2.4645822048187256, + "learning_rate": 8.132803406671399e-05, + "loss": 0.011522973328828812, + "step": 131600 + }, + { + "epoch": 
18.681334279630946, + "grad_norm": 0.06506139785051346, + "learning_rate": 8.13266146202981e-05, + "loss": 0.04800006151199341, + "step": 131610 + }, + { + "epoch": 18.68275372604684, + "grad_norm": 0.1301409751176834, + "learning_rate": 8.132519517388218e-05, + "loss": 0.021942104399204253, + "step": 131620 + }, + { + "epoch": 18.68417317246274, + "grad_norm": 0.07169385999441147, + "learning_rate": 8.132377572746629e-05, + "loss": 0.00830610990524292, + "step": 131630 + }, + { + "epoch": 18.685592618878637, + "grad_norm": 3.7510461807250977, + "learning_rate": 8.132235628105039e-05, + "loss": 0.004536581039428711, + "step": 131640 + }, + { + "epoch": 18.687012065294535, + "grad_norm": 8.096920013427734, + "learning_rate": 8.13209368346345e-05, + "loss": 0.023867668211460115, + "step": 131650 + }, + { + "epoch": 18.688431511710434, + "grad_norm": 2.1872310638427734, + "learning_rate": 8.13195173882186e-05, + "loss": 0.008351977169513702, + "step": 131660 + }, + { + "epoch": 18.689850958126332, + "grad_norm": 11.362651824951172, + "learning_rate": 8.13180979418027e-05, + "loss": 0.01560388207435608, + "step": 131670 + }, + { + "epoch": 18.69127040454223, + "grad_norm": 0.3655797839164734, + "learning_rate": 8.13166784953868e-05, + "loss": 0.009426388144493102, + "step": 131680 + }, + { + "epoch": 18.692689850958125, + "grad_norm": 0.06773939728736877, + "learning_rate": 8.13152590489709e-05, + "loss": 0.008931878209114074, + "step": 131690 + }, + { + "epoch": 18.694109297374023, + "grad_norm": 0.47058501839637756, + "learning_rate": 8.131383960255501e-05, + "loss": 0.0036383919417858125, + "step": 131700 + }, + { + "epoch": 18.69552874378992, + "grad_norm": 0.07784921675920486, + "learning_rate": 8.131242015613911e-05, + "loss": 0.02547445297241211, + "step": 131710 + }, + { + "epoch": 18.69694819020582, + "grad_norm": 1.0401661396026611, + "learning_rate": 8.131100070972322e-05, + "loss": 0.008947336673736572, + "step": 131720 + }, + { + "epoch": 
18.698367636621718, + "grad_norm": 1.7095261812210083, + "learning_rate": 8.13095812633073e-05, + "loss": 0.012136232852935792, + "step": 131730 + }, + { + "epoch": 18.699787083037616, + "grad_norm": 3.366201639175415, + "learning_rate": 8.130816181689142e-05, + "loss": 0.005720870569348335, + "step": 131740 + }, + { + "epoch": 18.701206529453515, + "grad_norm": 0.0334438718855381, + "learning_rate": 8.130674237047551e-05, + "loss": 0.007759307324886322, + "step": 131750 + }, + { + "epoch": 18.70262597586941, + "grad_norm": 0.043541185557842255, + "learning_rate": 8.130532292405963e-05, + "loss": 0.01564426124095917, + "step": 131760 + }, + { + "epoch": 18.704045422285308, + "grad_norm": 0.08072488754987717, + "learning_rate": 8.130390347764372e-05, + "loss": 0.005764240026473999, + "step": 131770 + }, + { + "epoch": 18.705464868701206, + "grad_norm": 0.24349243938922882, + "learning_rate": 8.130248403122782e-05, + "loss": 0.015592911839485168, + "step": 131780 + }, + { + "epoch": 18.706884315117104, + "grad_norm": 0.29928338527679443, + "learning_rate": 8.130106458481193e-05, + "loss": 0.009814509004354478, + "step": 131790 + }, + { + "epoch": 18.708303761533003, + "grad_norm": 0.3477325737476349, + "learning_rate": 8.129964513839603e-05, + "loss": 0.005279907211661339, + "step": 131800 + }, + { + "epoch": 18.7097232079489, + "grad_norm": 7.982736110687256, + "learning_rate": 8.129822569198014e-05, + "loss": 0.04246063828468323, + "step": 131810 + }, + { + "epoch": 18.7111426543648, + "grad_norm": 0.4821631610393524, + "learning_rate": 8.129680624556424e-05, + "loss": 0.005226002261042595, + "step": 131820 + }, + { + "epoch": 18.712562100780694, + "grad_norm": 11.368393898010254, + "learning_rate": 8.129538679914833e-05, + "loss": 0.0491435170173645, + "step": 131830 + }, + { + "epoch": 18.713981547196592, + "grad_norm": 0.7549554705619812, + "learning_rate": 8.129396735273243e-05, + "loss": 0.0267733097076416, + "step": 131840 + }, + { + "epoch": 
18.71540099361249, + "grad_norm": 4.594950199127197, + "learning_rate": 8.129254790631654e-05, + "loss": 0.010378662496805191, + "step": 131850 + }, + { + "epoch": 18.71682044002839, + "grad_norm": 0.2322712242603302, + "learning_rate": 8.129112845990064e-05, + "loss": 0.013620153069496155, + "step": 131860 + }, + { + "epoch": 18.718239886444287, + "grad_norm": 4.297412395477295, + "learning_rate": 8.128970901348475e-05, + "loss": 0.016456304490566252, + "step": 131870 + }, + { + "epoch": 18.719659332860186, + "grad_norm": 0.2964388430118561, + "learning_rate": 8.128828956706885e-05, + "loss": 0.025118935108184814, + "step": 131880 + }, + { + "epoch": 18.721078779276084, + "grad_norm": 3.0489017963409424, + "learning_rate": 8.128687012065295e-05, + "loss": 0.029944658279418945, + "step": 131890 + }, + { + "epoch": 18.72249822569198, + "grad_norm": 0.9195518493652344, + "learning_rate": 8.128545067423706e-05, + "loss": 0.04168401956558228, + "step": 131900 + }, + { + "epoch": 18.723917672107877, + "grad_norm": 0.03696005046367645, + "learning_rate": 8.128403122782115e-05, + "loss": 0.01858309209346771, + "step": 131910 + }, + { + "epoch": 18.725337118523775, + "grad_norm": 0.17158129811286926, + "learning_rate": 8.128261178140526e-05, + "loss": 0.011418993771076202, + "step": 131920 + }, + { + "epoch": 18.726756564939674, + "grad_norm": 2.986250638961792, + "learning_rate": 8.128119233498935e-05, + "loss": 0.04073112905025482, + "step": 131930 + }, + { + "epoch": 18.728176011355572, + "grad_norm": 2.4232630729675293, + "learning_rate": 8.127977288857346e-05, + "loss": 0.00799041911959648, + "step": 131940 + }, + { + "epoch": 18.72959545777147, + "grad_norm": 2.225045919418335, + "learning_rate": 8.127835344215756e-05, + "loss": 0.010526900738477707, + "step": 131950 + }, + { + "epoch": 18.73101490418737, + "grad_norm": 2.6924684047698975, + "learning_rate": 8.127693399574167e-05, + "loss": 0.010417158901691436, + "step": 131960 + }, + { + "epoch": 
18.732434350603263, + "grad_norm": 0.05875149741768837, + "learning_rate": 8.127551454932577e-05, + "loss": 0.02316032499074936, + "step": 131970 + }, + { + "epoch": 18.73385379701916, + "grad_norm": 0.05400659516453743, + "learning_rate": 8.127409510290986e-05, + "loss": 0.07833380699157715, + "step": 131980 + }, + { + "epoch": 18.73527324343506, + "grad_norm": 0.07178810238838196, + "learning_rate": 8.127267565649397e-05, + "loss": 0.044096097350120544, + "step": 131990 + }, + { + "epoch": 18.73669268985096, + "grad_norm": 0.25669190287590027, + "learning_rate": 8.127125621007807e-05, + "loss": 0.030655372142791747, + "step": 132000 + }, + { + "epoch": 18.73669268985096, + "eval_accuracy": 0.9854390538564253, + "eval_loss": 0.055439144372940063, + "eval_runtime": 32.4017, + "eval_samples_per_second": 485.376, + "eval_steps_per_second": 15.184, + "step": 132000 + }, + { + "epoch": 18.738112136266857, + "grad_norm": 0.1243971586227417, + "learning_rate": 8.126983676366218e-05, + "loss": 0.003086409717798233, + "step": 132010 + }, + { + "epoch": 18.739531582682755, + "grad_norm": 0.016958186402916908, + "learning_rate": 8.126841731724628e-05, + "loss": 0.007639577984809876, + "step": 132020 + }, + { + "epoch": 18.740951029098653, + "grad_norm": 7.979750633239746, + "learning_rate": 8.126699787083038e-05, + "loss": 0.02907142639160156, + "step": 132030 + }, + { + "epoch": 18.742370475514548, + "grad_norm": 2.4184060096740723, + "learning_rate": 8.126557842441447e-05, + "loss": 0.011970283091068267, + "step": 132040 + }, + { + "epoch": 18.743789921930446, + "grad_norm": 11.4110107421875, + "learning_rate": 8.126415897799858e-05, + "loss": 0.022178635001182556, + "step": 132050 + }, + { + "epoch": 18.745209368346345, + "grad_norm": 10.188706398010254, + "learning_rate": 8.126273953158268e-05, + "loss": 0.03458372950553894, + "step": 132060 + }, + { + "epoch": 18.746628814762243, + "grad_norm": 0.019085505977272987, + "learning_rate": 8.12613200851668e-05, + "loss": 
0.004047043249011039, + "step": 132070 + }, + { + "epoch": 18.74804826117814, + "grad_norm": 2.709345817565918, + "learning_rate": 8.125990063875089e-05, + "loss": 0.025256985425949098, + "step": 132080 + }, + { + "epoch": 18.74946770759404, + "grad_norm": 14.07314682006836, + "learning_rate": 8.125848119233499e-05, + "loss": 0.02897200584411621, + "step": 132090 + }, + { + "epoch": 18.750887154009938, + "grad_norm": 0.06611883640289307, + "learning_rate": 8.12570617459191e-05, + "loss": 0.03179367482662201, + "step": 132100 + }, + { + "epoch": 18.752306600425833, + "grad_norm": 16.679271697998047, + "learning_rate": 8.12556422995032e-05, + "loss": 0.03063758313655853, + "step": 132110 + }, + { + "epoch": 18.75372604684173, + "grad_norm": 0.21415162086486816, + "learning_rate": 8.125422285308731e-05, + "loss": 0.0198274165391922, + "step": 132120 + }, + { + "epoch": 18.75514549325763, + "grad_norm": 1.131074070930481, + "learning_rate": 8.12528034066714e-05, + "loss": 0.03010278344154358, + "step": 132130 + }, + { + "epoch": 18.756564939673527, + "grad_norm": 0.1621093600988388, + "learning_rate": 8.12513839602555e-05, + "loss": 0.017974340915679933, + "step": 132140 + }, + { + "epoch": 18.757984386089426, + "grad_norm": 0.1724443882703781, + "learning_rate": 8.12499645138396e-05, + "loss": 0.016963428258895873, + "step": 132150 + }, + { + "epoch": 18.759403832505324, + "grad_norm": 0.005614873953163624, + "learning_rate": 8.124854506742371e-05, + "loss": 0.007243013381958008, + "step": 132160 + }, + { + "epoch": 18.760823278921222, + "grad_norm": 0.3981007933616638, + "learning_rate": 8.124712562100781e-05, + "loss": 0.04376653134822846, + "step": 132170 + }, + { + "epoch": 18.762242725337117, + "grad_norm": 5.097568035125732, + "learning_rate": 8.124570617459192e-05, + "loss": 0.010595297068357467, + "step": 132180 + }, + { + "epoch": 18.763662171753015, + "grad_norm": 3.115640163421631, + "learning_rate": 8.124428672817602e-05, + "loss": 0.03532519340515137, + 
"step": 132190 + }, + { + "epoch": 18.765081618168914, + "grad_norm": 0.12442761659622192, + "learning_rate": 8.124286728176011e-05, + "loss": 0.049129563570022586, + "step": 132200 + }, + { + "epoch": 18.766501064584812, + "grad_norm": 10.673492431640625, + "learning_rate": 8.124144783534422e-05, + "loss": 0.06168168783187866, + "step": 132210 + }, + { + "epoch": 18.76792051100071, + "grad_norm": 0.2644001841545105, + "learning_rate": 8.124002838892832e-05, + "loss": 0.007527868449687958, + "step": 132220 + }, + { + "epoch": 18.76933995741661, + "grad_norm": 0.3530026376247406, + "learning_rate": 8.123860894251243e-05, + "loss": 0.00555514357984066, + "step": 132230 + }, + { + "epoch": 18.770759403832507, + "grad_norm": 0.15322211384773254, + "learning_rate": 8.123718949609652e-05, + "loss": 0.043745231628417966, + "step": 132240 + }, + { + "epoch": 18.7721788502484, + "grad_norm": 0.2177436351776123, + "learning_rate": 8.123577004968063e-05, + "loss": 0.038435956835746764, + "step": 132250 + }, + { + "epoch": 18.7735982966643, + "grad_norm": 13.23100757598877, + "learning_rate": 8.123435060326472e-05, + "loss": 0.03151096999645233, + "step": 132260 + }, + { + "epoch": 18.7750177430802, + "grad_norm": 0.11837724596261978, + "learning_rate": 8.123293115684884e-05, + "loss": 0.009069995582103729, + "step": 132270 + }, + { + "epoch": 18.776437189496097, + "grad_norm": 0.14518579840660095, + "learning_rate": 8.123151171043293e-05, + "loss": 0.017826008796691894, + "step": 132280 + }, + { + "epoch": 18.777856635911995, + "grad_norm": 0.11879577487707138, + "learning_rate": 8.123009226401703e-05, + "loss": 0.016228602826595308, + "step": 132290 + }, + { + "epoch": 18.779276082327893, + "grad_norm": 2.983297348022461, + "learning_rate": 8.122867281760114e-05, + "loss": 0.009481226652860641, + "step": 132300 + }, + { + "epoch": 18.78069552874379, + "grad_norm": 1.8368933200836182, + "learning_rate": 8.122725337118524e-05, + "loss": 0.009812879562377929, + "step": 132310 + 
}, + { + "epoch": 18.782114975159686, + "grad_norm": 4.023184776306152, + "learning_rate": 8.122583392476935e-05, + "loss": 0.01623908579349518, + "step": 132320 + }, + { + "epoch": 18.783534421575585, + "grad_norm": 0.06996183842420578, + "learning_rate": 8.122441447835345e-05, + "loss": 0.002210826799273491, + "step": 132330 + }, + { + "epoch": 18.784953867991483, + "grad_norm": 0.025891892611980438, + "learning_rate": 8.122299503193754e-05, + "loss": 0.022765421867370607, + "step": 132340 + }, + { + "epoch": 18.78637331440738, + "grad_norm": 2.33005428314209, + "learning_rate": 8.122157558552164e-05, + "loss": 0.019331203401088716, + "step": 132350 + }, + { + "epoch": 18.78779276082328, + "grad_norm": 0.031298790127038956, + "learning_rate": 8.122015613910575e-05, + "loss": 0.01923527717590332, + "step": 132360 + }, + { + "epoch": 18.789212207239178, + "grad_norm": 0.45524969696998596, + "learning_rate": 8.121873669268985e-05, + "loss": 0.002389763668179512, + "step": 132370 + }, + { + "epoch": 18.790631653655076, + "grad_norm": 0.018605032935738564, + "learning_rate": 8.121731724627396e-05, + "loss": 0.013690409064292908, + "step": 132380 + }, + { + "epoch": 18.79205110007097, + "grad_norm": 0.103347048163414, + "learning_rate": 8.121589779985806e-05, + "loss": 0.04742929637432099, + "step": 132390 + }, + { + "epoch": 18.79347054648687, + "grad_norm": 0.85923832654953, + "learning_rate": 8.121447835344216e-05, + "loss": 0.004268684610724449, + "step": 132400 + }, + { + "epoch": 18.794889992902768, + "grad_norm": 0.4262538254261017, + "learning_rate": 8.121305890702627e-05, + "loss": 0.01819155514240265, + "step": 132410 + }, + { + "epoch": 18.796309439318666, + "grad_norm": 0.03946376219391823, + "learning_rate": 8.121163946061036e-05, + "loss": 0.033000385761260985, + "step": 132420 + }, + { + "epoch": 18.797728885734564, + "grad_norm": 0.0055903540924191475, + "learning_rate": 8.121022001419447e-05, + "loss": 0.01689928025007248, + "step": 132430 + }, + { + 
"epoch": 18.799148332150462, + "grad_norm": 1.8662238121032715, + "learning_rate": 8.120880056777857e-05, + "loss": 0.010890743136405945, + "step": 132440 + }, + { + "epoch": 18.80056777856636, + "grad_norm": 0.051471561193466187, + "learning_rate": 8.120738112136267e-05, + "loss": 0.02852175831794739, + "step": 132450 + }, + { + "epoch": 18.801987224982255, + "grad_norm": 0.11889596283435822, + "learning_rate": 8.120596167494677e-05, + "loss": 0.016614697873592377, + "step": 132460 + }, + { + "epoch": 18.803406671398154, + "grad_norm": 8.242818832397461, + "learning_rate": 8.120454222853088e-05, + "loss": 0.016646860539913176, + "step": 132470 + }, + { + "epoch": 18.804826117814052, + "grad_norm": 0.044861502945423126, + "learning_rate": 8.120312278211498e-05, + "loss": 0.013706690073013306, + "step": 132480 + }, + { + "epoch": 18.80624556422995, + "grad_norm": 0.5428833365440369, + "learning_rate": 8.120170333569909e-05, + "loss": 0.02067845016717911, + "step": 132490 + }, + { + "epoch": 18.80766501064585, + "grad_norm": 3.9887773990631104, + "learning_rate": 8.120028388928318e-05, + "loss": 0.007190878689289093, + "step": 132500 + }, + { + "epoch": 18.80766501064585, + "eval_accuracy": 0.9873466013861512, + "eval_loss": 0.04539346694946289, + "eval_runtime": 32.5262, + "eval_samples_per_second": 483.518, + "eval_steps_per_second": 15.126, + "step": 132500 + }, + { + "epoch": 18.809084457061747, + "grad_norm": 2.4611332416534424, + "learning_rate": 8.119886444286728e-05, + "loss": 0.008373191952705384, + "step": 132510 + }, + { + "epoch": 18.810503903477645, + "grad_norm": 1.2000856399536133, + "learning_rate": 8.119744499645139e-05, + "loss": 0.029330307245254518, + "step": 132520 + }, + { + "epoch": 18.81192334989354, + "grad_norm": 0.08291121572256088, + "learning_rate": 8.119602555003549e-05, + "loss": 0.014740046858787537, + "step": 132530 + }, + { + "epoch": 18.81334279630944, + "grad_norm": 7.6800079345703125, + "learning_rate": 8.11946061036196e-05, + 
"loss": 0.009070151299238206, + "step": 132540 + }, + { + "epoch": 18.814762242725337, + "grad_norm": 2.5658936500549316, + "learning_rate": 8.119318665720368e-05, + "loss": 0.018622465431690216, + "step": 132550 + }, + { + "epoch": 18.816181689141235, + "grad_norm": 0.3958927094936371, + "learning_rate": 8.11917672107878e-05, + "loss": 0.012245003879070283, + "step": 132560 + }, + { + "epoch": 18.817601135557133, + "grad_norm": 0.7030619382858276, + "learning_rate": 8.119034776437189e-05, + "loss": 0.010460810363292694, + "step": 132570 + }, + { + "epoch": 18.81902058197303, + "grad_norm": 0.0067835235968232155, + "learning_rate": 8.1188928317956e-05, + "loss": 0.010131156444549561, + "step": 132580 + }, + { + "epoch": 18.82044002838893, + "grad_norm": 0.11783251166343689, + "learning_rate": 8.118750887154011e-05, + "loss": 0.04217685461044311, + "step": 132590 + }, + { + "epoch": 18.821859474804825, + "grad_norm": 1.0168848037719727, + "learning_rate": 8.11860894251242e-05, + "loss": 0.014414319396018982, + "step": 132600 + }, + { + "epoch": 18.823278921220723, + "grad_norm": 3.850590229034424, + "learning_rate": 8.118466997870831e-05, + "loss": 0.016610829532146452, + "step": 132610 + }, + { + "epoch": 18.82469836763662, + "grad_norm": 4.185357570648193, + "learning_rate": 8.11832505322924e-05, + "loss": 0.005007806792855262, + "step": 132620 + }, + { + "epoch": 18.82611781405252, + "grad_norm": 0.17095859348773956, + "learning_rate": 8.118183108587652e-05, + "loss": 0.01426699459552765, + "step": 132630 + }, + { + "epoch": 18.827537260468418, + "grad_norm": 6.672366619110107, + "learning_rate": 8.118041163946061e-05, + "loss": 0.0266873300075531, + "step": 132640 + }, + { + "epoch": 18.828956706884316, + "grad_norm": 0.004397100303322077, + "learning_rate": 8.117899219304471e-05, + "loss": 0.047002002596855164, + "step": 132650 + }, + { + "epoch": 18.830376153300215, + "grad_norm": 0.038003381341695786, + "learning_rate": 8.117757274662881e-05, + "loss": 
0.022296585142612457, + "step": 132660 + }, + { + "epoch": 18.83179559971611, + "grad_norm": 0.055123794823884964, + "learning_rate": 8.117615330021292e-05, + "loss": 0.03208612203598023, + "step": 132670 + }, + { + "epoch": 18.833215046132008, + "grad_norm": 0.4400286376476288, + "learning_rate": 8.117473385379703e-05, + "loss": 0.006797407567501068, + "step": 132680 + }, + { + "epoch": 18.834634492547906, + "grad_norm": 6.971785068511963, + "learning_rate": 8.117331440738113e-05, + "loss": 0.029937750101089476, + "step": 132690 + }, + { + "epoch": 18.836053938963804, + "grad_norm": 0.39356622099876404, + "learning_rate": 8.117189496096523e-05, + "loss": 0.04213964343070984, + "step": 132700 + }, + { + "epoch": 18.837473385379703, + "grad_norm": 1.2587052583694458, + "learning_rate": 8.117047551454932e-05, + "loss": 0.015325626730918885, + "step": 132710 + }, + { + "epoch": 18.8388928317956, + "grad_norm": 1.8314698934555054, + "learning_rate": 8.116905606813343e-05, + "loss": 0.0008087139576673508, + "step": 132720 + }, + { + "epoch": 18.8403122782115, + "grad_norm": 5.425319194793701, + "learning_rate": 8.116763662171753e-05, + "loss": 0.036557963490486144, + "step": 132730 + }, + { + "epoch": 18.841731724627394, + "grad_norm": 0.8596503138542175, + "learning_rate": 8.116621717530164e-05, + "loss": 0.01629253327846527, + "step": 132740 + }, + { + "epoch": 18.843151171043292, + "grad_norm": 0.3366922438144684, + "learning_rate": 8.116479772888573e-05, + "loss": 0.006476753205060959, + "step": 132750 + }, + { + "epoch": 18.84457061745919, + "grad_norm": 0.43940088152885437, + "learning_rate": 8.116337828246984e-05, + "loss": 0.0200645849108696, + "step": 132760 + }, + { + "epoch": 18.84599006387509, + "grad_norm": 0.3031410276889801, + "learning_rate": 8.116195883605395e-05, + "loss": 0.005779065564274788, + "step": 132770 + }, + { + "epoch": 18.847409510290987, + "grad_norm": 0.7902480363845825, + "learning_rate": 8.116053938963805e-05, + "loss": 
0.04022659361362457, + "step": 132780 + }, + { + "epoch": 18.848828956706885, + "grad_norm": 3.379422903060913, + "learning_rate": 8.115911994322216e-05, + "loss": 0.02182978093624115, + "step": 132790 + }, + { + "epoch": 18.850248403122784, + "grad_norm": 0.19076137244701385, + "learning_rate": 8.115770049680625e-05, + "loss": 0.012058556824922562, + "step": 132800 + }, + { + "epoch": 18.85166784953868, + "grad_norm": 0.011338554322719574, + "learning_rate": 8.115628105039035e-05, + "loss": 0.029507333040237428, + "step": 132810 + }, + { + "epoch": 18.853087295954577, + "grad_norm": 3.2964704036712646, + "learning_rate": 8.115486160397445e-05, + "loss": 0.005234985426068306, + "step": 132820 + }, + { + "epoch": 18.854506742370475, + "grad_norm": 0.04063963517546654, + "learning_rate": 8.115344215755856e-05, + "loss": 0.06484124064445496, + "step": 132830 + }, + { + "epoch": 18.855926188786373, + "grad_norm": 0.14326469600200653, + "learning_rate": 8.115202271114266e-05, + "loss": 0.04315328299999237, + "step": 132840 + }, + { + "epoch": 18.85734563520227, + "grad_norm": 0.8972834348678589, + "learning_rate": 8.115060326472677e-05, + "loss": 0.0273000031709671, + "step": 132850 + }, + { + "epoch": 18.85876508161817, + "grad_norm": 2.150590181350708, + "learning_rate": 8.114918381831087e-05, + "loss": 0.00660116970539093, + "step": 132860 + }, + { + "epoch": 18.86018452803407, + "grad_norm": 0.6882023215293884, + "learning_rate": 8.114776437189496e-05, + "loss": 0.019053636491298674, + "step": 132870 + }, + { + "epoch": 18.861603974449963, + "grad_norm": 0.007473459001630545, + "learning_rate": 8.114634492547907e-05, + "loss": 0.00932258814573288, + "step": 132880 + }, + { + "epoch": 18.86302342086586, + "grad_norm": 0.026022188365459442, + "learning_rate": 8.114492547906317e-05, + "loss": 0.01030050665140152, + "step": 132890 + }, + { + "epoch": 18.86444286728176, + "grad_norm": 1.0625227689743042, + "learning_rate": 8.114350603264728e-05, + "loss": 
0.013700217008590698, + "step": 132900 + }, + { + "epoch": 18.865862313697658, + "grad_norm": 0.2801578938961029, + "learning_rate": 8.114208658623137e-05, + "loss": 0.010946492850780486, + "step": 132910 + }, + { + "epoch": 18.867281760113556, + "grad_norm": 16.305551528930664, + "learning_rate": 8.114066713981548e-05, + "loss": 0.03810953497886658, + "step": 132920 + }, + { + "epoch": 18.868701206529455, + "grad_norm": 3.9440793991088867, + "learning_rate": 8.113924769339957e-05, + "loss": 0.024695229530334473, + "step": 132930 + }, + { + "epoch": 18.870120652945353, + "grad_norm": 3.1893086433410645, + "learning_rate": 8.113782824698369e-05, + "loss": 0.02541588544845581, + "step": 132940 + }, + { + "epoch": 18.871540099361248, + "grad_norm": 1.5765419006347656, + "learning_rate": 8.113640880056778e-05, + "loss": 0.07892866134643554, + "step": 132950 + }, + { + "epoch": 18.872959545777146, + "grad_norm": 18.041797637939453, + "learning_rate": 8.113498935415188e-05, + "loss": 0.06794158816337585, + "step": 132960 + }, + { + "epoch": 18.874378992193044, + "grad_norm": 6.357567310333252, + "learning_rate": 8.113356990773599e-05, + "loss": 0.0168626606464386, + "step": 132970 + }, + { + "epoch": 18.875798438608943, + "grad_norm": 0.0431709848344326, + "learning_rate": 8.113229240596168e-05, + "loss": 0.06689361929893493, + "step": 132980 + }, + { + "epoch": 18.87721788502484, + "grad_norm": 0.09627433121204376, + "learning_rate": 8.113087295954577e-05, + "loss": 0.003917117789387703, + "step": 132990 + }, + { + "epoch": 18.87863733144074, + "grad_norm": 0.45058372616767883, + "learning_rate": 8.112945351312988e-05, + "loss": 0.04650241732597351, + "step": 133000 + }, + { + "epoch": 18.87863733144074, + "eval_accuracy": 0.9823233928912062, + "eval_loss": 0.06960802525281906, + "eval_runtime": 33.9214, + "eval_samples_per_second": 463.63, + "eval_steps_per_second": 14.504, + "step": 133000 + }, + { + "epoch": 18.880056777856637, + "grad_norm": 1.5822498798370361, + 
"learning_rate": 8.112803406671398e-05, + "loss": 0.042826077342033385, + "step": 133010 + }, + { + "epoch": 18.881476224272532, + "grad_norm": 2.025569438934326, + "learning_rate": 8.112661462029809e-05, + "loss": 0.024586796760559082, + "step": 133020 + }, + { + "epoch": 18.88289567068843, + "grad_norm": 0.025201864540576935, + "learning_rate": 8.112519517388219e-05, + "loss": 0.043155121803283694, + "step": 133030 + }, + { + "epoch": 18.88431511710433, + "grad_norm": 5.285928249359131, + "learning_rate": 8.112377572746629e-05, + "loss": 0.04339152872562409, + "step": 133040 + }, + { + "epoch": 18.885734563520227, + "grad_norm": 0.02572530508041382, + "learning_rate": 8.11223562810504e-05, + "loss": 0.044859197735786435, + "step": 133050 + }, + { + "epoch": 18.887154009936125, + "grad_norm": 1.1444945335388184, + "learning_rate": 8.11209368346345e-05, + "loss": 0.019503407180309296, + "step": 133060 + }, + { + "epoch": 18.888573456352024, + "grad_norm": 8.799510955810547, + "learning_rate": 8.11195173882186e-05, + "loss": 0.049301111698150636, + "step": 133070 + }, + { + "epoch": 18.889992902767922, + "grad_norm": 1.0782595872879028, + "learning_rate": 8.111809794180269e-05, + "loss": 0.06624003052711487, + "step": 133080 + }, + { + "epoch": 18.891412349183817, + "grad_norm": 0.3835201859474182, + "learning_rate": 8.11166784953868e-05, + "loss": 0.020512942969799042, + "step": 133090 + }, + { + "epoch": 18.892831795599715, + "grad_norm": 0.2754994034767151, + "learning_rate": 8.11152590489709e-05, + "loss": 0.002916569635272026, + "step": 133100 + }, + { + "epoch": 18.894251242015613, + "grad_norm": 12.092395782470703, + "learning_rate": 8.111383960255501e-05, + "loss": 0.03223346471786499, + "step": 133110 + }, + { + "epoch": 18.89567068843151, + "grad_norm": 0.173954039812088, + "learning_rate": 8.11124201561391e-05, + "loss": 0.040129071474075316, + "step": 133120 + }, + { + "epoch": 18.89709013484741, + "grad_norm": 5.376662731170654, + "learning_rate": 
8.111100070972322e-05, + "loss": 0.03927198350429535, + "step": 133130 + }, + { + "epoch": 18.89850958126331, + "grad_norm": 0.8540018200874329, + "learning_rate": 8.110958126330732e-05, + "loss": 0.04299411475658417, + "step": 133140 + }, + { + "epoch": 18.899929027679207, + "grad_norm": 5.0016584396362305, + "learning_rate": 8.110816181689141e-05, + "loss": 0.0349914163351059, + "step": 133150 + }, + { + "epoch": 18.9013484740951, + "grad_norm": 2.7536025047302246, + "learning_rate": 8.110674237047552e-05, + "loss": 0.004328594729304314, + "step": 133160 + }, + { + "epoch": 18.902767920511, + "grad_norm": 8.211243629455566, + "learning_rate": 8.110532292405962e-05, + "loss": 0.03376585841178894, + "step": 133170 + }, + { + "epoch": 18.904187366926898, + "grad_norm": 0.545396089553833, + "learning_rate": 8.110390347764373e-05, + "loss": 0.038627082109451295, + "step": 133180 + }, + { + "epoch": 18.905606813342796, + "grad_norm": 0.3872591257095337, + "learning_rate": 8.110248403122782e-05, + "loss": 0.06577336192131042, + "step": 133190 + }, + { + "epoch": 18.907026259758695, + "grad_norm": 2.4864211082458496, + "learning_rate": 8.110106458481193e-05, + "loss": 0.03767527341842651, + "step": 133200 + }, + { + "epoch": 18.908445706174593, + "grad_norm": 0.5065162777900696, + "learning_rate": 8.109964513839602e-05, + "loss": 0.012854862213134765, + "step": 133210 + }, + { + "epoch": 18.90986515259049, + "grad_norm": 1.854183554649353, + "learning_rate": 8.109822569198013e-05, + "loss": 0.0450294703245163, + "step": 133220 + }, + { + "epoch": 18.911284599006386, + "grad_norm": 3.7991325855255127, + "learning_rate": 8.109680624556423e-05, + "loss": 0.009919236600399017, + "step": 133230 + }, + { + "epoch": 18.912704045422284, + "grad_norm": 0.4304609000682831, + "learning_rate": 8.109538679914833e-05, + "loss": 0.03659535348415375, + "step": 133240 + }, + { + "epoch": 18.914123491838183, + "grad_norm": 11.989385604858398, + "learning_rate": 8.109396735273244e-05, + 
"loss": 0.01260078400373459, + "step": 133250 + }, + { + "epoch": 18.91554293825408, + "grad_norm": 5.348883628845215, + "learning_rate": 8.109254790631654e-05, + "loss": 0.04840273857116699, + "step": 133260 + }, + { + "epoch": 18.91696238466998, + "grad_norm": 1.8591395616531372, + "learning_rate": 8.109112845990065e-05, + "loss": 0.008823969960212707, + "step": 133270 + }, + { + "epoch": 18.918381831085878, + "grad_norm": 3.6820366382598877, + "learning_rate": 8.108970901348475e-05, + "loss": 0.020488184690475465, + "step": 133280 + }, + { + "epoch": 18.919801277501776, + "grad_norm": 0.0690523087978363, + "learning_rate": 8.108828956706884e-05, + "loss": 0.005634373798966408, + "step": 133290 + }, + { + "epoch": 18.92122072391767, + "grad_norm": 4.969079971313477, + "learning_rate": 8.108687012065294e-05, + "loss": 0.025790277123451232, + "step": 133300 + }, + { + "epoch": 18.92264017033357, + "grad_norm": 0.35796651244163513, + "learning_rate": 8.108545067423705e-05, + "loss": 0.020431703329086302, + "step": 133310 + }, + { + "epoch": 18.924059616749467, + "grad_norm": 2.3078744411468506, + "learning_rate": 8.108403122782115e-05, + "loss": 0.021640455722808837, + "step": 133320 + }, + { + "epoch": 18.925479063165366, + "grad_norm": 0.25514811277389526, + "learning_rate": 8.108261178140526e-05, + "loss": 0.017673870921134947, + "step": 133330 + }, + { + "epoch": 18.926898509581264, + "grad_norm": 0.7440028786659241, + "learning_rate": 8.108119233498936e-05, + "loss": 0.008043202757835387, + "step": 133340 + }, + { + "epoch": 18.928317955997162, + "grad_norm": 4.3911895751953125, + "learning_rate": 8.107977288857345e-05, + "loss": 0.011199419945478439, + "step": 133350 + }, + { + "epoch": 18.92973740241306, + "grad_norm": 1.4787195920944214, + "learning_rate": 8.107835344215757e-05, + "loss": 0.028583604097366332, + "step": 133360 + }, + { + "epoch": 18.931156848828955, + "grad_norm": 2.194730281829834, + "learning_rate": 8.107693399574166e-05, + "loss": 
0.03149364292621613, + "step": 133370 + }, + { + "epoch": 18.932576295244854, + "grad_norm": 1.954034686088562, + "learning_rate": 8.107551454932577e-05, + "loss": 0.012651622295379639, + "step": 133380 + }, + { + "epoch": 18.933995741660752, + "grad_norm": 13.001426696777344, + "learning_rate": 8.107409510290986e-05, + "loss": 0.028492489457130434, + "step": 133390 + }, + { + "epoch": 18.93541518807665, + "grad_norm": 0.1190931424498558, + "learning_rate": 8.107267565649397e-05, + "loss": 0.035154017806053164, + "step": 133400 + }, + { + "epoch": 18.93683463449255, + "grad_norm": 21.984230041503906, + "learning_rate": 8.107125621007807e-05, + "loss": 0.01489710807800293, + "step": 133410 + }, + { + "epoch": 18.938254080908447, + "grad_norm": 0.12106990069150925, + "learning_rate": 8.106983676366218e-05, + "loss": 0.03257212340831757, + "step": 133420 + }, + { + "epoch": 18.939673527324345, + "grad_norm": 0.6076300740242004, + "learning_rate": 8.106841731724629e-05, + "loss": 0.02213844209909439, + "step": 133430 + }, + { + "epoch": 18.94109297374024, + "grad_norm": 0.021633967757225037, + "learning_rate": 8.106699787083037e-05, + "loss": 0.004343704506754875, + "step": 133440 + }, + { + "epoch": 18.942512420156138, + "grad_norm": 4.2764763832092285, + "learning_rate": 8.106557842441448e-05, + "loss": 0.0022585604339838026, + "step": 133450 + }, + { + "epoch": 18.943931866572036, + "grad_norm": 0.11006913334131241, + "learning_rate": 8.106415897799858e-05, + "loss": 0.01216980665922165, + "step": 133460 + }, + { + "epoch": 18.945351312987935, + "grad_norm": 1.3304197788238525, + "learning_rate": 8.106273953158269e-05, + "loss": 0.010916084051132202, + "step": 133470 + }, + { + "epoch": 18.946770759403833, + "grad_norm": 0.7497724294662476, + "learning_rate": 8.106132008516679e-05, + "loss": 0.004376733303070068, + "step": 133480 + }, + { + "epoch": 18.94819020581973, + "grad_norm": 0.24626189470291138, + "learning_rate": 8.10599006387509e-05, + "loss": 
0.005943173170089721, + "step": 133490 + }, + { + "epoch": 18.94960965223563, + "grad_norm": 3.5472681522369385, + "learning_rate": 8.105848119233498e-05, + "loss": 0.02357488125562668, + "step": 133500 + }, + { + "epoch": 18.94960965223563, + "eval_accuracy": 0.9898264131747949, + "eval_loss": 0.037218160927295685, + "eval_runtime": 32.7903, + "eval_samples_per_second": 479.623, + "eval_steps_per_second": 15.004, + "step": 133500 + }, + { + "epoch": 18.951029098651524, + "grad_norm": 0.05089549347758293, + "learning_rate": 8.10570617459191e-05, + "loss": 0.022551646828651427, + "step": 133510 + }, + { + "epoch": 18.952448545067423, + "grad_norm": 0.4436090588569641, + "learning_rate": 8.10556422995032e-05, + "loss": 0.01959022432565689, + "step": 133520 + }, + { + "epoch": 18.95386799148332, + "grad_norm": 1.9277952909469604, + "learning_rate": 8.10542228530873e-05, + "loss": 0.006059730798006058, + "step": 133530 + }, + { + "epoch": 18.95528743789922, + "grad_norm": 0.009567571803927422, + "learning_rate": 8.105280340667141e-05, + "loss": 0.005125709250569344, + "step": 133540 + }, + { + "epoch": 18.956706884315118, + "grad_norm": 0.0779455155134201, + "learning_rate": 8.10513839602555e-05, + "loss": 0.004056332260370254, + "step": 133550 + }, + { + "epoch": 18.958126330731016, + "grad_norm": 0.33066481351852417, + "learning_rate": 8.104996451383961e-05, + "loss": 0.039429694414138794, + "step": 133560 + }, + { + "epoch": 18.959545777146914, + "grad_norm": 0.033027224242687225, + "learning_rate": 8.10485450674237e-05, + "loss": 0.0031930457800626753, + "step": 133570 + }, + { + "epoch": 18.96096522356281, + "grad_norm": 0.028807181864976883, + "learning_rate": 8.104712562100782e-05, + "loss": 0.014510068297386169, + "step": 133580 + }, + { + "epoch": 18.962384669978707, + "grad_norm": 2.8515522480010986, + "learning_rate": 8.104570617459191e-05, + "loss": 0.01636279672384262, + "step": 133590 + }, + { + "epoch": 18.963804116394606, + "grad_norm": 
2.692272901535034, + "learning_rate": 8.104428672817601e-05, + "loss": 0.006249012798070908, + "step": 133600 + }, + { + "epoch": 18.965223562810504, + "grad_norm": 0.1441679447889328, + "learning_rate": 8.104286728176012e-05, + "loss": 0.023763060569763184, + "step": 133610 + }, + { + "epoch": 18.966643009226402, + "grad_norm": 1.413166880607605, + "learning_rate": 8.104144783534422e-05, + "loss": 0.020673489570617674, + "step": 133620 + }, + { + "epoch": 18.9680624556423, + "grad_norm": 0.05413550138473511, + "learning_rate": 8.104002838892833e-05, + "loss": 0.0032124049961566926, + "step": 133630 + }, + { + "epoch": 18.9694819020582, + "grad_norm": 0.9596293568611145, + "learning_rate": 8.103860894251243e-05, + "loss": 0.016143888235092163, + "step": 133640 + }, + { + "epoch": 18.970901348474094, + "grad_norm": 0.23774424195289612, + "learning_rate": 8.103718949609653e-05, + "loss": 0.006650367379188537, + "step": 133650 + }, + { + "epoch": 18.972320794889992, + "grad_norm": 0.2125353366136551, + "learning_rate": 8.103577004968062e-05, + "loss": 0.003049859404563904, + "step": 133660 + }, + { + "epoch": 18.97374024130589, + "grad_norm": 15.742388725280762, + "learning_rate": 8.103435060326473e-05, + "loss": 0.03530228137969971, + "step": 133670 + }, + { + "epoch": 18.97515968772179, + "grad_norm": 2.1937851905822754, + "learning_rate": 8.103293115684883e-05, + "loss": 0.010408701002597808, + "step": 133680 + }, + { + "epoch": 18.976579134137687, + "grad_norm": 3.891181707382202, + "learning_rate": 8.103151171043294e-05, + "loss": 0.010996301472187043, + "step": 133690 + }, + { + "epoch": 18.977998580553585, + "grad_norm": 0.6607540249824524, + "learning_rate": 8.103009226401704e-05, + "loss": 0.005656911060214043, + "step": 133700 + }, + { + "epoch": 18.979418026969483, + "grad_norm": 0.36218443512916565, + "learning_rate": 8.102867281760114e-05, + "loss": 0.0029310058802366258, + "step": 133710 + }, + { + "epoch": 18.980837473385378, + "grad_norm": 
6.956638336181641, + "learning_rate": 8.102725337118525e-05, + "loss": 0.037996691465377805, + "step": 133720 + }, + { + "epoch": 18.982256919801276, + "grad_norm": 6.165159702301025, + "learning_rate": 8.102583392476934e-05, + "loss": 0.025007152557373048, + "step": 133730 + }, + { + "epoch": 18.983676366217175, + "grad_norm": 2.438201904296875, + "learning_rate": 8.102441447835346e-05, + "loss": 0.03142993748188019, + "step": 133740 + }, + { + "epoch": 18.985095812633073, + "grad_norm": 0.006362368352711201, + "learning_rate": 8.102299503193754e-05, + "loss": 0.009314981102943421, + "step": 133750 + }, + { + "epoch": 18.98651525904897, + "grad_norm": 0.33121517300605774, + "learning_rate": 8.102157558552165e-05, + "loss": 0.012771032750606537, + "step": 133760 + }, + { + "epoch": 18.98793470546487, + "grad_norm": 0.0070489030331373215, + "learning_rate": 8.102015613910575e-05, + "loss": 0.001782979816198349, + "step": 133770 + }, + { + "epoch": 18.989354151880768, + "grad_norm": 0.011138354428112507, + "learning_rate": 8.101873669268986e-05, + "loss": 0.015682095289230348, + "step": 133780 + }, + { + "epoch": 18.990773598296663, + "grad_norm": 1.3822154998779297, + "learning_rate": 8.101731724627396e-05, + "loss": 0.017892464995384216, + "step": 133790 + }, + { + "epoch": 18.99219304471256, + "grad_norm": 2.4268062114715576, + "learning_rate": 8.101589779985805e-05, + "loss": 0.006347347795963287, + "step": 133800 + }, + { + "epoch": 18.99361249112846, + "grad_norm": 8.877155303955078, + "learning_rate": 8.101447835344216e-05, + "loss": 0.06186530590057373, + "step": 133810 + }, + { + "epoch": 18.995031937544358, + "grad_norm": 0.3811427652835846, + "learning_rate": 8.101305890702626e-05, + "loss": 0.015398317575454712, + "step": 133820 + }, + { + "epoch": 18.996451383960256, + "grad_norm": 0.15157096087932587, + "learning_rate": 8.101163946061037e-05, + "loss": 0.01020585596561432, + "step": 133830 + }, + { + "epoch": 18.997870830376154, + "grad_norm": 
0.04400681331753731, + "learning_rate": 8.101022001419447e-05, + "loss": 0.0356217622756958, + "step": 133840 + }, + { + "epoch": 18.999290276792053, + "grad_norm": 1.2333203554153442, + "learning_rate": 8.100880056777858e-05, + "loss": 0.007522010058164596, + "step": 133850 + }, + { + "epoch": 19.000709723207947, + "grad_norm": 0.028951354324817657, + "learning_rate": 8.100738112136266e-05, + "loss": 0.016396063566207885, + "step": 133860 + }, + { + "epoch": 19.002129169623846, + "grad_norm": 0.028261512517929077, + "learning_rate": 8.100596167494678e-05, + "loss": 0.015919487178325652, + "step": 133870 + }, + { + "epoch": 19.003548616039744, + "grad_norm": 0.04226310923695564, + "learning_rate": 8.100454222853087e-05, + "loss": 0.02628237009048462, + "step": 133880 + }, + { + "epoch": 19.004968062455642, + "grad_norm": 0.2772606909275055, + "learning_rate": 8.100312278211498e-05, + "loss": 0.044051063060760495, + "step": 133890 + }, + { + "epoch": 19.00638750887154, + "grad_norm": 2.0943820476531982, + "learning_rate": 8.100170333569908e-05, + "loss": 0.003144432231783867, + "step": 133900 + }, + { + "epoch": 19.00780695528744, + "grad_norm": 0.044655878096818924, + "learning_rate": 8.100028388928318e-05, + "loss": 0.011542373895645141, + "step": 133910 + }, + { + "epoch": 19.009226401703337, + "grad_norm": 0.25398340821266174, + "learning_rate": 8.099886444286729e-05, + "loss": 0.018914687633514404, + "step": 133920 + }, + { + "epoch": 19.010645848119232, + "grad_norm": 0.08015237003564835, + "learning_rate": 8.099744499645139e-05, + "loss": 0.012097981572151185, + "step": 133930 + }, + { + "epoch": 19.01206529453513, + "grad_norm": 1.3208882808685303, + "learning_rate": 8.09960255500355e-05, + "loss": 0.030928075313568115, + "step": 133940 + }, + { + "epoch": 19.01348474095103, + "grad_norm": 0.009170650504529476, + "learning_rate": 8.09946061036196e-05, + "loss": 0.0023461733013391494, + "step": 133950 + }, + { + "epoch": 19.014904187366927, + "grad_norm": 
8.084667205810547, + "learning_rate": 8.099318665720369e-05, + "loss": 0.017482061684131623, + "step": 133960 + }, + { + "epoch": 19.016323633782825, + "grad_norm": 16.573938369750977, + "learning_rate": 8.099176721078779e-05, + "loss": 0.02548588514328003, + "step": 133970 + }, + { + "epoch": 19.017743080198724, + "grad_norm": 2.9577677249908447, + "learning_rate": 8.09903477643719e-05, + "loss": 0.02603309750556946, + "step": 133980 + }, + { + "epoch": 19.019162526614622, + "grad_norm": 6.544336795806885, + "learning_rate": 8.0988928317956e-05, + "loss": 0.022039610147476196, + "step": 133990 + }, + { + "epoch": 19.020581973030517, + "grad_norm": 0.12144874036312103, + "learning_rate": 8.098750887154011e-05, + "loss": 0.011551780998706818, + "step": 134000 + }, + { + "epoch": 19.020581973030517, + "eval_accuracy": 0.9851211292681376, + "eval_loss": 0.059864241629838943, + "eval_runtime": 32.8201, + "eval_samples_per_second": 479.189, + "eval_steps_per_second": 14.991, + "step": 134000 + }, + { + "epoch": 19.022001419446415, + "grad_norm": 0.29416728019714355, + "learning_rate": 8.098608942512421e-05, + "loss": 0.061245912313461305, + "step": 134010 + }, + { + "epoch": 19.023420865862313, + "grad_norm": 0.021602200344204903, + "learning_rate": 8.09846699787083e-05, + "loss": 0.0019267242401838302, + "step": 134020 + }, + { + "epoch": 19.02484031227821, + "grad_norm": 0.3599778711795807, + "learning_rate": 8.098325053229242e-05, + "loss": 0.007973751425743103, + "step": 134030 + }, + { + "epoch": 19.02625975869411, + "grad_norm": 7.223097324371338, + "learning_rate": 8.098183108587651e-05, + "loss": 0.01779400110244751, + "step": 134040 + }, + { + "epoch": 19.027679205110008, + "grad_norm": 5.916549205780029, + "learning_rate": 8.098041163946062e-05, + "loss": 0.013569539785385132, + "step": 134050 + }, + { + "epoch": 19.029098651525906, + "grad_norm": 0.7093172669410706, + "learning_rate": 8.097899219304471e-05, + "loss": 0.01011343002319336, + "step": 134060 + }, 
+ { + "epoch": 19.0305180979418, + "grad_norm": 0.5323686599731445, + "learning_rate": 8.097757274662882e-05, + "loss": 0.0035029586404562, + "step": 134070 + }, + { + "epoch": 19.0319375443577, + "grad_norm": 0.4400314390659332, + "learning_rate": 8.097615330021292e-05, + "loss": 0.0046637110412120816, + "step": 134080 + }, + { + "epoch": 19.033356990773598, + "grad_norm": 0.008682881481945515, + "learning_rate": 8.097473385379703e-05, + "loss": 0.0200369730591774, + "step": 134090 + }, + { + "epoch": 19.034776437189496, + "grad_norm": 0.03299645334482193, + "learning_rate": 8.097331440738112e-05, + "loss": 0.019125807285308837, + "step": 134100 + }, + { + "epoch": 19.036195883605394, + "grad_norm": 0.09096463769674301, + "learning_rate": 8.097189496096522e-05, + "loss": 0.0013361256569623947, + "step": 134110 + }, + { + "epoch": 19.037615330021293, + "grad_norm": 6.4878315925598145, + "learning_rate": 8.097047551454933e-05, + "loss": 0.028588488698005676, + "step": 134120 + }, + { + "epoch": 19.03903477643719, + "grad_norm": 0.0908045843243599, + "learning_rate": 8.096905606813343e-05, + "loss": 0.029576560854911803, + "step": 134130 + }, + { + "epoch": 19.040454222853086, + "grad_norm": 1.544521689414978, + "learning_rate": 8.096763662171754e-05, + "loss": 0.011837373673915862, + "step": 134140 + }, + { + "epoch": 19.041873669268984, + "grad_norm": 0.19672341644763947, + "learning_rate": 8.096621717530164e-05, + "loss": 0.006500184535980225, + "step": 134150 + }, + { + "epoch": 19.043293115684882, + "grad_norm": 0.08466586470603943, + "learning_rate": 8.096479772888575e-05, + "loss": 0.02133823335170746, + "step": 134160 + }, + { + "epoch": 19.04471256210078, + "grad_norm": 8.22453784942627, + "learning_rate": 8.096337828246983e-05, + "loss": 0.013680379092693328, + "step": 134170 + }, + { + "epoch": 19.04613200851668, + "grad_norm": 0.022004850208759308, + "learning_rate": 8.096195883605394e-05, + "loss": 0.041504427790641785, + "step": 134180 + }, + { + 
"epoch": 19.047551454932577, + "grad_norm": 1.6565831899642944, + "learning_rate": 8.096053938963804e-05, + "loss": 0.013404084742069245, + "step": 134190 + }, + { + "epoch": 19.048970901348476, + "grad_norm": 9.642035484313965, + "learning_rate": 8.095911994322215e-05, + "loss": 0.042037194967269896, + "step": 134200 + }, + { + "epoch": 19.05039034776437, + "grad_norm": 0.35203757882118225, + "learning_rate": 8.095770049680625e-05, + "loss": 0.031157463788986206, + "step": 134210 + }, + { + "epoch": 19.05180979418027, + "grad_norm": 9.968067169189453, + "learning_rate": 8.095628105039035e-05, + "loss": 0.0336113691329956, + "step": 134220 + }, + { + "epoch": 19.053229240596167, + "grad_norm": 0.3345049023628235, + "learning_rate": 8.095486160397446e-05, + "loss": 0.03366290926933289, + "step": 134230 + }, + { + "epoch": 19.054648687012065, + "grad_norm": 0.02600511722266674, + "learning_rate": 8.095344215755855e-05, + "loss": 0.021112054586410522, + "step": 134240 + }, + { + "epoch": 19.056068133427964, + "grad_norm": 0.06966177374124527, + "learning_rate": 8.095202271114267e-05, + "loss": 0.009837976098060608, + "step": 134250 + }, + { + "epoch": 19.057487579843862, + "grad_norm": 8.749176979064941, + "learning_rate": 8.095060326472676e-05, + "loss": 0.045498573780059816, + "step": 134260 + }, + { + "epoch": 19.05890702625976, + "grad_norm": 14.655566215515137, + "learning_rate": 8.094918381831086e-05, + "loss": 0.025707656145095827, + "step": 134270 + }, + { + "epoch": 19.060326472675655, + "grad_norm": 0.17003805935382843, + "learning_rate": 8.094776437189496e-05, + "loss": 0.012677818536758423, + "step": 134280 + }, + { + "epoch": 19.061745919091553, + "grad_norm": 5.308541774749756, + "learning_rate": 8.094634492547907e-05, + "loss": 0.02551833987236023, + "step": 134290 + }, + { + "epoch": 19.06316536550745, + "grad_norm": 0.01365375891327858, + "learning_rate": 8.094492547906317e-05, + "loss": 0.020414717495441437, + "step": 134300 + }, + { + "epoch": 
19.06458481192335, + "grad_norm": 0.15614201128482819, + "learning_rate": 8.094350603264728e-05, + "loss": 0.023772512376308442, + "step": 134310 + }, + { + "epoch": 19.066004258339248, + "grad_norm": 0.07233654707670212, + "learning_rate": 8.094208658623137e-05, + "loss": 0.04059212505817413, + "step": 134320 + }, + { + "epoch": 19.067423704755146, + "grad_norm": 0.026217155158519745, + "learning_rate": 8.094066713981547e-05, + "loss": 0.0021272551268339155, + "step": 134330 + }, + { + "epoch": 19.068843151171045, + "grad_norm": 0.0481400303542614, + "learning_rate": 8.093924769339958e-05, + "loss": 0.05546886920928955, + "step": 134340 + }, + { + "epoch": 19.07026259758694, + "grad_norm": 19.32974624633789, + "learning_rate": 8.093782824698368e-05, + "loss": 0.03558627963066101, + "step": 134350 + }, + { + "epoch": 19.071682044002838, + "grad_norm": 0.06256501376628876, + "learning_rate": 8.093640880056779e-05, + "loss": 0.009789368510246277, + "step": 134360 + }, + { + "epoch": 19.073101490418736, + "grad_norm": 1.345982551574707, + "learning_rate": 8.093498935415187e-05, + "loss": 0.034044647216796876, + "step": 134370 + }, + { + "epoch": 19.074520936834634, + "grad_norm": 0.13332176208496094, + "learning_rate": 8.093356990773599e-05, + "loss": 0.002076190337538719, + "step": 134380 + }, + { + "epoch": 19.075940383250533, + "grad_norm": 9.257709503173828, + "learning_rate": 8.093215046132008e-05, + "loss": 0.037376236915588376, + "step": 134390 + }, + { + "epoch": 19.07735982966643, + "grad_norm": 0.0649016723036766, + "learning_rate": 8.09307310149042e-05, + "loss": 0.016293227672576904, + "step": 134400 + }, + { + "epoch": 19.07877927608233, + "grad_norm": 0.0038299565203487873, + "learning_rate": 8.092931156848829e-05, + "loss": 0.025033700466156005, + "step": 134410 + }, + { + "epoch": 19.080198722498224, + "grad_norm": 0.9183870553970337, + "learning_rate": 8.092789212207239e-05, + "loss": 0.010904945433139801, + "step": 134420 + }, + { + "epoch": 
19.081618168914122, + "grad_norm": 0.7665177583694458, + "learning_rate": 8.09264726756565e-05, + "loss": 0.007632662355899811, + "step": 134430 + }, + { + "epoch": 19.08303761533002, + "grad_norm": 0.15135960280895233, + "learning_rate": 8.09250532292406e-05, + "loss": 0.008608837425708771, + "step": 134440 + }, + { + "epoch": 19.08445706174592, + "grad_norm": 0.017531752586364746, + "learning_rate": 8.092363378282471e-05, + "loss": 0.07331695556640624, + "step": 134450 + }, + { + "epoch": 19.085876508161817, + "grad_norm": 0.24432286620140076, + "learning_rate": 8.09222143364088e-05, + "loss": 0.011470304429531097, + "step": 134460 + }, + { + "epoch": 19.087295954577716, + "grad_norm": 0.009214629419147968, + "learning_rate": 8.09207948899929e-05, + "loss": 0.0062458343803882595, + "step": 134470 + }, + { + "epoch": 19.088715400993614, + "grad_norm": 0.03448065370321274, + "learning_rate": 8.0919375443577e-05, + "loss": 0.0029633276164531706, + "step": 134480 + }, + { + "epoch": 19.09013484740951, + "grad_norm": 0.473985880613327, + "learning_rate": 8.091795599716111e-05, + "loss": 0.011840125918388367, + "step": 134490 + }, + { + "epoch": 19.091554293825407, + "grad_norm": 0.008454610593616962, + "learning_rate": 8.091653655074521e-05, + "loss": 0.01086244136095047, + "step": 134500 + }, + { + "epoch": 19.091554293825407, + "eval_accuracy": 0.9876009410567813, + "eval_loss": 0.04235000163316727, + "eval_runtime": 32.6911, + "eval_samples_per_second": 481.079, + "eval_steps_per_second": 15.05, + "step": 134500 + }, + { + "epoch": 19.092973740241305, + "grad_norm": 11.254133224487305, + "learning_rate": 8.091511710432932e-05, + "loss": 0.009967145323753358, + "step": 134510 + }, + { + "epoch": 19.094393186657204, + "grad_norm": 0.08455899357795715, + "learning_rate": 8.091369765791342e-05, + "loss": 0.01689300835132599, + "step": 134520 + }, + { + "epoch": 19.095812633073102, + "grad_norm": 0.8373426795005798, + "learning_rate": 8.091227821149751e-05, + "loss": 
0.006131380051374436, + "step": 134530 + }, + { + "epoch": 19.097232079489, + "grad_norm": 0.337380588054657, + "learning_rate": 8.091085876508163e-05, + "loss": 0.006717808544635773, + "step": 134540 + }, + { + "epoch": 19.0986515259049, + "grad_norm": 1.9571492671966553, + "learning_rate": 8.090943931866572e-05, + "loss": 0.00572134368121624, + "step": 134550 + }, + { + "epoch": 19.100070972320793, + "grad_norm": 0.024347834289073944, + "learning_rate": 8.090801987224983e-05, + "loss": 0.0018487922847270966, + "step": 134560 + }, + { + "epoch": 19.10149041873669, + "grad_norm": 0.6312090754508972, + "learning_rate": 8.090660042583393e-05, + "loss": 0.006242816522717476, + "step": 134570 + }, + { + "epoch": 19.10290986515259, + "grad_norm": 1.1977765560150146, + "learning_rate": 8.090518097941803e-05, + "loss": 0.018849599361419677, + "step": 134580 + }, + { + "epoch": 19.10432931156849, + "grad_norm": 0.005179632920771837, + "learning_rate": 8.090376153300213e-05, + "loss": 0.0015044763684272766, + "step": 134590 + }, + { + "epoch": 19.105748757984387, + "grad_norm": 1.7063950300216675, + "learning_rate": 8.090234208658624e-05, + "loss": 0.01455221027135849, + "step": 134600 + }, + { + "epoch": 19.107168204400285, + "grad_norm": 14.58033275604248, + "learning_rate": 8.090092264017033e-05, + "loss": 0.03353846073150635, + "step": 134610 + }, + { + "epoch": 19.108587650816183, + "grad_norm": 5.8170647621154785, + "learning_rate": 8.089950319375444e-05, + "loss": 0.05731701850891113, + "step": 134620 + }, + { + "epoch": 19.110007097232078, + "grad_norm": 3.257978677749634, + "learning_rate": 8.089808374733854e-05, + "loss": 0.018321070075035095, + "step": 134630 + }, + { + "epoch": 19.111426543647976, + "grad_norm": 4.303315162658691, + "learning_rate": 8.089666430092264e-05, + "loss": 0.009317952394485473, + "step": 134640 + }, + { + "epoch": 19.112845990063875, + "grad_norm": 11.137117385864258, + "learning_rate": 8.089524485450675e-05, + "loss": 
0.043749278783798216, + "step": 134650 + }, + { + "epoch": 19.114265436479773, + "grad_norm": 0.8089771270751953, + "learning_rate": 8.089382540809085e-05, + "loss": 0.03437838852405548, + "step": 134660 + }, + { + "epoch": 19.11568488289567, + "grad_norm": 0.16115444898605347, + "learning_rate": 8.089240596167496e-05, + "loss": 0.002757306769490242, + "step": 134670 + }, + { + "epoch": 19.11710432931157, + "grad_norm": 0.10439011454582214, + "learning_rate": 8.089098651525904e-05, + "loss": 0.024527326226234436, + "step": 134680 + }, + { + "epoch": 19.118523775727468, + "grad_norm": 0.03504888340830803, + "learning_rate": 8.088956706884315e-05, + "loss": 0.013709086179733276, + "step": 134690 + }, + { + "epoch": 19.119943222143363, + "grad_norm": 0.25188112258911133, + "learning_rate": 8.088814762242725e-05, + "loss": 0.023006241023540496, + "step": 134700 + }, + { + "epoch": 19.12136266855926, + "grad_norm": 0.06671018153429031, + "learning_rate": 8.088672817601136e-05, + "loss": 0.024523438513278963, + "step": 134710 + }, + { + "epoch": 19.12278211497516, + "grad_norm": 2.077873468399048, + "learning_rate": 8.088530872959546e-05, + "loss": 0.007115639746189117, + "step": 134720 + }, + { + "epoch": 19.124201561391057, + "grad_norm": 0.1570131927728653, + "learning_rate": 8.088388928317956e-05, + "loss": 0.00751805305480957, + "step": 134730 + }, + { + "epoch": 19.125621007806956, + "grad_norm": 0.027703123167157173, + "learning_rate": 8.088246983676367e-05, + "loss": 0.0043018519878387455, + "step": 134740 + }, + { + "epoch": 19.127040454222854, + "grad_norm": 14.652771949768066, + "learning_rate": 8.088105039034776e-05, + "loss": 0.020449712872505188, + "step": 134750 + }, + { + "epoch": 19.128459900638752, + "grad_norm": 7.697845458984375, + "learning_rate": 8.087963094393188e-05, + "loss": 0.01461091786623001, + "step": 134760 + }, + { + "epoch": 19.129879347054647, + "grad_norm": 0.2831219732761383, + "learning_rate": 8.087821149751597e-05, + "loss": 
0.0011369768530130387, + "step": 134770 + }, + { + "epoch": 19.131298793470545, + "grad_norm": 0.17854426801204681, + "learning_rate": 8.087679205110007e-05, + "loss": 0.026823663711547853, + "step": 134780 + }, + { + "epoch": 19.132718239886444, + "grad_norm": 11.814420700073242, + "learning_rate": 8.087537260468417e-05, + "loss": 0.019499766826629638, + "step": 134790 + }, + { + "epoch": 19.134137686302342, + "grad_norm": 0.003499184036627412, + "learning_rate": 8.087395315826828e-05, + "loss": 0.010670372098684312, + "step": 134800 + }, + { + "epoch": 19.13555713271824, + "grad_norm": 2.334817886352539, + "learning_rate": 8.087253371185238e-05, + "loss": 0.020394699275493623, + "step": 134810 + }, + { + "epoch": 19.13697657913414, + "grad_norm": 17.925743103027344, + "learning_rate": 8.087111426543649e-05, + "loss": 0.027310144901275635, + "step": 134820 + }, + { + "epoch": 19.138396025550037, + "grad_norm": 0.029750006273388863, + "learning_rate": 8.086969481902058e-05, + "loss": 0.03104974627494812, + "step": 134830 + }, + { + "epoch": 19.13981547196593, + "grad_norm": 0.0954161286354065, + "learning_rate": 8.086827537260468e-05, + "loss": 0.041319483518600465, + "step": 134840 + }, + { + "epoch": 19.14123491838183, + "grad_norm": 0.026760157197713852, + "learning_rate": 8.086685592618879e-05, + "loss": 0.003623197972774506, + "step": 134850 + }, + { + "epoch": 19.14265436479773, + "grad_norm": 11.640395164489746, + "learning_rate": 8.086543647977289e-05, + "loss": 0.016904018819332123, + "step": 134860 + }, + { + "epoch": 19.144073811213627, + "grad_norm": 0.035630472004413605, + "learning_rate": 8.0864017033357e-05, + "loss": 0.007674677670001984, + "step": 134870 + }, + { + "epoch": 19.145493257629525, + "grad_norm": 10.433150291442871, + "learning_rate": 8.08625975869411e-05, + "loss": 0.02237817645072937, + "step": 134880 + }, + { + "epoch": 19.146912704045423, + "grad_norm": 0.3572457432746887, + "learning_rate": 8.08611781405252e-05, + "loss": 
0.06344624161720276, + "step": 134890 + }, + { + "epoch": 19.14833215046132, + "grad_norm": 1.9099338054656982, + "learning_rate": 8.08597586941093e-05, + "loss": 0.02476053237915039, + "step": 134900 + }, + { + "epoch": 19.149751596877216, + "grad_norm": 4.4671311378479, + "learning_rate": 8.08583392476934e-05, + "loss": 0.024169033765792845, + "step": 134910 + }, + { + "epoch": 19.151171043293115, + "grad_norm": 1.4765146970748901, + "learning_rate": 8.085691980127752e-05, + "loss": 0.02690877616405487, + "step": 134920 + }, + { + "epoch": 19.152590489709013, + "grad_norm": 0.014275099150836468, + "learning_rate": 8.085550035486161e-05, + "loss": 0.008906839787960053, + "step": 134930 + }, + { + "epoch": 19.15400993612491, + "grad_norm": 0.4602917730808258, + "learning_rate": 8.085408090844571e-05, + "loss": 0.03422538936138153, + "step": 134940 + }, + { + "epoch": 19.15542938254081, + "grad_norm": 0.015164846554398537, + "learning_rate": 8.085266146202981e-05, + "loss": 0.04223176836967468, + "step": 134950 + }, + { + "epoch": 19.156848828956708, + "grad_norm": 6.864485740661621, + "learning_rate": 8.085124201561392e-05, + "loss": 0.052771955728530884, + "step": 134960 + }, + { + "epoch": 19.158268275372606, + "grad_norm": 0.4479176998138428, + "learning_rate": 8.084982256919802e-05, + "loss": 0.005521007999777794, + "step": 134970 + }, + { + "epoch": 19.1596877217885, + "grad_norm": 0.5584779977798462, + "learning_rate": 8.084840312278213e-05, + "loss": 0.024714210629463197, + "step": 134980 + }, + { + "epoch": 19.1611071682044, + "grad_norm": 3.680006742477417, + "learning_rate": 8.084698367636621e-05, + "loss": 0.015227210521697999, + "step": 134990 + }, + { + "epoch": 19.162526614620297, + "grad_norm": 0.018869522958993912, + "learning_rate": 8.084556422995032e-05, + "loss": 0.001959379017353058, + "step": 135000 + }, + { + "epoch": 19.162526614620297, + "eval_accuracy": 0.9850575443504801, + "eval_loss": 0.05439043045043945, + "eval_runtime": 32.4017, + 
"eval_samples_per_second": 485.375, + "eval_steps_per_second": 15.184, + "step": 135000 + }, + { + "epoch": 19.163946061036196, + "grad_norm": 0.021521301940083504, + "learning_rate": 8.084414478353443e-05, + "loss": 0.024100886285305025, + "step": 135010 + }, + { + "epoch": 19.165365507452094, + "grad_norm": 0.20940202474594116, + "learning_rate": 8.084272533711853e-05, + "loss": 0.05533415079116821, + "step": 135020 + }, + { + "epoch": 19.166784953867992, + "grad_norm": 0.4249863624572754, + "learning_rate": 8.084130589070264e-05, + "loss": 0.008059429377317429, + "step": 135030 + }, + { + "epoch": 19.16820440028389, + "grad_norm": 0.6474401354789734, + "learning_rate": 8.083988644428672e-05, + "loss": 0.014611485600471496, + "step": 135040 + }, + { + "epoch": 19.169623846699785, + "grad_norm": 0.08407270163297653, + "learning_rate": 8.083846699787084e-05, + "loss": 0.028042757511138917, + "step": 135050 + }, + { + "epoch": 19.171043293115684, + "grad_norm": 0.09232445806264877, + "learning_rate": 8.083704755145493e-05, + "loss": 0.025106951594352722, + "step": 135060 + }, + { + "epoch": 19.172462739531582, + "grad_norm": 6.32170295715332, + "learning_rate": 8.083562810503904e-05, + "loss": 0.013578245043754577, + "step": 135070 + }, + { + "epoch": 19.17388218594748, + "grad_norm": 0.01305151917040348, + "learning_rate": 8.083420865862314e-05, + "loss": 0.01248614490032196, + "step": 135080 + }, + { + "epoch": 19.17530163236338, + "grad_norm": 0.12669546902179718, + "learning_rate": 8.083278921220724e-05, + "loss": 0.011508259177207946, + "step": 135090 + }, + { + "epoch": 19.176721078779277, + "grad_norm": 2.873619794845581, + "learning_rate": 8.083136976579135e-05, + "loss": 0.03385049104690552, + "step": 135100 + }, + { + "epoch": 19.178140525195175, + "grad_norm": 8.848909378051758, + "learning_rate": 8.082995031937545e-05, + "loss": 0.03305015861988068, + "step": 135110 + }, + { + "epoch": 19.17955997161107, + "grad_norm": 5.337402820587158, + 
"learning_rate": 8.082853087295956e-05, + "loss": 0.011400558054447174, + "step": 135120 + }, + { + "epoch": 19.18097941802697, + "grad_norm": 1.4635014533996582, + "learning_rate": 8.082711142654365e-05, + "loss": 0.006388545036315918, + "step": 135130 + }, + { + "epoch": 19.182398864442867, + "grad_norm": 6.023395538330078, + "learning_rate": 8.082569198012775e-05, + "loss": 0.00347101092338562, + "step": 135140 + }, + { + "epoch": 19.183818310858765, + "grad_norm": 0.009968786500394344, + "learning_rate": 8.082427253371185e-05, + "loss": 0.06805426478385926, + "step": 135150 + }, + { + "epoch": 19.185237757274663, + "grad_norm": 0.17491233348846436, + "learning_rate": 8.082285308729596e-05, + "loss": 0.005591537803411484, + "step": 135160 + }, + { + "epoch": 19.18665720369056, + "grad_norm": 2.5479822158813477, + "learning_rate": 8.082143364088006e-05, + "loss": 0.013746441900730133, + "step": 135170 + }, + { + "epoch": 19.18807665010646, + "grad_norm": 1.8305355310440063, + "learning_rate": 8.082001419446417e-05, + "loss": 0.0023511968553066253, + "step": 135180 + }, + { + "epoch": 19.189496096522355, + "grad_norm": 1.3410097360610962, + "learning_rate": 8.081859474804827e-05, + "loss": 0.0441491037607193, + "step": 135190 + }, + { + "epoch": 19.190915542938253, + "grad_norm": 0.06187102571129799, + "learning_rate": 8.081717530163236e-05, + "loss": 0.019126801192760466, + "step": 135200 + }, + { + "epoch": 19.19233498935415, + "grad_norm": 0.324034720659256, + "learning_rate": 8.081575585521647e-05, + "loss": 0.02774391770362854, + "step": 135210 + }, + { + "epoch": 19.19375443577005, + "grad_norm": 0.09795793890953064, + "learning_rate": 8.081433640880057e-05, + "loss": 0.001038951799273491, + "step": 135220 + }, + { + "epoch": 19.195173882185948, + "grad_norm": 0.6911133527755737, + "learning_rate": 8.081291696238468e-05, + "loss": 0.020565421879291536, + "step": 135230 + }, + { + "epoch": 19.196593328601846, + "grad_norm": 0.024742472916841507, + 
"learning_rate": 8.081149751596878e-05, + "loss": 0.000863572210073471, + "step": 135240 + }, + { + "epoch": 19.198012775017745, + "grad_norm": 0.07502426952123642, + "learning_rate": 8.081007806955288e-05, + "loss": 0.01390819251537323, + "step": 135250 + }, + { + "epoch": 19.19943222143364, + "grad_norm": 0.006235470529645681, + "learning_rate": 8.080865862313698e-05, + "loss": 0.019389943778514863, + "step": 135260 + }, + { + "epoch": 19.200851667849538, + "grad_norm": 10.355511665344238, + "learning_rate": 8.080723917672109e-05, + "loss": 0.01820642948150635, + "step": 135270 + }, + { + "epoch": 19.202271114265436, + "grad_norm": 0.2703514099121094, + "learning_rate": 8.080581973030518e-05, + "loss": 0.01304187923669815, + "step": 135280 + }, + { + "epoch": 19.203690560681334, + "grad_norm": 0.03895045816898346, + "learning_rate": 8.08044002838893e-05, + "loss": 0.023221737146377562, + "step": 135290 + }, + { + "epoch": 19.205110007097232, + "grad_norm": 0.14150403439998627, + "learning_rate": 8.080298083747339e-05, + "loss": 0.00740317702293396, + "step": 135300 + }, + { + "epoch": 19.20652945351313, + "grad_norm": 0.2885378301143646, + "learning_rate": 8.080156139105749e-05, + "loss": 0.050024384260177614, + "step": 135310 + }, + { + "epoch": 19.20794889992903, + "grad_norm": 1.2044950723648071, + "learning_rate": 8.08001419446416e-05, + "loss": 0.006965817511081695, + "step": 135320 + }, + { + "epoch": 19.209368346344924, + "grad_norm": 17.22332000732422, + "learning_rate": 8.07987224982257e-05, + "loss": 0.032378381490707396, + "step": 135330 + }, + { + "epoch": 19.210787792760822, + "grad_norm": 0.031519077718257904, + "learning_rate": 8.079730305180981e-05, + "loss": 0.008376619219779969, + "step": 135340 + }, + { + "epoch": 19.21220723917672, + "grad_norm": 0.13358646631240845, + "learning_rate": 8.079588360539389e-05, + "loss": 0.0036485549062490463, + "step": 135350 + }, + { + "epoch": 19.21362668559262, + "grad_norm": 0.03483152762055397, + 
"learning_rate": 8.0794464158978e-05, + "loss": 0.0036309100687503815, + "step": 135360 + }, + { + "epoch": 19.215046132008517, + "grad_norm": 7.076496601104736, + "learning_rate": 8.07930447125621e-05, + "loss": 0.03006422519683838, + "step": 135370 + }, + { + "epoch": 19.216465578424415, + "grad_norm": 2.300114154815674, + "learning_rate": 8.079162526614621e-05, + "loss": 0.0059611741453409195, + "step": 135380 + }, + { + "epoch": 19.217885024840314, + "grad_norm": 1.368050217628479, + "learning_rate": 8.079020581973031e-05, + "loss": 0.02752087116241455, + "step": 135390 + }, + { + "epoch": 19.21930447125621, + "grad_norm": 0.16258902847766876, + "learning_rate": 8.07887863733144e-05, + "loss": 0.003470803052186966, + "step": 135400 + }, + { + "epoch": 19.220723917672107, + "grad_norm": 0.3476499021053314, + "learning_rate": 8.078736692689852e-05, + "loss": 0.020617493987083436, + "step": 135410 + }, + { + "epoch": 19.222143364088005, + "grad_norm": 0.781749427318573, + "learning_rate": 8.078594748048261e-05, + "loss": 0.0015029162168502808, + "step": 135420 + }, + { + "epoch": 19.223562810503903, + "grad_norm": 1.2078951597213745, + "learning_rate": 8.078452803406673e-05, + "loss": 0.01431012749671936, + "step": 135430 + }, + { + "epoch": 19.2249822569198, + "grad_norm": 0.09040942788124084, + "learning_rate": 8.078310858765082e-05, + "loss": 0.0208782359957695, + "step": 135440 + }, + { + "epoch": 19.2264017033357, + "grad_norm": 0.3921969532966614, + "learning_rate": 8.078168914123492e-05, + "loss": 0.020682938396930695, + "step": 135450 + }, + { + "epoch": 19.2278211497516, + "grad_norm": 0.14331085979938507, + "learning_rate": 8.078026969481902e-05, + "loss": 0.012914702296257019, + "step": 135460 + }, + { + "epoch": 19.229240596167493, + "grad_norm": 1.7515239715576172, + "learning_rate": 8.077885024840313e-05, + "loss": 0.007830117642879487, + "step": 135470 + }, + { + "epoch": 19.23066004258339, + "grad_norm": 2.976736068725586, + "learning_rate": 
8.077743080198723e-05, + "loss": 0.0039331987500190735, + "step": 135480 + }, + { + "epoch": 19.23207948899929, + "grad_norm": 0.058782994747161865, + "learning_rate": 8.077601135557134e-05, + "loss": 0.004452567175030708, + "step": 135490 + }, + { + "epoch": 19.233498935415188, + "grad_norm": 0.6067075729370117, + "learning_rate": 8.077459190915543e-05, + "loss": 0.01762688010931015, + "step": 135500 + }, + { + "epoch": 19.233498935415188, + "eval_accuracy": 0.9860749030330006, + "eval_loss": 0.053748831152915955, + "eval_runtime": 33.162, + "eval_samples_per_second": 474.248, + "eval_steps_per_second": 14.836, + "step": 135500 + }, + { + "epoch": 19.234918381831086, + "grad_norm": 0.1507439762353897, + "learning_rate": 8.077317246273953e-05, + "loss": 0.030294719338417053, + "step": 135510 + }, + { + "epoch": 19.236337828246985, + "grad_norm": 9.084303855895996, + "learning_rate": 8.077175301632364e-05, + "loss": 0.011562639474868774, + "step": 135520 + }, + { + "epoch": 19.237757274662883, + "grad_norm": 2.929205894470215, + "learning_rate": 8.077033356990774e-05, + "loss": 0.06092924475669861, + "step": 135530 + }, + { + "epoch": 19.239176721078778, + "grad_norm": 10.090849876403809, + "learning_rate": 8.076891412349185e-05, + "loss": 0.02417505532503128, + "step": 135540 + }, + { + "epoch": 19.240596167494676, + "grad_norm": 0.04496016725897789, + "learning_rate": 8.076749467707593e-05, + "loss": 0.013739706575870514, + "step": 135550 + }, + { + "epoch": 19.242015613910574, + "grad_norm": 2.531179666519165, + "learning_rate": 8.076607523066005e-05, + "loss": 0.023041173815727234, + "step": 135560 + }, + { + "epoch": 19.243435060326473, + "grad_norm": 0.288542240858078, + "learning_rate": 8.076465578424414e-05, + "loss": 0.025524574518203735, + "step": 135570 + }, + { + "epoch": 19.24485450674237, + "grad_norm": 0.27630212903022766, + "learning_rate": 8.076323633782825e-05, + "loss": 0.035649356245994565, + "step": 135580 + }, + { + "epoch": 19.24627395315827, 
+ "grad_norm": 0.21555322408676147, + "learning_rate": 8.076181689141235e-05, + "loss": 0.02202434986829758, + "step": 135590 + }, + { + "epoch": 19.247693399574167, + "grad_norm": 0.428025484085083, + "learning_rate": 8.076039744499646e-05, + "loss": 0.008564649522304535, + "step": 135600 + }, + { + "epoch": 19.249112845990062, + "grad_norm": 1.058066725730896, + "learning_rate": 8.075897799858056e-05, + "loss": 0.02624986469745636, + "step": 135610 + }, + { + "epoch": 19.25053229240596, + "grad_norm": 0.0745265781879425, + "learning_rate": 8.075755855216466e-05, + "loss": 0.023323173820972442, + "step": 135620 + }, + { + "epoch": 19.25195173882186, + "grad_norm": 10.500258445739746, + "learning_rate": 8.075613910574877e-05, + "loss": 0.02508750557899475, + "step": 135630 + }, + { + "epoch": 19.253371185237757, + "grad_norm": 0.005136394407600164, + "learning_rate": 8.075471965933287e-05, + "loss": 0.00155564583837986, + "step": 135640 + }, + { + "epoch": 19.254790631653655, + "grad_norm": 0.07520145177841187, + "learning_rate": 8.075330021291698e-05, + "loss": 0.01268192082643509, + "step": 135650 + }, + { + "epoch": 19.256210078069554, + "grad_norm": 0.08487255871295929, + "learning_rate": 8.075188076650106e-05, + "loss": 0.002196522429585457, + "step": 135660 + }, + { + "epoch": 19.257629524485452, + "grad_norm": 1.8649828433990479, + "learning_rate": 8.075046132008517e-05, + "loss": 0.0182554692029953, + "step": 135670 + }, + { + "epoch": 19.259048970901347, + "grad_norm": 0.21836072206497192, + "learning_rate": 8.074904187366927e-05, + "loss": 0.008518166840076447, + "step": 135680 + }, + { + "epoch": 19.260468417317245, + "grad_norm": 0.07416136562824249, + "learning_rate": 8.074762242725338e-05, + "loss": 0.020471793413162232, + "step": 135690 + }, + { + "epoch": 19.261887863733143, + "grad_norm": 0.04966128617525101, + "learning_rate": 8.074620298083748e-05, + "loss": 0.008542297780513764, + "step": 135700 + }, + { + "epoch": 19.26330731014904, + 
"grad_norm": 4.001149654388428, + "learning_rate": 8.074478353442157e-05, + "loss": 0.07088310718536377, + "step": 135710 + }, + { + "epoch": 19.26472675656494, + "grad_norm": 17.749656677246094, + "learning_rate": 8.074336408800568e-05, + "loss": 0.027709412574768066, + "step": 135720 + }, + { + "epoch": 19.26614620298084, + "grad_norm": 2.207138776779175, + "learning_rate": 8.074194464158978e-05, + "loss": 0.032978209853172305, + "step": 135730 + }, + { + "epoch": 19.267565649396737, + "grad_norm": 0.8686169981956482, + "learning_rate": 8.074052519517389e-05, + "loss": 0.02525452971458435, + "step": 135740 + }, + { + "epoch": 19.26898509581263, + "grad_norm": 0.48470228910446167, + "learning_rate": 8.073910574875799e-05, + "loss": 0.04213707149028778, + "step": 135750 + }, + { + "epoch": 19.27040454222853, + "grad_norm": 5.371331691741943, + "learning_rate": 8.073768630234209e-05, + "loss": 0.04031191170215607, + "step": 135760 + }, + { + "epoch": 19.271823988644428, + "grad_norm": 3.2471160888671875, + "learning_rate": 8.073626685592619e-05, + "loss": 0.012299670279026032, + "step": 135770 + }, + { + "epoch": 19.273243435060326, + "grad_norm": 0.3554064631462097, + "learning_rate": 8.07348474095103e-05, + "loss": 0.03185656070709229, + "step": 135780 + }, + { + "epoch": 19.274662881476225, + "grad_norm": 0.8942446708679199, + "learning_rate": 8.07334279630944e-05, + "loss": 0.020659103989601135, + "step": 135790 + }, + { + "epoch": 19.276082327892123, + "grad_norm": 0.14760670065879822, + "learning_rate": 8.07320085166785e-05, + "loss": 0.02072883993387222, + "step": 135800 + }, + { + "epoch": 19.27750177430802, + "grad_norm": 0.3239462971687317, + "learning_rate": 8.07305890702626e-05, + "loss": 0.002651532366871834, + "step": 135810 + }, + { + "epoch": 19.278921220723916, + "grad_norm": 8.159717559814453, + "learning_rate": 8.07291696238467e-05, + "loss": 0.014492425322532653, + "step": 135820 + }, + { + "epoch": 19.280340667139814, + "grad_norm": 
0.27178525924682617, + "learning_rate": 8.072775017743081e-05, + "loss": 0.018134912848472594, + "step": 135830 + }, + { + "epoch": 19.281760113555713, + "grad_norm": 0.029552889987826347, + "learning_rate": 8.072633073101491e-05, + "loss": 0.00932794064283371, + "step": 135840 + }, + { + "epoch": 19.28317955997161, + "grad_norm": 3.9056999683380127, + "learning_rate": 8.072491128459902e-05, + "loss": 0.007778584957122803, + "step": 135850 + }, + { + "epoch": 19.28459900638751, + "grad_norm": 2.5379421710968018, + "learning_rate": 8.07234918381831e-05, + "loss": 0.01643350124359131, + "step": 135860 + }, + { + "epoch": 19.286018452803408, + "grad_norm": 2.7979471683502197, + "learning_rate": 8.072207239176721e-05, + "loss": 0.022671455144882204, + "step": 135870 + }, + { + "epoch": 19.287437899219306, + "grad_norm": 4.062838554382324, + "learning_rate": 8.072065294535131e-05, + "loss": 0.036980432271957395, + "step": 135880 + }, + { + "epoch": 19.2888573456352, + "grad_norm": 0.6683847308158875, + "learning_rate": 8.071923349893542e-05, + "loss": 0.004612940177321434, + "step": 135890 + }, + { + "epoch": 19.2902767920511, + "grad_norm": 0.08301468193531036, + "learning_rate": 8.071781405251952e-05, + "loss": 0.03441115915775299, + "step": 135900 + }, + { + "epoch": 19.291696238466997, + "grad_norm": 1.0100123882293701, + "learning_rate": 8.071639460610362e-05, + "loss": 0.004451007023453713, + "step": 135910 + }, + { + "epoch": 19.293115684882896, + "grad_norm": 0.06388121843338013, + "learning_rate": 8.071497515968773e-05, + "loss": 0.003654952719807625, + "step": 135920 + }, + { + "epoch": 19.294535131298794, + "grad_norm": 10.2322359085083, + "learning_rate": 8.071355571327182e-05, + "loss": 0.04433683156967163, + "step": 135930 + }, + { + "epoch": 19.295954577714692, + "grad_norm": 0.38884854316711426, + "learning_rate": 8.071213626685594e-05, + "loss": 0.030773085355758668, + "step": 135940 + }, + { + "epoch": 19.29737402413059, + "grad_norm": 
0.6468043923377991, + "learning_rate": 8.071071682044003e-05, + "loss": 0.02660565674304962, + "step": 135950 + }, + { + "epoch": 19.298793470546485, + "grad_norm": 0.015155358240008354, + "learning_rate": 8.070929737402414e-05, + "loss": 0.015232937037944793, + "step": 135960 + }, + { + "epoch": 19.300212916962384, + "grad_norm": 0.1312216967344284, + "learning_rate": 8.070787792760823e-05, + "loss": 0.06477776765823365, + "step": 135970 + }, + { + "epoch": 19.301632363378282, + "grad_norm": 0.007115287706255913, + "learning_rate": 8.070645848119234e-05, + "loss": 0.00944189801812172, + "step": 135980 + }, + { + "epoch": 19.30305180979418, + "grad_norm": 2.0040953159332275, + "learning_rate": 8.070503903477644e-05, + "loss": 0.02078152447938919, + "step": 135990 + }, + { + "epoch": 19.30447125621008, + "grad_norm": 0.0038018589839339256, + "learning_rate": 8.070361958836055e-05, + "loss": 0.004979583621025086, + "step": 136000 + }, + { + "epoch": 19.30447125621008, + "eval_accuracy": 0.9849939594328225, + "eval_loss": 0.0570266917347908, + "eval_runtime": 33.364, + "eval_samples_per_second": 471.376, + "eval_steps_per_second": 14.746, + "step": 136000 + }, + { + "epoch": 19.305890702625977, + "grad_norm": 0.06633606553077698, + "learning_rate": 8.070220014194464e-05, + "loss": 0.0009724553674459457, + "step": 136010 + }, + { + "epoch": 19.307310149041875, + "grad_norm": 3.7323360443115234, + "learning_rate": 8.070078069552874e-05, + "loss": 0.009088954329490662, + "step": 136020 + }, + { + "epoch": 19.30872959545777, + "grad_norm": 0.17580485343933105, + "learning_rate": 8.069936124911285e-05, + "loss": 0.010520386695861816, + "step": 136030 + }, + { + "epoch": 19.310149041873668, + "grad_norm": 3.0736656188964844, + "learning_rate": 8.069794180269695e-05, + "loss": 0.010726400464773179, + "step": 136040 + }, + { + "epoch": 19.311568488289566, + "grad_norm": 12.675605773925781, + "learning_rate": 8.069652235628106e-05, + "loss": 0.020539800822734832, + "step": 
136050 + }, + { + "epoch": 19.312987934705465, + "grad_norm": 14.800597190856934, + "learning_rate": 8.069510290986516e-05, + "loss": 0.03159726858139038, + "step": 136060 + }, + { + "epoch": 19.314407381121363, + "grad_norm": 0.014132910408079624, + "learning_rate": 8.069368346344926e-05, + "loss": 0.007642312347888947, + "step": 136070 + }, + { + "epoch": 19.31582682753726, + "grad_norm": 0.3927164077758789, + "learning_rate": 8.069226401703335e-05, + "loss": 0.02600858509540558, + "step": 136080 + }, + { + "epoch": 19.31724627395316, + "grad_norm": 0.3638625741004944, + "learning_rate": 8.069084457061746e-05, + "loss": 0.013808271288871765, + "step": 136090 + }, + { + "epoch": 19.318665720369054, + "grad_norm": 0.25037309527397156, + "learning_rate": 8.068942512420156e-05, + "loss": 0.0123081237077713, + "step": 136100 + }, + { + "epoch": 19.320085166784953, + "grad_norm": 0.08696846663951874, + "learning_rate": 8.068800567778567e-05, + "loss": 0.0217467337846756, + "step": 136110 + }, + { + "epoch": 19.32150461320085, + "grad_norm": 0.10587045550346375, + "learning_rate": 8.068658623136977e-05, + "loss": 0.010732536017894746, + "step": 136120 + }, + { + "epoch": 19.32292405961675, + "grad_norm": 2.229795455932617, + "learning_rate": 8.068516678495387e-05, + "loss": 0.019755491614341737, + "step": 136130 + }, + { + "epoch": 19.324343506032648, + "grad_norm": 0.13305720686912537, + "learning_rate": 8.068374733853798e-05, + "loss": 0.011332526803016663, + "step": 136140 + }, + { + "epoch": 19.325762952448546, + "grad_norm": 0.05416805297136307, + "learning_rate": 8.068232789212208e-05, + "loss": 0.010781797766685485, + "step": 136150 + }, + { + "epoch": 19.327182398864444, + "grad_norm": 0.052029553800821304, + "learning_rate": 8.068090844570619e-05, + "loss": 0.025164490938186644, + "step": 136160 + }, + { + "epoch": 19.32860184528034, + "grad_norm": 0.32842883467674255, + "learning_rate": 8.067948899929027e-05, + "loss": 0.00847855508327484, + "step": 136170 + 
}, + { + "epoch": 19.330021291696237, + "grad_norm": 1.8143550157546997, + "learning_rate": 8.067806955287438e-05, + "loss": 0.015515325963497162, + "step": 136180 + }, + { + "epoch": 19.331440738112136, + "grad_norm": 8.770236015319824, + "learning_rate": 8.067665010645848e-05, + "loss": 0.03948104083538055, + "step": 136190 + }, + { + "epoch": 19.332860184528034, + "grad_norm": 11.575773239135742, + "learning_rate": 8.067523066004259e-05, + "loss": 0.042207008600234984, + "step": 136200 + }, + { + "epoch": 19.334279630943932, + "grad_norm": 0.3276245892047882, + "learning_rate": 8.067381121362669e-05, + "loss": 0.023077908158302306, + "step": 136210 + }, + { + "epoch": 19.33569907735983, + "grad_norm": 2.758852243423462, + "learning_rate": 8.067239176721078e-05, + "loss": 0.08552910685539246, + "step": 136220 + }, + { + "epoch": 19.33711852377573, + "grad_norm": 3.7656126022338867, + "learning_rate": 8.06709723207949e-05, + "loss": 0.011476999521255494, + "step": 136230 + }, + { + "epoch": 19.338537970191624, + "grad_norm": 8.272939682006836, + "learning_rate": 8.066955287437899e-05, + "loss": 0.02469935715198517, + "step": 136240 + }, + { + "epoch": 19.339957416607522, + "grad_norm": 0.0628243237733841, + "learning_rate": 8.06681334279631e-05, + "loss": 0.007306870818138122, + "step": 136250 + }, + { + "epoch": 19.34137686302342, + "grad_norm": 10.782951354980469, + "learning_rate": 8.06667139815472e-05, + "loss": 0.011383648216724395, + "step": 136260 + }, + { + "epoch": 19.34279630943932, + "grad_norm": 0.1845972090959549, + "learning_rate": 8.06652945351313e-05, + "loss": 0.020281805098056792, + "step": 136270 + }, + { + "epoch": 19.344215755855217, + "grad_norm": 1.6857872009277344, + "learning_rate": 8.06638750887154e-05, + "loss": 0.0024313628673553467, + "step": 136280 + }, + { + "epoch": 19.345635202271115, + "grad_norm": 0.6315666437149048, + "learning_rate": 8.06624556422995e-05, + "loss": 0.027705147862434387, + "step": 136290 + }, + { + "epoch": 
19.347054648687013, + "grad_norm": 5.49241828918457, + "learning_rate": 8.06610361958836e-05, + "loss": 0.006352344155311584, + "step": 136300 + }, + { + "epoch": 19.348474095102908, + "grad_norm": 0.10188443958759308, + "learning_rate": 8.065961674946771e-05, + "loss": 0.023860082030296326, + "step": 136310 + }, + { + "epoch": 19.349893541518806, + "grad_norm": 2.3538029193878174, + "learning_rate": 8.065819730305183e-05, + "loss": 0.0065624013543128966, + "step": 136320 + }, + { + "epoch": 19.351312987934705, + "grad_norm": 5.646554946899414, + "learning_rate": 8.065677785663591e-05, + "loss": 0.012268754839897155, + "step": 136330 + }, + { + "epoch": 19.352732434350603, + "grad_norm": 0.18078984320163727, + "learning_rate": 8.065535841022002e-05, + "loss": 0.008300693333148956, + "step": 136340 + }, + { + "epoch": 19.3541518807665, + "grad_norm": 0.032229866832494736, + "learning_rate": 8.065393896380412e-05, + "loss": 0.01744912415742874, + "step": 136350 + }, + { + "epoch": 19.3555713271824, + "grad_norm": 0.04063647240400314, + "learning_rate": 8.065251951738823e-05, + "loss": 0.010966229438781738, + "step": 136360 + }, + { + "epoch": 19.356990773598298, + "grad_norm": 0.2905770540237427, + "learning_rate": 8.065110007097233e-05, + "loss": 0.03950079083442688, + "step": 136370 + }, + { + "epoch": 19.358410220014193, + "grad_norm": 0.28508496284484863, + "learning_rate": 8.064968062455642e-05, + "loss": 0.01320267766714096, + "step": 136380 + }, + { + "epoch": 19.35982966643009, + "grad_norm": 0.727344274520874, + "learning_rate": 8.064826117814052e-05, + "loss": 0.020682677626609802, + "step": 136390 + }, + { + "epoch": 19.36124911284599, + "grad_norm": 0.1807493418455124, + "learning_rate": 8.064684173172463e-05, + "loss": 0.009905187785625458, + "step": 136400 + }, + { + "epoch": 19.362668559261888, + "grad_norm": 10.234251976013184, + "learning_rate": 8.064542228530874e-05, + "loss": 0.04010664820671082, + "step": 136410 + }, + { + "epoch": 
19.364088005677786, + "grad_norm": 0.504630446434021, + "learning_rate": 8.064400283889284e-05, + "loss": 0.013863730430603027, + "step": 136420 + }, + { + "epoch": 19.365507452093684, + "grad_norm": 0.6506510376930237, + "learning_rate": 8.064258339247694e-05, + "loss": 0.0075215592980384825, + "step": 136430 + }, + { + "epoch": 19.366926898509583, + "grad_norm": 0.2712944447994232, + "learning_rate": 8.064116394606103e-05, + "loss": 0.01054297536611557, + "step": 136440 + }, + { + "epoch": 19.368346344925477, + "grad_norm": 0.02783302217721939, + "learning_rate": 8.063974449964515e-05, + "loss": 0.01245361790060997, + "step": 136450 + }, + { + "epoch": 19.369765791341376, + "grad_norm": 0.8528205156326294, + "learning_rate": 8.063832505322924e-05, + "loss": 0.05546398758888245, + "step": 136460 + }, + { + "epoch": 19.371185237757274, + "grad_norm": 3.5433225631713867, + "learning_rate": 8.063690560681335e-05, + "loss": 0.010478836297988892, + "step": 136470 + }, + { + "epoch": 19.372604684173172, + "grad_norm": 2.782099485397339, + "learning_rate": 8.063548616039744e-05, + "loss": 0.006739428639411927, + "step": 136480 + }, + { + "epoch": 19.37402413058907, + "grad_norm": 0.13132494688034058, + "learning_rate": 8.063406671398155e-05, + "loss": 0.0035120531916618346, + "step": 136490 + }, + { + "epoch": 19.37544357700497, + "grad_norm": 0.8922194838523865, + "learning_rate": 8.063264726756566e-05, + "loss": 0.018763212859630583, + "step": 136500 + }, + { + "epoch": 19.37544357700497, + "eval_accuracy": 0.9877916958097539, + "eval_loss": 0.042008642107248306, + "eval_runtime": 33.4344, + "eval_samples_per_second": 470.383, + "eval_steps_per_second": 14.715, + "step": 136500 + }, + { + "epoch": 19.376863023420867, + "grad_norm": 1.9103628396987915, + "learning_rate": 8.063122782114976e-05, + "loss": 0.03362273871898651, + "step": 136510 + }, + { + "epoch": 19.378282469836762, + "grad_norm": 0.08694620430469513, + "learning_rate": 8.062980837473387e-05, + "loss": 
0.021892617642879485, + "step": 136520 + }, + { + "epoch": 19.37970191625266, + "grad_norm": 6.578057765960693, + "learning_rate": 8.062838892831795e-05, + "loss": 0.026524189114570617, + "step": 136530 + }, + { + "epoch": 19.38112136266856, + "grad_norm": 0.4580860137939453, + "learning_rate": 8.062696948190206e-05, + "loss": 0.018569007515907288, + "step": 136540 + }, + { + "epoch": 19.382540809084457, + "grad_norm": 7.30665397644043, + "learning_rate": 8.062555003548616e-05, + "loss": 0.03951026201248169, + "step": 136550 + }, + { + "epoch": 19.383960255500355, + "grad_norm": 6.436558246612549, + "learning_rate": 8.062413058907027e-05, + "loss": 0.02224210202693939, + "step": 136560 + }, + { + "epoch": 19.385379701916253, + "grad_norm": 0.022254034876823425, + "learning_rate": 8.062271114265437e-05, + "loss": 0.06707167029380798, + "step": 136570 + }, + { + "epoch": 19.386799148332152, + "grad_norm": 1.2745411396026611, + "learning_rate": 8.062129169623847e-05, + "loss": 0.022767841815948486, + "step": 136580 + }, + { + "epoch": 19.388218594748047, + "grad_norm": 0.22142945230007172, + "learning_rate": 8.061987224982258e-05, + "loss": 0.06356111168861389, + "step": 136590 + }, + { + "epoch": 19.389638041163945, + "grad_norm": 0.017324067652225494, + "learning_rate": 8.061845280340667e-05, + "loss": 0.008104667067527771, + "step": 136600 + }, + { + "epoch": 19.391057487579843, + "grad_norm": 1.24216628074646, + "learning_rate": 8.061703335699078e-05, + "loss": 0.01950208395719528, + "step": 136610 + }, + { + "epoch": 19.39247693399574, + "grad_norm": 1.775840401649475, + "learning_rate": 8.061561391057488e-05, + "loss": 0.005649669468402863, + "step": 136620 + }, + { + "epoch": 19.39389638041164, + "grad_norm": 0.4429466128349304, + "learning_rate": 8.061419446415898e-05, + "loss": 0.0057801961898803714, + "step": 136630 + }, + { + "epoch": 19.395315826827538, + "grad_norm": 0.035086508840322495, + "learning_rate": 8.061277501774308e-05, + "loss": 
0.010500229895114899, + "step": 136640 + }, + { + "epoch": 19.396735273243436, + "grad_norm": 0.6924319863319397, + "learning_rate": 8.061135557132719e-05, + "loss": 0.004986953735351562, + "step": 136650 + }, + { + "epoch": 19.39815471965933, + "grad_norm": 1.0726513862609863, + "learning_rate": 8.060993612491129e-05, + "loss": 0.040648224949836734, + "step": 136660 + }, + { + "epoch": 19.39957416607523, + "grad_norm": 8.95368766784668, + "learning_rate": 8.06085166784954e-05, + "loss": 0.02026577889919281, + "step": 136670 + }, + { + "epoch": 19.400993612491128, + "grad_norm": 0.01722809486091137, + "learning_rate": 8.06070972320795e-05, + "loss": 0.01899726241827011, + "step": 136680 + }, + { + "epoch": 19.402413058907026, + "grad_norm": 9.28128719329834, + "learning_rate": 8.060567778566359e-05, + "loss": 0.016845521330833436, + "step": 136690 + }, + { + "epoch": 19.403832505322924, + "grad_norm": 6.689579486846924, + "learning_rate": 8.06042583392477e-05, + "loss": 0.03843151926994324, + "step": 136700 + }, + { + "epoch": 19.405251951738823, + "grad_norm": 0.0509391613304615, + "learning_rate": 8.06028388928318e-05, + "loss": 0.008636415004730225, + "step": 136710 + }, + { + "epoch": 19.40667139815472, + "grad_norm": 0.10041744261980057, + "learning_rate": 8.060141944641591e-05, + "loss": 0.03410317003726959, + "step": 136720 + }, + { + "epoch": 19.408090844570616, + "grad_norm": 1.3901797533035278, + "learning_rate": 8.060000000000001e-05, + "loss": 0.011419706791639329, + "step": 136730 + }, + { + "epoch": 19.409510290986514, + "grad_norm": 0.08800628036260605, + "learning_rate": 8.05985805535841e-05, + "loss": 0.019385628402233124, + "step": 136740 + }, + { + "epoch": 19.410929737402412, + "grad_norm": 2.3375251293182373, + "learning_rate": 8.05971611071682e-05, + "loss": 0.040280142426490785, + "step": 136750 + }, + { + "epoch": 19.41234918381831, + "grad_norm": 1.1224855184555054, + "learning_rate": 8.059574166075231e-05, + "loss": 0.01825178861618042, + 
"step": 136760 + }, + { + "epoch": 19.41376863023421, + "grad_norm": 2.9055190086364746, + "learning_rate": 8.059432221433641e-05, + "loss": 0.03754159212112427, + "step": 136770 + }, + { + "epoch": 19.415188076650107, + "grad_norm": 11.777843475341797, + "learning_rate": 8.059290276792052e-05, + "loss": 0.05444482564926147, + "step": 136780 + }, + { + "epoch": 19.416607523066006, + "grad_norm": 0.005288857501000166, + "learning_rate": 8.059148332150462e-05, + "loss": 0.06572566032409669, + "step": 136790 + }, + { + "epoch": 19.4180269694819, + "grad_norm": 0.06092311069369316, + "learning_rate": 8.059006387508872e-05, + "loss": 0.039466175436973575, + "step": 136800 + }, + { + "epoch": 19.4194464158978, + "grad_norm": 0.3441387116909027, + "learning_rate": 8.058864442867283e-05, + "loss": 0.032243740558624265, + "step": 136810 + }, + { + "epoch": 19.420865862313697, + "grad_norm": 0.6930990219116211, + "learning_rate": 8.058722498225692e-05, + "loss": 0.005177357420325279, + "step": 136820 + }, + { + "epoch": 19.422285308729595, + "grad_norm": 7.753748416900635, + "learning_rate": 8.058580553584104e-05, + "loss": 0.0090839721262455, + "step": 136830 + }, + { + "epoch": 19.423704755145494, + "grad_norm": 0.012613370083272457, + "learning_rate": 8.058438608942512e-05, + "loss": 0.0094209223985672, + "step": 136840 + }, + { + "epoch": 19.425124201561392, + "grad_norm": 6.884276866912842, + "learning_rate": 8.058296664300923e-05, + "loss": 0.013114632666110992, + "step": 136850 + }, + { + "epoch": 19.42654364797729, + "grad_norm": 0.020718177780508995, + "learning_rate": 8.058154719659333e-05, + "loss": 0.006486819684505462, + "step": 136860 + }, + { + "epoch": 19.427963094393185, + "grad_norm": 3.612637996673584, + "learning_rate": 8.058012775017744e-05, + "loss": 0.022154442965984344, + "step": 136870 + }, + { + "epoch": 19.429382540809083, + "grad_norm": 5.871427536010742, + "learning_rate": 8.057870830376154e-05, + "loss": 0.04860754907131195, + "step": 136880 + 
}, + { + "epoch": 19.43080198722498, + "grad_norm": 1.2898229360580444, + "learning_rate": 8.057728885734563e-05, + "loss": 0.029851025342941283, + "step": 136890 + }, + { + "epoch": 19.43222143364088, + "grad_norm": 0.04951537400484085, + "learning_rate": 8.057586941092974e-05, + "loss": 0.006623544543981552, + "step": 136900 + }, + { + "epoch": 19.433640880056778, + "grad_norm": 0.006419728510081768, + "learning_rate": 8.057444996451384e-05, + "loss": 0.007638537138700485, + "step": 136910 + }, + { + "epoch": 19.435060326472676, + "grad_norm": 2.1527960300445557, + "learning_rate": 8.057303051809795e-05, + "loss": 0.004995374009013176, + "step": 136920 + }, + { + "epoch": 19.436479772888575, + "grad_norm": 0.4171311855316162, + "learning_rate": 8.057161107168205e-05, + "loss": 0.02046493738889694, + "step": 136930 + }, + { + "epoch": 19.43789921930447, + "grad_norm": 3.070953607559204, + "learning_rate": 8.057019162526615e-05, + "loss": 0.00913369506597519, + "step": 136940 + }, + { + "epoch": 19.439318665720368, + "grad_norm": 0.25687679648399353, + "learning_rate": 8.056877217885024e-05, + "loss": 0.005855118855834007, + "step": 136950 + }, + { + "epoch": 19.440738112136266, + "grad_norm": 0.04907587170600891, + "learning_rate": 8.056735273243436e-05, + "loss": 0.022059416770935057, + "step": 136960 + }, + { + "epoch": 19.442157558552164, + "grad_norm": 1.4365266561508179, + "learning_rate": 8.056593328601845e-05, + "loss": 0.025617489218711854, + "step": 136970 + }, + { + "epoch": 19.443577004968063, + "grad_norm": 0.06065221130847931, + "learning_rate": 8.056451383960256e-05, + "loss": 0.011717283725738525, + "step": 136980 + }, + { + "epoch": 19.44499645138396, + "grad_norm": 0.294193834066391, + "learning_rate": 8.056309439318666e-05, + "loss": 0.017100825905799866, + "step": 136990 + }, + { + "epoch": 19.44641589779986, + "grad_norm": 0.29871729016304016, + "learning_rate": 8.056167494677076e-05, + "loss": 0.022216755151748657, + "step": 137000 + }, + { + 
"epoch": 19.44641589779986, + "eval_accuracy": 0.9853118840211101, + "eval_loss": 0.056656353175640106, + "eval_runtime": 33.7296, + "eval_samples_per_second": 466.268, + "eval_steps_per_second": 14.587, + "step": 137000 + }, + { + "epoch": 19.447835344215754, + "grad_norm": 15.346325874328613, + "learning_rate": 8.056025550035487e-05, + "loss": 0.05268167853355408, + "step": 137010 + }, + { + "epoch": 19.449254790631652, + "grad_norm": 0.06122482940554619, + "learning_rate": 8.055883605393897e-05, + "loss": 0.001168125495314598, + "step": 137020 + }, + { + "epoch": 19.45067423704755, + "grad_norm": 0.06554862856864929, + "learning_rate": 8.055741660752308e-05, + "loss": 0.03985010087490082, + "step": 137030 + }, + { + "epoch": 19.45209368346345, + "grad_norm": 0.04417649656534195, + "learning_rate": 8.055599716110718e-05, + "loss": 0.0025506459176540376, + "step": 137040 + }, + { + "epoch": 19.453513129879347, + "grad_norm": 0.17910021543502808, + "learning_rate": 8.055457771469127e-05, + "loss": 0.011689558625221252, + "step": 137050 + }, + { + "epoch": 19.454932576295246, + "grad_norm": 7.048887252807617, + "learning_rate": 8.055315826827537e-05, + "loss": 0.011845612525939941, + "step": 137060 + }, + { + "epoch": 19.456352022711144, + "grad_norm": 0.5460303425788879, + "learning_rate": 8.055188076650107e-05, + "loss": 0.04180977344512939, + "step": 137070 + }, + { + "epoch": 19.45777146912704, + "grad_norm": 0.05867021530866623, + "learning_rate": 8.055046132008517e-05, + "loss": 0.026071444153785706, + "step": 137080 + }, + { + "epoch": 19.459190915542937, + "grad_norm": 0.015164356678724289, + "learning_rate": 8.054904187366928e-05, + "loss": 0.00244579054415226, + "step": 137090 + }, + { + "epoch": 19.460610361958835, + "grad_norm": 1.2402942180633545, + "learning_rate": 8.054762242725337e-05, + "loss": 0.013297773897647858, + "step": 137100 + }, + { + "epoch": 19.462029808374734, + "grad_norm": 0.3962368369102478, + "learning_rate": 8.054620298083749e-05, + 
"loss": 0.0031079955399036406, + "step": 137110 + }, + { + "epoch": 19.463449254790632, + "grad_norm": 0.12435641884803772, + "learning_rate": 8.054478353442157e-05, + "loss": 0.009549376368522645, + "step": 137120 + }, + { + "epoch": 19.46486870120653, + "grad_norm": 0.3245566487312317, + "learning_rate": 8.054336408800568e-05, + "loss": 0.025057733058929443, + "step": 137130 + }, + { + "epoch": 19.46628814762243, + "grad_norm": 0.7982626557350159, + "learning_rate": 8.054194464158978e-05, + "loss": 0.022643449902534484, + "step": 137140 + }, + { + "epoch": 19.467707594038323, + "grad_norm": 0.22677786648273468, + "learning_rate": 8.054052519517389e-05, + "loss": 0.019562676548957825, + "step": 137150 + }, + { + "epoch": 19.46912704045422, + "grad_norm": 0.8642492294311523, + "learning_rate": 8.0539105748758e-05, + "loss": 0.0022503107786178587, + "step": 137160 + }, + { + "epoch": 19.47054648687012, + "grad_norm": 0.018497856333851814, + "learning_rate": 8.053768630234208e-05, + "loss": 0.015945518016815187, + "step": 137170 + }, + { + "epoch": 19.471965933286018, + "grad_norm": 0.13687346875667572, + "learning_rate": 8.05362668559262e-05, + "loss": 0.02993578016757965, + "step": 137180 + }, + { + "epoch": 19.473385379701917, + "grad_norm": 0.4365893304347992, + "learning_rate": 8.053484740951029e-05, + "loss": 0.013093468546867371, + "step": 137190 + }, + { + "epoch": 19.474804826117815, + "grad_norm": 0.4910050332546234, + "learning_rate": 8.05334279630944e-05, + "loss": 0.030686333775520325, + "step": 137200 + }, + { + "epoch": 19.476224272533713, + "grad_norm": 0.07861913740634918, + "learning_rate": 8.05320085166785e-05, + "loss": 0.036781036853790285, + "step": 137210 + }, + { + "epoch": 19.477643718949608, + "grad_norm": 0.2435026317834854, + "learning_rate": 8.05305890702626e-05, + "loss": 0.008760906010866164, + "step": 137220 + }, + { + "epoch": 19.479063165365506, + "grad_norm": 0.11083705723285675, + "learning_rate": 8.05291696238467e-05, + "loss": 
0.025510281324386597, + "step": 137230 + }, + { + "epoch": 19.480482611781405, + "grad_norm": 6.562149524688721, + "learning_rate": 8.05277501774308e-05, + "loss": 0.013110196590423584, + "step": 137240 + }, + { + "epoch": 19.481902058197303, + "grad_norm": 0.015752902254462242, + "learning_rate": 8.052633073101492e-05, + "loss": 0.004713873565196991, + "step": 137250 + }, + { + "epoch": 19.4833215046132, + "grad_norm": 0.4762270152568817, + "learning_rate": 8.052491128459901e-05, + "loss": 0.0068397440016269686, + "step": 137260 + }, + { + "epoch": 19.4847409510291, + "grad_norm": 1.1166408061981201, + "learning_rate": 8.052349183818311e-05, + "loss": 0.006467262655496598, + "step": 137270 + }, + { + "epoch": 19.486160397444998, + "grad_norm": 2.362943649291992, + "learning_rate": 8.052207239176721e-05, + "loss": 0.0054939169436693195, + "step": 137280 + }, + { + "epoch": 19.487579843860892, + "grad_norm": 6.425047874450684, + "learning_rate": 8.052065294535132e-05, + "loss": 0.05379377603530884, + "step": 137290 + }, + { + "epoch": 19.48899929027679, + "grad_norm": 0.8402780294418335, + "learning_rate": 8.051923349893542e-05, + "loss": 0.002586003392934799, + "step": 137300 + }, + { + "epoch": 19.49041873669269, + "grad_norm": 2.7678849697113037, + "learning_rate": 8.051795599716112e-05, + "loss": 0.0818613350391388, + "step": 137310 + }, + { + "epoch": 19.491838183108587, + "grad_norm": 0.03431839123368263, + "learning_rate": 8.051653655074521e-05, + "loss": 0.013517959415912629, + "step": 137320 + }, + { + "epoch": 19.493257629524486, + "grad_norm": 0.013028129935264587, + "learning_rate": 8.051511710432932e-05, + "loss": 0.03265658915042877, + "step": 137330 + }, + { + "epoch": 19.494677075940384, + "grad_norm": 2.9575202465057373, + "learning_rate": 8.051369765791342e-05, + "loss": 0.015177388489246369, + "step": 137340 + }, + { + "epoch": 19.496096522356282, + "grad_norm": 0.03283777832984924, + "learning_rate": 8.051227821149752e-05, + "loss": 
0.034216710925102235, + "step": 137350 + }, + { + "epoch": 19.497515968772177, + "grad_norm": 0.10726157575845718, + "learning_rate": 8.051085876508162e-05, + "loss": 0.013336297869682313, + "step": 137360 + }, + { + "epoch": 19.498935415188075, + "grad_norm": 0.10614597797393799, + "learning_rate": 8.050943931866573e-05, + "loss": 0.0020912285894155503, + "step": 137370 + }, + { + "epoch": 19.500354861603974, + "grad_norm": 0.15989543497562408, + "learning_rate": 8.050801987224982e-05, + "loss": 0.005193191021680832, + "step": 137380 + }, + { + "epoch": 19.501774308019872, + "grad_norm": 0.03939535468816757, + "learning_rate": 8.050660042583393e-05, + "loss": 0.001727219671010971, + "step": 137390 + }, + { + "epoch": 19.50319375443577, + "grad_norm": 0.04129151254892349, + "learning_rate": 8.050518097941803e-05, + "loss": 0.009687118232250214, + "step": 137400 + }, + { + "epoch": 19.50461320085167, + "grad_norm": 9.898896217346191, + "learning_rate": 8.050376153300213e-05, + "loss": 0.02417587786912918, + "step": 137410 + }, + { + "epoch": 19.506032647267567, + "grad_norm": 0.6055755019187927, + "learning_rate": 8.050234208658624e-05, + "loss": 0.012402527779340745, + "step": 137420 + }, + { + "epoch": 19.50745209368346, + "grad_norm": 0.5404500365257263, + "learning_rate": 8.050092264017034e-05, + "loss": 0.01648992598056793, + "step": 137430 + }, + { + "epoch": 19.50887154009936, + "grad_norm": 4.68674898147583, + "learning_rate": 8.049950319375445e-05, + "loss": 0.028700920939445495, + "step": 137440 + }, + { + "epoch": 19.51029098651526, + "grad_norm": 6.661839485168457, + "learning_rate": 8.049808374733853e-05, + "loss": 0.023427054286003113, + "step": 137450 + }, + { + "epoch": 19.511710432931157, + "grad_norm": 0.07466240972280502, + "learning_rate": 8.049666430092264e-05, + "loss": 0.015752318501472472, + "step": 137460 + }, + { + "epoch": 19.513129879347055, + "grad_norm": 0.1380309760570526, + "learning_rate": 8.049524485450674e-05, + "loss": 
0.009265802055597305, + "step": 137470 + }, + { + "epoch": 19.514549325762953, + "grad_norm": 0.3679251968860626, + "learning_rate": 8.049382540809085e-05, + "loss": 0.002256740629673004, + "step": 137480 + }, + { + "epoch": 19.51596877217885, + "grad_norm": 0.18063372373580933, + "learning_rate": 8.049240596167495e-05, + "loss": 0.0041745990514755246, + "step": 137490 + }, + { + "epoch": 19.517388218594746, + "grad_norm": 8.763851165771484, + "learning_rate": 8.049098651525905e-05, + "loss": 0.0062512621283531185, + "step": 137500 + }, + { + "epoch": 19.517388218594746, + "eval_accuracy": 0.988046035480384, + "eval_loss": 0.05227066949009895, + "eval_runtime": 34.1858, + "eval_samples_per_second": 460.045, + "eval_steps_per_second": 14.392, + "step": 137500 + }, + { + "epoch": 19.518807665010645, + "grad_norm": 4.479283809661865, + "learning_rate": 8.048956706884316e-05, + "loss": 0.007906591892242432, + "step": 137510 + }, + { + "epoch": 19.520227111426543, + "grad_norm": 4.453495025634766, + "learning_rate": 8.048814762242725e-05, + "loss": 0.007586041092872619, + "step": 137520 + }, + { + "epoch": 19.52164655784244, + "grad_norm": 2.1542985439300537, + "learning_rate": 8.048672817601137e-05, + "loss": 0.01741510033607483, + "step": 137530 + }, + { + "epoch": 19.52306600425834, + "grad_norm": 0.005334902089089155, + "learning_rate": 8.048530872959546e-05, + "loss": 0.02001422792673111, + "step": 137540 + }, + { + "epoch": 19.524485450674238, + "grad_norm": 21.645727157592773, + "learning_rate": 8.048388928317956e-05, + "loss": 0.029499968886375426, + "step": 137550 + }, + { + "epoch": 19.525904897090136, + "grad_norm": 7.598186492919922, + "learning_rate": 8.048246983676366e-05, + "loss": 0.077846360206604, + "step": 137560 + }, + { + "epoch": 19.52732434350603, + "grad_norm": 0.08335864543914795, + "learning_rate": 8.048105039034777e-05, + "loss": 0.0755756139755249, + "step": 137570 + }, + { + "epoch": 19.52874378992193, + "grad_norm": 11.596985816955566, + 
"learning_rate": 8.047963094393187e-05, + "loss": 0.04176556766033172, + "step": 137580 + }, + { + "epoch": 19.530163236337827, + "grad_norm": 0.6123652458190918, + "learning_rate": 8.047821149751598e-05, + "loss": 0.005750620737671852, + "step": 137590 + }, + { + "epoch": 19.531582682753726, + "grad_norm": 0.0034010582603514194, + "learning_rate": 8.047679205110007e-05, + "loss": 0.002270437404513359, + "step": 137600 + }, + { + "epoch": 19.533002129169624, + "grad_norm": 0.09473180025815964, + "learning_rate": 8.047537260468417e-05, + "loss": 0.006962215900421143, + "step": 137610 + }, + { + "epoch": 19.534421575585522, + "grad_norm": 1.2294211387634277, + "learning_rate": 8.047395315826828e-05, + "loss": 0.023443740606307984, + "step": 137620 + }, + { + "epoch": 19.53584102200142, + "grad_norm": 0.6309512853622437, + "learning_rate": 8.047253371185238e-05, + "loss": 0.03320013284683228, + "step": 137630 + }, + { + "epoch": 19.537260468417315, + "grad_norm": 2.6269350051879883, + "learning_rate": 8.047111426543649e-05, + "loss": 0.022580428421497344, + "step": 137640 + }, + { + "epoch": 19.538679914833214, + "grad_norm": 14.174617767333984, + "learning_rate": 8.046969481902059e-05, + "loss": 0.031341654062271115, + "step": 137650 + }, + { + "epoch": 19.540099361249112, + "grad_norm": 0.0707431212067604, + "learning_rate": 8.046827537260469e-05, + "loss": 0.0052914883941411976, + "step": 137660 + }, + { + "epoch": 19.54151880766501, + "grad_norm": 6.212255954742432, + "learning_rate": 8.046685592618878e-05, + "loss": 0.008136822283267975, + "step": 137670 + }, + { + "epoch": 19.54293825408091, + "grad_norm": 0.12009210139513016, + "learning_rate": 8.04654364797729e-05, + "loss": 0.02860853374004364, + "step": 137680 + }, + { + "epoch": 19.544357700496807, + "grad_norm": 0.17526400089263916, + "learning_rate": 8.046401703335699e-05, + "loss": 0.0434247076511383, + "step": 137690 + }, + { + "epoch": 19.545777146912705, + "grad_norm": 0.8788767457008362, + 
"learning_rate": 8.04625975869411e-05, + "loss": 0.03675092458724975, + "step": 137700 + }, + { + "epoch": 19.5471965933286, + "grad_norm": 3.4288830757141113, + "learning_rate": 8.04611781405252e-05, + "loss": 0.022850652039051057, + "step": 137710 + }, + { + "epoch": 19.5486160397445, + "grad_norm": 12.267441749572754, + "learning_rate": 8.04597586941093e-05, + "loss": 0.04059075117111206, + "step": 137720 + }, + { + "epoch": 19.550035486160397, + "grad_norm": 6.900718688964844, + "learning_rate": 8.045833924769341e-05, + "loss": 0.0090223990380764, + "step": 137730 + }, + { + "epoch": 19.551454932576295, + "grad_norm": 0.49269208312034607, + "learning_rate": 8.04569198012775e-05, + "loss": 0.05740405917167664, + "step": 137740 + }, + { + "epoch": 19.552874378992193, + "grad_norm": 15.897799491882324, + "learning_rate": 8.045550035486162e-05, + "loss": 0.030545425415039063, + "step": 137750 + }, + { + "epoch": 19.55429382540809, + "grad_norm": 0.03522428870201111, + "learning_rate": 8.04540809084457e-05, + "loss": 0.008827247470617295, + "step": 137760 + }, + { + "epoch": 19.55571327182399, + "grad_norm": 0.38180023431777954, + "learning_rate": 8.045266146202981e-05, + "loss": 0.0034719817340373993, + "step": 137770 + }, + { + "epoch": 19.557132718239885, + "grad_norm": 0.09027215838432312, + "learning_rate": 8.045124201561391e-05, + "loss": 0.012767669558525086, + "step": 137780 + }, + { + "epoch": 19.558552164655783, + "grad_norm": 14.821966171264648, + "learning_rate": 8.044982256919802e-05, + "loss": 0.024404963850975035, + "step": 137790 + }, + { + "epoch": 19.55997161107168, + "grad_norm": 0.0908409133553505, + "learning_rate": 8.044840312278212e-05, + "loss": 0.004921406880021096, + "step": 137800 + }, + { + "epoch": 19.56139105748758, + "grad_norm": 7.770122051239014, + "learning_rate": 8.044698367636621e-05, + "loss": 0.012073308229446411, + "step": 137810 + }, + { + "epoch": 19.562810503903478, + "grad_norm": 11.927591323852539, + "learning_rate": 
8.044556422995033e-05, + "loss": 0.03332340717315674, + "step": 137820 + }, + { + "epoch": 19.564229950319376, + "grad_norm": 3.9551355838775635, + "learning_rate": 8.044414478353442e-05, + "loss": 0.02835809588432312, + "step": 137830 + }, + { + "epoch": 19.565649396735274, + "grad_norm": 0.30653974413871765, + "learning_rate": 8.044272533711853e-05, + "loss": 0.014580333232879638, + "step": 137840 + }, + { + "epoch": 19.56706884315117, + "grad_norm": 0.9868557453155518, + "learning_rate": 8.044130589070263e-05, + "loss": 0.029064875841140748, + "step": 137850 + }, + { + "epoch": 19.568488289567068, + "grad_norm": 8.199562072753906, + "learning_rate": 8.043988644428673e-05, + "loss": 0.04607329964637756, + "step": 137860 + }, + { + "epoch": 19.569907735982966, + "grad_norm": 0.038842763751745224, + "learning_rate": 8.043846699787083e-05, + "loss": 0.020066358149051666, + "step": 137870 + }, + { + "epoch": 19.571327182398864, + "grad_norm": 0.06427346915006638, + "learning_rate": 8.043704755145494e-05, + "loss": 0.020113299787044524, + "step": 137880 + }, + { + "epoch": 19.572746628814762, + "grad_norm": 0.018326854333281517, + "learning_rate": 8.043562810503903e-05, + "loss": 0.013383975625038147, + "step": 137890 + }, + { + "epoch": 19.57416607523066, + "grad_norm": 16.43416976928711, + "learning_rate": 8.043420865862314e-05, + "loss": 0.040350151062011716, + "step": 137900 + }, + { + "epoch": 19.57558552164656, + "grad_norm": 0.596812903881073, + "learning_rate": 8.043278921220724e-05, + "loss": 0.004540695995092392, + "step": 137910 + }, + { + "epoch": 19.577004968062454, + "grad_norm": 0.22771741449832916, + "learning_rate": 8.043136976579134e-05, + "loss": 0.005516242608428002, + "step": 137920 + }, + { + "epoch": 19.578424414478352, + "grad_norm": 0.12957964837551117, + "learning_rate": 8.042995031937545e-05, + "loss": 0.002573397010564804, + "step": 137930 + }, + { + "epoch": 19.57984386089425, + "grad_norm": 0.14717452228069305, + "learning_rate": 
8.042853087295955e-05, + "loss": 0.002750430628657341, + "step": 137940 + }, + { + "epoch": 19.58126330731015, + "grad_norm": 0.2578049302101135, + "learning_rate": 8.042711142654366e-05, + "loss": 0.015633463859558105, + "step": 137950 + }, + { + "epoch": 19.582682753726047, + "grad_norm": 0.0091329924762249, + "learning_rate": 8.042569198012774e-05, + "loss": 0.01024954691529274, + "step": 137960 + }, + { + "epoch": 19.584102200141945, + "grad_norm": 9.630097389221191, + "learning_rate": 8.042427253371185e-05, + "loss": 0.017204731702804565, + "step": 137970 + }, + { + "epoch": 19.585521646557844, + "grad_norm": 0.018298881128430367, + "learning_rate": 8.042285308729595e-05, + "loss": 0.0032829798758029936, + "step": 137980 + }, + { + "epoch": 19.58694109297374, + "grad_norm": 3.565556764602661, + "learning_rate": 8.042143364088006e-05, + "loss": 0.01979718804359436, + "step": 137990 + }, + { + "epoch": 19.588360539389637, + "grad_norm": 3.9285240173339844, + "learning_rate": 8.042001419446417e-05, + "loss": 0.023946885764598847, + "step": 138000 + }, + { + "epoch": 19.588360539389637, + "eval_accuracy": 0.9872830164684937, + "eval_loss": 0.050660859793424606, + "eval_runtime": 34.0231, + "eval_samples_per_second": 462.245, + "eval_steps_per_second": 14.461, + "step": 138000 + }, + { + "epoch": 19.589779985805535, + "grad_norm": 7.529336452484131, + "learning_rate": 8.041859474804827e-05, + "loss": 0.0438548356294632, + "step": 138010 + }, + { + "epoch": 19.591199432221433, + "grad_norm": 0.0073790960013866425, + "learning_rate": 8.041717530163237e-05, + "loss": 0.029686707258224487, + "step": 138020 + }, + { + "epoch": 19.59261887863733, + "grad_norm": 2.1580491065979004, + "learning_rate": 8.041575585521646e-05, + "loss": 0.022687962651252745, + "step": 138030 + }, + { + "epoch": 19.59403832505323, + "grad_norm": 6.535014629364014, + "learning_rate": 8.041433640880058e-05, + "loss": 0.008999032527208328, + "step": 138040 + }, + { + "epoch": 19.59545777146913, + 
"grad_norm": 3.3322510719299316, + "learning_rate": 8.041291696238467e-05, + "loss": 0.017945469915866853, + "step": 138050 + }, + { + "epoch": 19.596877217885023, + "grad_norm": 0.1896844357252121, + "learning_rate": 8.041149751596878e-05, + "loss": 0.017649631202220916, + "step": 138060 + }, + { + "epoch": 19.59829666430092, + "grad_norm": 6.295166015625, + "learning_rate": 8.041007806955287e-05, + "loss": 0.04750989973545074, + "step": 138070 + }, + { + "epoch": 19.59971611071682, + "grad_norm": 0.003022885648533702, + "learning_rate": 8.040865862313698e-05, + "loss": 0.01910801827907562, + "step": 138080 + }, + { + "epoch": 19.601135557132718, + "grad_norm": 14.71426773071289, + "learning_rate": 8.040723917672109e-05, + "loss": 0.03957200050354004, + "step": 138090 + }, + { + "epoch": 19.602555003548616, + "grad_norm": 0.1794685423374176, + "learning_rate": 8.040581973030519e-05, + "loss": 0.008308248966932297, + "step": 138100 + }, + { + "epoch": 19.603974449964515, + "grad_norm": 8.984095573425293, + "learning_rate": 8.04044002838893e-05, + "loss": 0.046796905994415286, + "step": 138110 + }, + { + "epoch": 19.605393896380413, + "grad_norm": 9.828641891479492, + "learning_rate": 8.040298083747338e-05, + "loss": 0.05411611199378967, + "step": 138120 + }, + { + "epoch": 19.606813342796308, + "grad_norm": 0.029189540073275566, + "learning_rate": 8.040156139105749e-05, + "loss": 0.04993451535701752, + "step": 138130 + }, + { + "epoch": 19.608232789212206, + "grad_norm": 1.7016907930374146, + "learning_rate": 8.040014194464159e-05, + "loss": 0.015500412881374359, + "step": 138140 + }, + { + "epoch": 19.609652235628104, + "grad_norm": 0.015035353600978851, + "learning_rate": 8.03987224982257e-05, + "loss": 0.019698965549468993, + "step": 138150 + }, + { + "epoch": 19.611071682044003, + "grad_norm": 12.345022201538086, + "learning_rate": 8.03973030518098e-05, + "loss": 0.03160671889781952, + "step": 138160 + }, + { + "epoch": 19.6124911284599, + "grad_norm": 
0.5339881777763367, + "learning_rate": 8.03958836053939e-05, + "loss": 0.007254537940025329, + "step": 138170 + }, + { + "epoch": 19.6139105748758, + "grad_norm": 0.9007466435432434, + "learning_rate": 8.039446415897801e-05, + "loss": 0.036528339982032774, + "step": 138180 + }, + { + "epoch": 19.615330021291697, + "grad_norm": 1.618058443069458, + "learning_rate": 8.03930447125621e-05, + "loss": 0.004342170059680938, + "step": 138190 + }, + { + "epoch": 19.616749467707596, + "grad_norm": 0.0197339728474617, + "learning_rate": 8.039162526614622e-05, + "loss": 0.055916589498519895, + "step": 138200 + }, + { + "epoch": 19.61816891412349, + "grad_norm": 0.10298982262611389, + "learning_rate": 8.039020581973031e-05, + "loss": 0.05364638566970825, + "step": 138210 + }, + { + "epoch": 19.61958836053939, + "grad_norm": 1.0998340845108032, + "learning_rate": 8.038878637331441e-05, + "loss": 0.03447889089584351, + "step": 138220 + }, + { + "epoch": 19.621007806955287, + "grad_norm": 1.7219066619873047, + "learning_rate": 8.038736692689851e-05, + "loss": 0.022036303579807282, + "step": 138230 + }, + { + "epoch": 19.622427253371185, + "grad_norm": 0.3388693928718567, + "learning_rate": 8.038594748048262e-05, + "loss": 0.030643409490585326, + "step": 138240 + }, + { + "epoch": 19.623846699787084, + "grad_norm": 0.4383505582809448, + "learning_rate": 8.038452803406672e-05, + "loss": 0.0045415710657835005, + "step": 138250 + }, + { + "epoch": 19.625266146202982, + "grad_norm": 2.166379451751709, + "learning_rate": 8.038310858765083e-05, + "loss": 0.022049549221992492, + "step": 138260 + }, + { + "epoch": 19.62668559261888, + "grad_norm": 11.74815845489502, + "learning_rate": 8.038168914123491e-05, + "loss": 0.015070411562919616, + "step": 138270 + }, + { + "epoch": 19.628105039034775, + "grad_norm": 13.44038200378418, + "learning_rate": 8.038026969481902e-05, + "loss": 0.033355483412742616, + "step": 138280 + }, + { + "epoch": 19.629524485450673, + "grad_norm": 
3.0908122062683105, + "learning_rate": 8.037885024840313e-05, + "loss": 0.018821220099925994, + "step": 138290 + }, + { + "epoch": 19.63094393186657, + "grad_norm": 0.18924617767333984, + "learning_rate": 8.037743080198723e-05, + "loss": 0.004630044847726822, + "step": 138300 + }, + { + "epoch": 19.63236337828247, + "grad_norm": 0.19256950914859772, + "learning_rate": 8.037601135557134e-05, + "loss": 0.010572195053100586, + "step": 138310 + }, + { + "epoch": 19.63378282469837, + "grad_norm": 0.09992443770170212, + "learning_rate": 8.037459190915542e-05, + "loss": 0.022526408731937408, + "step": 138320 + }, + { + "epoch": 19.635202271114267, + "grad_norm": 0.23397351801395416, + "learning_rate": 8.037317246273954e-05, + "loss": 0.0019818637520074844, + "step": 138330 + }, + { + "epoch": 19.636621717530165, + "grad_norm": 0.0030143605545163155, + "learning_rate": 8.037175301632363e-05, + "loss": 0.009634774923324586, + "step": 138340 + }, + { + "epoch": 19.63804116394606, + "grad_norm": 2.1461405754089355, + "learning_rate": 8.037033356990774e-05, + "loss": 0.015274564921855926, + "step": 138350 + }, + { + "epoch": 19.639460610361958, + "grad_norm": 0.09206445515155792, + "learning_rate": 8.036891412349184e-05, + "loss": 0.012372268736362458, + "step": 138360 + }, + { + "epoch": 19.640880056777856, + "grad_norm": 0.015879858285188675, + "learning_rate": 8.036749467707595e-05, + "loss": 0.028729519248008727, + "step": 138370 + }, + { + "epoch": 19.642299503193755, + "grad_norm": 0.01536844577640295, + "learning_rate": 8.036607523066005e-05, + "loss": 0.06816805005073548, + "step": 138380 + }, + { + "epoch": 19.643718949609653, + "grad_norm": 0.4715758264064789, + "learning_rate": 8.036465578424415e-05, + "loss": 0.03528289496898651, + "step": 138390 + }, + { + "epoch": 19.64513839602555, + "grad_norm": 5.921476364135742, + "learning_rate": 8.036323633782826e-05, + "loss": 0.017167089879512785, + "step": 138400 + }, + { + "epoch": 19.64655784244145, + "grad_norm": 
0.18063852190971375, + "learning_rate": 8.036181689141235e-05, + "loss": 0.03132513463497162, + "step": 138410 + }, + { + "epoch": 19.647977288857344, + "grad_norm": 2.3100228309631348, + "learning_rate": 8.036039744499647e-05, + "loss": 0.041241294145584105, + "step": 138420 + }, + { + "epoch": 19.649396735273243, + "grad_norm": 0.04371634125709534, + "learning_rate": 8.035897799858055e-05, + "loss": 0.042258650064468384, + "step": 138430 + }, + { + "epoch": 19.65081618168914, + "grad_norm": 4.336278438568115, + "learning_rate": 8.035755855216466e-05, + "loss": 0.033323565125465394, + "step": 138440 + }, + { + "epoch": 19.65223562810504, + "grad_norm": 0.19271036982536316, + "learning_rate": 8.035613910574876e-05, + "loss": 0.03666624128818512, + "step": 138450 + }, + { + "epoch": 19.653655074520938, + "grad_norm": 0.12941347062587738, + "learning_rate": 8.035471965933287e-05, + "loss": 0.00817318558692932, + "step": 138460 + }, + { + "epoch": 19.655074520936836, + "grad_norm": 1.3784598112106323, + "learning_rate": 8.035330021291697e-05, + "loss": 0.00503128431737423, + "step": 138470 + }, + { + "epoch": 19.656493967352734, + "grad_norm": 0.35918229818344116, + "learning_rate": 8.035188076650106e-05, + "loss": 0.011499008536338806, + "step": 138480 + }, + { + "epoch": 19.65791341376863, + "grad_norm": 1.7034169435501099, + "learning_rate": 8.035046132008517e-05, + "loss": 0.01096985787153244, + "step": 138490 + }, + { + "epoch": 19.659332860184527, + "grad_norm": 8.493868827819824, + "learning_rate": 8.034904187366927e-05, + "loss": 0.042811107635498044, + "step": 138500 + }, + { + "epoch": 19.659332860184527, + "eval_accuracy": 0.986011318115343, + "eval_loss": 0.051940690726041794, + "eval_runtime": 32.9397, + "eval_samples_per_second": 477.449, + "eval_steps_per_second": 14.936, + "step": 138500 + }, + { + "epoch": 19.660752306600425, + "grad_norm": 11.311059951782227, + "learning_rate": 8.034762242725338e-05, + "loss": 0.020223698019981383, + "step": 138510 + 
}, + { + "epoch": 19.662171753016324, + "grad_norm": 0.9719066619873047, + "learning_rate": 8.034620298083748e-05, + "loss": 0.03731773793697357, + "step": 138520 + }, + { + "epoch": 19.663591199432222, + "grad_norm": 0.04316421225667, + "learning_rate": 8.034478353442158e-05, + "loss": 0.006004315614700317, + "step": 138530 + }, + { + "epoch": 19.66501064584812, + "grad_norm": 0.058855392038822174, + "learning_rate": 8.034336408800568e-05, + "loss": 0.017175130546092987, + "step": 138540 + }, + { + "epoch": 19.66643009226402, + "grad_norm": 0.013701051473617554, + "learning_rate": 8.034194464158979e-05, + "loss": 0.05722183585166931, + "step": 138550 + }, + { + "epoch": 19.667849538679913, + "grad_norm": 0.054416071623563766, + "learning_rate": 8.034052519517388e-05, + "loss": 0.03281792104244232, + "step": 138560 + }, + { + "epoch": 19.669268985095812, + "grad_norm": 0.026844222098588943, + "learning_rate": 8.0339105748758e-05, + "loss": 0.03597028255462646, + "step": 138570 + }, + { + "epoch": 19.67068843151171, + "grad_norm": 0.2913174331188202, + "learning_rate": 8.033768630234209e-05, + "loss": 0.07092903852462769, + "step": 138580 + }, + { + "epoch": 19.67210787792761, + "grad_norm": 0.08362773805856705, + "learning_rate": 8.033626685592619e-05, + "loss": 0.011146743595600129, + "step": 138590 + }, + { + "epoch": 19.673527324343507, + "grad_norm": 5.899541854858398, + "learning_rate": 8.03348474095103e-05, + "loss": 0.01189286783337593, + "step": 138600 + }, + { + "epoch": 19.674946770759405, + "grad_norm": 2.3376224040985107, + "learning_rate": 8.03334279630944e-05, + "loss": 0.004916764795780182, + "step": 138610 + }, + { + "epoch": 19.676366217175303, + "grad_norm": 1.4671872854232788, + "learning_rate": 8.033200851667851e-05, + "loss": 0.015109889209270477, + "step": 138620 + }, + { + "epoch": 19.677785663591198, + "grad_norm": 0.12707369029521942, + "learning_rate": 8.033058907026259e-05, + "loss": 0.0031056158244609834, + "step": 138630 + }, + { + 
"epoch": 19.679205110007096, + "grad_norm": 0.43206775188446045, + "learning_rate": 8.03291696238467e-05, + "loss": 0.013545922935009003, + "step": 138640 + }, + { + "epoch": 19.680624556422995, + "grad_norm": 0.06616683304309845, + "learning_rate": 8.03277501774308e-05, + "loss": 0.01973980963230133, + "step": 138650 + }, + { + "epoch": 19.682044002838893, + "grad_norm": 0.08876846730709076, + "learning_rate": 8.032633073101491e-05, + "loss": 0.030627089738845825, + "step": 138660 + }, + { + "epoch": 19.68346344925479, + "grad_norm": 0.38616421818733215, + "learning_rate": 8.032491128459901e-05, + "loss": 0.01867861896753311, + "step": 138670 + }, + { + "epoch": 19.68488289567069, + "grad_norm": 12.174175262451172, + "learning_rate": 8.03234918381831e-05, + "loss": 0.04813571572303772, + "step": 138680 + }, + { + "epoch": 19.686302342086588, + "grad_norm": 0.02565019391477108, + "learning_rate": 8.032207239176722e-05, + "loss": 0.006767947971820831, + "step": 138690 + }, + { + "epoch": 19.687721788502483, + "grad_norm": 4.341635227203369, + "learning_rate": 8.032065294535131e-05, + "loss": 0.017228135466575624, + "step": 138700 + }, + { + "epoch": 19.68914123491838, + "grad_norm": 6.133458614349365, + "learning_rate": 8.031923349893543e-05, + "loss": 0.011636064946651458, + "step": 138710 + }, + { + "epoch": 19.69056068133428, + "grad_norm": 3.0059027671813965, + "learning_rate": 8.031781405251952e-05, + "loss": 0.012886139750480651, + "step": 138720 + }, + { + "epoch": 19.691980127750178, + "grad_norm": 2.7686121463775635, + "learning_rate": 8.031639460610363e-05, + "loss": 0.032272160053253174, + "step": 138730 + }, + { + "epoch": 19.693399574166076, + "grad_norm": 0.11860307306051254, + "learning_rate": 8.031497515968772e-05, + "loss": 0.02647608518600464, + "step": 138740 + }, + { + "epoch": 19.694819020581974, + "grad_norm": 0.8833091855049133, + "learning_rate": 8.031355571327183e-05, + "loss": 0.0020510002970695494, + "step": 138750 + }, + { + "epoch": 
19.696238466997873, + "grad_norm": 0.8521214127540588, + "learning_rate": 8.031213626685593e-05, + "loss": 0.009417210519313813, + "step": 138760 + }, + { + "epoch": 19.697657913413767, + "grad_norm": 4.539623260498047, + "learning_rate": 8.031071682044004e-05, + "loss": 0.00844470113515854, + "step": 138770 + }, + { + "epoch": 19.699077359829666, + "grad_norm": 0.4428519010543823, + "learning_rate": 8.030929737402413e-05, + "loss": 0.06552926301956177, + "step": 138780 + }, + { + "epoch": 19.700496806245564, + "grad_norm": 0.4736507534980774, + "learning_rate": 8.030787792760823e-05, + "loss": 0.028255379199981688, + "step": 138790 + }, + { + "epoch": 19.701916252661462, + "grad_norm": 0.004549229517579079, + "learning_rate": 8.030645848119234e-05, + "loss": 0.04177501499652862, + "step": 138800 + }, + { + "epoch": 19.70333569907736, + "grad_norm": 0.1801537275314331, + "learning_rate": 8.030503903477644e-05, + "loss": 0.02348633110523224, + "step": 138810 + }, + { + "epoch": 19.70475514549326, + "grad_norm": 10.447527885437012, + "learning_rate": 8.030361958836055e-05, + "loss": 0.029440933465957643, + "step": 138820 + }, + { + "epoch": 19.706174591909157, + "grad_norm": 14.18250846862793, + "learning_rate": 8.030220014194465e-05, + "loss": 0.028277039527893066, + "step": 138830 + }, + { + "epoch": 19.707594038325052, + "grad_norm": 0.3381829261779785, + "learning_rate": 8.030078069552875e-05, + "loss": 0.05422346591949463, + "step": 138840 + }, + { + "epoch": 19.70901348474095, + "grad_norm": 4.960470676422119, + "learning_rate": 8.029936124911284e-05, + "loss": 0.057622271776199344, + "step": 138850 + }, + { + "epoch": 19.71043293115685, + "grad_norm": 0.07115144282579422, + "learning_rate": 8.029794180269695e-05, + "loss": 0.035918551683425906, + "step": 138860 + }, + { + "epoch": 19.711852377572747, + "grad_norm": 2.7241153717041016, + "learning_rate": 8.029652235628105e-05, + "loss": 0.030051085352897643, + "step": 138870 + }, + { + "epoch": 
19.713271823988645, + "grad_norm": 0.11607307940721512, + "learning_rate": 8.029510290986516e-05, + "loss": 0.022767902910709382, + "step": 138880 + }, + { + "epoch": 19.714691270404543, + "grad_norm": 0.5954616665840149, + "learning_rate": 8.029368346344926e-05, + "loss": 0.024277564883232117, + "step": 138890 + }, + { + "epoch": 19.71611071682044, + "grad_norm": 0.011503017507493496, + "learning_rate": 8.029226401703336e-05, + "loss": 0.022690418362617492, + "step": 138900 + }, + { + "epoch": 19.717530163236336, + "grad_norm": 0.4360412061214447, + "learning_rate": 8.029084457061747e-05, + "loss": 0.014873498678207397, + "step": 138910 + }, + { + "epoch": 19.718949609652235, + "grad_norm": 10.609220504760742, + "learning_rate": 8.028942512420157e-05, + "loss": 0.02726553976535797, + "step": 138920 + }, + { + "epoch": 19.720369056068133, + "grad_norm": 10.142011642456055, + "learning_rate": 8.028800567778568e-05, + "loss": 0.032623404264450075, + "step": 138930 + }, + { + "epoch": 19.72178850248403, + "grad_norm": 0.6974702477455139, + "learning_rate": 8.028658623136976e-05, + "loss": 0.003662829473614693, + "step": 138940 + }, + { + "epoch": 19.72320794889993, + "grad_norm": 2.2197811603546143, + "learning_rate": 8.028516678495387e-05, + "loss": 0.02944878935813904, + "step": 138950 + }, + { + "epoch": 19.724627395315828, + "grad_norm": 0.06566993147134781, + "learning_rate": 8.028374733853797e-05, + "loss": 0.006655294448137283, + "step": 138960 + }, + { + "epoch": 19.726046841731726, + "grad_norm": 0.04445694759488106, + "learning_rate": 8.028232789212208e-05, + "loss": 0.0028432216495275497, + "step": 138970 + }, + { + "epoch": 19.72746628814762, + "grad_norm": 0.1260414719581604, + "learning_rate": 8.028090844570618e-05, + "loss": 0.005070878565311432, + "step": 138980 + }, + { + "epoch": 19.72888573456352, + "grad_norm": 0.020465118810534477, + "learning_rate": 8.027948899929027e-05, + "loss": 0.019085513055324556, + "step": 138990 + }, + { + "epoch": 
19.730305180979418, + "grad_norm": 8.512038230895996, + "learning_rate": 8.027806955287438e-05, + "loss": 0.04188350141048432, + "step": 139000 + }, + { + "epoch": 19.730305180979418, + "eval_accuracy": 0.9787626375023845, + "eval_loss": 0.07654014974832535, + "eval_runtime": 33.0569, + "eval_samples_per_second": 475.756, + "eval_steps_per_second": 14.883, + "step": 139000 + }, + { + "epoch": 19.731724627395316, + "grad_norm": 0.19638408720493317, + "learning_rate": 8.027665010645848e-05, + "loss": 0.09535208940505982, + "step": 139010 + }, + { + "epoch": 19.733144073811214, + "grad_norm": 10.96106243133545, + "learning_rate": 8.027523066004259e-05, + "loss": 0.017398083209991456, + "step": 139020 + }, + { + "epoch": 19.734563520227113, + "grad_norm": 4.2082133293151855, + "learning_rate": 8.027381121362669e-05, + "loss": 0.006317382305860519, + "step": 139030 + }, + { + "epoch": 19.73598296664301, + "grad_norm": 4.548252582550049, + "learning_rate": 8.027239176721079e-05, + "loss": 0.02355315685272217, + "step": 139040 + }, + { + "epoch": 19.737402413058906, + "grad_norm": 3.755049228668213, + "learning_rate": 8.027097232079489e-05, + "loss": 0.018561244010925293, + "step": 139050 + }, + { + "epoch": 19.738821859474804, + "grad_norm": 17.563447952270508, + "learning_rate": 8.0269552874379e-05, + "loss": 0.00837668552994728, + "step": 139060 + }, + { + "epoch": 19.740241305890702, + "grad_norm": 0.0856865718960762, + "learning_rate": 8.02681334279631e-05, + "loss": 0.01875850260257721, + "step": 139070 + }, + { + "epoch": 19.7416607523066, + "grad_norm": 1.0457096099853516, + "learning_rate": 8.02667139815472e-05, + "loss": 0.006809623539447784, + "step": 139080 + }, + { + "epoch": 19.7430801987225, + "grad_norm": 0.6450104117393494, + "learning_rate": 8.02652945351313e-05, + "loss": 0.010183577239513398, + "step": 139090 + }, + { + "epoch": 19.744499645138397, + "grad_norm": 1.8897494077682495, + "learning_rate": 8.02638750887154e-05, + "loss": 
0.016523340344429018, + "step": 139100 + }, + { + "epoch": 19.745919091554295, + "grad_norm": 7.488572120666504, + "learning_rate": 8.026245564229951e-05, + "loss": 0.016927893459796905, + "step": 139110 + }, + { + "epoch": 19.74733853797019, + "grad_norm": 0.09959287941455841, + "learning_rate": 8.026103619588361e-05, + "loss": 0.004210712760686875, + "step": 139120 + }, + { + "epoch": 19.74875798438609, + "grad_norm": 0.032705750316381454, + "learning_rate": 8.025961674946772e-05, + "loss": 0.005322665721178055, + "step": 139130 + }, + { + "epoch": 19.750177430801987, + "grad_norm": 1.9483076333999634, + "learning_rate": 8.025819730305182e-05, + "loss": 0.013049685955047607, + "step": 139140 + }, + { + "epoch": 19.751596877217885, + "grad_norm": 0.4516810476779938, + "learning_rate": 8.025677785663591e-05, + "loss": 0.010224513709545135, + "step": 139150 + }, + { + "epoch": 19.753016323633783, + "grad_norm": 0.03128146752715111, + "learning_rate": 8.025535841022001e-05, + "loss": 0.04900606870651245, + "step": 139160 + }, + { + "epoch": 19.75443577004968, + "grad_norm": 1.1649566888809204, + "learning_rate": 8.025393896380412e-05, + "loss": 0.01893424540758133, + "step": 139170 + }, + { + "epoch": 19.75585521646558, + "grad_norm": 0.4992629587650299, + "learning_rate": 8.025251951738822e-05, + "loss": 0.029203197360038756, + "step": 139180 + }, + { + "epoch": 19.757274662881475, + "grad_norm": 7.769445896148682, + "learning_rate": 8.025110007097233e-05, + "loss": 0.030580288171768187, + "step": 139190 + }, + { + "epoch": 19.758694109297373, + "grad_norm": 0.7296946048736572, + "learning_rate": 8.024968062455643e-05, + "loss": 0.03568483293056488, + "step": 139200 + }, + { + "epoch": 19.76011355571327, + "grad_norm": 0.1032835841178894, + "learning_rate": 8.024826117814052e-05, + "loss": 0.029664459824562072, + "step": 139210 + }, + { + "epoch": 19.76153300212917, + "grad_norm": 0.4114496409893036, + "learning_rate": 8.024684173172464e-05, + "loss": 
0.05616902112960816, + "step": 139220 + }, + { + "epoch": 19.762952448545068, + "grad_norm": 0.25582075119018555, + "learning_rate": 8.024542228530873e-05, + "loss": 0.03344759047031402, + "step": 139230 + }, + { + "epoch": 19.764371894960966, + "grad_norm": 8.181188583374023, + "learning_rate": 8.024400283889284e-05, + "loss": 0.04690394699573517, + "step": 139240 + }, + { + "epoch": 19.765791341376865, + "grad_norm": 0.21906448900699615, + "learning_rate": 8.024258339247693e-05, + "loss": 0.011242108047008514, + "step": 139250 + }, + { + "epoch": 19.76721078779276, + "grad_norm": 1.6838641166687012, + "learning_rate": 8.024116394606104e-05, + "loss": 0.005934418737888336, + "step": 139260 + }, + { + "epoch": 19.768630234208658, + "grad_norm": 0.09939505159854889, + "learning_rate": 8.023974449964514e-05, + "loss": 0.0072611324489116665, + "step": 139270 + }, + { + "epoch": 19.770049680624556, + "grad_norm": 10.888344764709473, + "learning_rate": 8.023832505322925e-05, + "loss": 0.03970246315002442, + "step": 139280 + }, + { + "epoch": 19.771469127040454, + "grad_norm": 0.010829064063727856, + "learning_rate": 8.023690560681334e-05, + "loss": 0.006960421800613403, + "step": 139290 + }, + { + "epoch": 19.772888573456353, + "grad_norm": 8.759506225585938, + "learning_rate": 8.023548616039744e-05, + "loss": 0.02476315051317215, + "step": 139300 + }, + { + "epoch": 19.77430801987225, + "grad_norm": 0.019028963521122932, + "learning_rate": 8.023406671398155e-05, + "loss": 0.08776750564575195, + "step": 139310 + }, + { + "epoch": 19.77572746628815, + "grad_norm": 3.19905948638916, + "learning_rate": 8.023264726756565e-05, + "loss": 0.022287966310977937, + "step": 139320 + }, + { + "epoch": 19.777146912704044, + "grad_norm": 0.04139919579029083, + "learning_rate": 8.023122782114976e-05, + "loss": 0.0035886283963918685, + "step": 139330 + }, + { + "epoch": 19.778566359119942, + "grad_norm": 0.14054623246192932, + "learning_rate": 8.022980837473386e-05, + "loss": 
0.017585280537605285, + "step": 139340 + }, + { + "epoch": 19.77998580553584, + "grad_norm": 1.868897557258606, + "learning_rate": 8.022838892831796e-05, + "loss": 0.012134405225515366, + "step": 139350 + }, + { + "epoch": 19.78140525195174, + "grad_norm": 0.010445545427501202, + "learning_rate": 8.022696948190205e-05, + "loss": 0.010198205709457397, + "step": 139360 + }, + { + "epoch": 19.782824698367637, + "grad_norm": 0.0983772873878479, + "learning_rate": 8.022555003548616e-05, + "loss": 0.03380552530288696, + "step": 139370 + }, + { + "epoch": 19.784244144783536, + "grad_norm": 0.01494747307151556, + "learning_rate": 8.022413058907026e-05, + "loss": 0.018215297162532805, + "step": 139380 + }, + { + "epoch": 19.785663591199434, + "grad_norm": 0.041560154408216476, + "learning_rate": 8.022271114265437e-05, + "loss": 0.014817462861537933, + "step": 139390 + }, + { + "epoch": 19.78708303761533, + "grad_norm": 0.37853920459747314, + "learning_rate": 8.022129169623847e-05, + "loss": 0.03316742777824402, + "step": 139400 + }, + { + "epoch": 19.788502484031227, + "grad_norm": 0.37998664379119873, + "learning_rate": 8.021987224982257e-05, + "loss": 0.003211744502186775, + "step": 139410 + }, + { + "epoch": 19.789921930447125, + "grad_norm": 1.9321067333221436, + "learning_rate": 8.021845280340668e-05, + "loss": 0.011175059527158738, + "step": 139420 + }, + { + "epoch": 19.791341376863024, + "grad_norm": 11.715725898742676, + "learning_rate": 8.021703335699078e-05, + "loss": 0.01863114982843399, + "step": 139430 + }, + { + "epoch": 19.792760823278922, + "grad_norm": 0.04460087791085243, + "learning_rate": 8.021561391057489e-05, + "loss": 0.013021458685398103, + "step": 139440 + }, + { + "epoch": 19.79418026969482, + "grad_norm": 0.5963287949562073, + "learning_rate": 8.021419446415898e-05, + "loss": 0.011012168228626251, + "step": 139450 + }, + { + "epoch": 19.79559971611072, + "grad_norm": 0.29790475964546204, + "learning_rate": 8.021277501774308e-05, + "loss": 
0.020087811350822448, + "step": 139460 + }, + { + "epoch": 19.797019162526613, + "grad_norm": 0.044911980628967285, + "learning_rate": 8.021135557132718e-05, + "loss": 0.019449099898338318, + "step": 139470 + }, + { + "epoch": 19.79843860894251, + "grad_norm": 1.8258098363876343, + "learning_rate": 8.020993612491129e-05, + "loss": 0.0049147456884384155, + "step": 139480 + }, + { + "epoch": 19.79985805535841, + "grad_norm": 1.326615810394287, + "learning_rate": 8.02085166784954e-05, + "loss": 0.03728066086769104, + "step": 139490 + }, + { + "epoch": 19.801277501774308, + "grad_norm": 4.393553256988525, + "learning_rate": 8.02070972320795e-05, + "loss": 0.01763288676738739, + "step": 139500 + }, + { + "epoch": 19.801277501774308, + "eval_accuracy": 0.9856298086093979, + "eval_loss": 0.0544835589826107, + "eval_runtime": 33.1607, + "eval_samples_per_second": 474.267, + "eval_steps_per_second": 14.837, + "step": 139500 + }, + { + "epoch": 19.802696948190206, + "grad_norm": 0.11989425122737885, + "learning_rate": 8.02056777856636e-05, + "loss": 0.01877267360687256, + "step": 139510 + }, + { + "epoch": 19.804116394606105, + "grad_norm": 0.060171280056238174, + "learning_rate": 8.020425833924769e-05, + "loss": 0.00610172413289547, + "step": 139520 + }, + { + "epoch": 19.805535841022003, + "grad_norm": 0.6509189605712891, + "learning_rate": 8.02028388928318e-05, + "loss": 0.021013137698173524, + "step": 139530 + }, + { + "epoch": 19.806955287437898, + "grad_norm": 0.0794127956032753, + "learning_rate": 8.02014194464159e-05, + "loss": 0.011158202588558198, + "step": 139540 + }, + { + "epoch": 19.808374733853796, + "grad_norm": 0.05593372881412506, + "learning_rate": 8.020000000000001e-05, + "loss": 0.02410032004117966, + "step": 139550 + }, + { + "epoch": 19.809794180269694, + "grad_norm": 2.906390905380249, + "learning_rate": 8.01985805535841e-05, + "loss": 0.010091563314199447, + "step": 139560 + }, + { + "epoch": 19.811213626685593, + "grad_norm": 0.03298679366707802, + 
"learning_rate": 8.01971611071682e-05, + "loss": 0.01290801763534546, + "step": 139570 + }, + { + "epoch": 19.81263307310149, + "grad_norm": 0.02637336403131485, + "learning_rate": 8.019574166075232e-05, + "loss": 0.021950289607048035, + "step": 139580 + }, + { + "epoch": 19.81405251951739, + "grad_norm": 0.021645430475473404, + "learning_rate": 8.019432221433641e-05, + "loss": 0.02902255952358246, + "step": 139590 + }, + { + "epoch": 19.815471965933288, + "grad_norm": 0.014755524694919586, + "learning_rate": 8.019290276792053e-05, + "loss": 0.004876154288649559, + "step": 139600 + }, + { + "epoch": 19.816891412349182, + "grad_norm": 0.19308185577392578, + "learning_rate": 8.019148332150461e-05, + "loss": 0.002289852499961853, + "step": 139610 + }, + { + "epoch": 19.81831085876508, + "grad_norm": 3.7021358013153076, + "learning_rate": 8.019006387508872e-05, + "loss": 0.021032847464084625, + "step": 139620 + }, + { + "epoch": 19.81973030518098, + "grad_norm": 9.473984718322754, + "learning_rate": 8.018864442867282e-05, + "loss": 0.008704672008752823, + "step": 139630 + }, + { + "epoch": 19.821149751596877, + "grad_norm": 0.630850613117218, + "learning_rate": 8.018722498225693e-05, + "loss": 0.016734354197978973, + "step": 139640 + }, + { + "epoch": 19.822569198012776, + "grad_norm": 0.07666724175214767, + "learning_rate": 8.018580553584103e-05, + "loss": 0.004861601814627648, + "step": 139650 + }, + { + "epoch": 19.823988644428674, + "grad_norm": 4.326195240020752, + "learning_rate": 8.018438608942512e-05, + "loss": 0.00682557076215744, + "step": 139660 + }, + { + "epoch": 19.825408090844572, + "grad_norm": 0.04820561781525612, + "learning_rate": 8.018296664300923e-05, + "loss": 0.007097174227237701, + "step": 139670 + }, + { + "epoch": 19.826827537260467, + "grad_norm": 12.860306739807129, + "learning_rate": 8.018154719659333e-05, + "loss": 0.02446320354938507, + "step": 139680 + }, + { + "epoch": 19.828246983676365, + "grad_norm": 0.4204403758049011, + 
"learning_rate": 8.018012775017744e-05, + "loss": 0.017366963624954223, + "step": 139690 + }, + { + "epoch": 19.829666430092264, + "grad_norm": 0.0063424259424209595, + "learning_rate": 8.017870830376154e-05, + "loss": 0.026230162382125853, + "step": 139700 + }, + { + "epoch": 19.831085876508162, + "grad_norm": 1.6799793243408203, + "learning_rate": 8.017728885734564e-05, + "loss": 0.004461310058832169, + "step": 139710 + }, + { + "epoch": 19.83250532292406, + "grad_norm": 4.775684833526611, + "learning_rate": 8.017586941092973e-05, + "loss": 0.011769990622997283, + "step": 139720 + }, + { + "epoch": 19.83392476933996, + "grad_norm": 0.06816478073596954, + "learning_rate": 8.017444996451385e-05, + "loss": 0.01424238234758377, + "step": 139730 + }, + { + "epoch": 19.835344215755857, + "grad_norm": 3.2718963623046875, + "learning_rate": 8.017303051809794e-05, + "loss": 0.0027877304702997206, + "step": 139740 + }, + { + "epoch": 19.83676366217175, + "grad_norm": 10.610093116760254, + "learning_rate": 8.017161107168205e-05, + "loss": 0.01973050832748413, + "step": 139750 + }, + { + "epoch": 19.83818310858765, + "grad_norm": 13.073330879211426, + "learning_rate": 8.017019162526615e-05, + "loss": 0.028023535013198854, + "step": 139760 + }, + { + "epoch": 19.839602555003548, + "grad_norm": 17.038087844848633, + "learning_rate": 8.016877217885025e-05, + "loss": 0.03127200305461884, + "step": 139770 + }, + { + "epoch": 19.841022001419446, + "grad_norm": 11.913118362426758, + "learning_rate": 8.016735273243436e-05, + "loss": 0.016964422166347505, + "step": 139780 + }, + { + "epoch": 19.842441447835345, + "grad_norm": 0.44638150930404663, + "learning_rate": 8.016593328601846e-05, + "loss": 0.01624404489994049, + "step": 139790 + }, + { + "epoch": 19.843860894251243, + "grad_norm": 0.12529753148555756, + "learning_rate": 8.016451383960257e-05, + "loss": 0.05540893077850342, + "step": 139800 + }, + { + "epoch": 19.84528034066714, + "grad_norm": 0.14248676598072052, + 
"learning_rate": 8.016309439318667e-05, + "loss": 0.011490736156702042, + "step": 139810 + }, + { + "epoch": 19.846699787083036, + "grad_norm": 3.391721725463867, + "learning_rate": 8.016167494677076e-05, + "loss": 0.013984756171703338, + "step": 139820 + }, + { + "epoch": 19.848119233498934, + "grad_norm": 2.265918731689453, + "learning_rate": 8.016025550035486e-05, + "loss": 0.03134198486804962, + "step": 139830 + }, + { + "epoch": 19.849538679914833, + "grad_norm": 0.15010884404182434, + "learning_rate": 8.015883605393897e-05, + "loss": 0.0080152228474617, + "step": 139840 + }, + { + "epoch": 19.85095812633073, + "grad_norm": 0.5058720707893372, + "learning_rate": 8.015741660752307e-05, + "loss": 0.0023042641580104826, + "step": 139850 + }, + { + "epoch": 19.85237757274663, + "grad_norm": 0.3317244052886963, + "learning_rate": 8.015599716110718e-05, + "loss": 0.0036775771528482436, + "step": 139860 + }, + { + "epoch": 19.853797019162528, + "grad_norm": 5.731398582458496, + "learning_rate": 8.015457771469128e-05, + "loss": 0.0798199474811554, + "step": 139870 + }, + { + "epoch": 19.855216465578426, + "grad_norm": 1.4253054857254028, + "learning_rate": 8.015315826827537e-05, + "loss": 0.04291553795337677, + "step": 139880 + }, + { + "epoch": 19.85663591199432, + "grad_norm": 3.632465124130249, + "learning_rate": 8.015173882185948e-05, + "loss": 0.060555887222290036, + "step": 139890 + }, + { + "epoch": 19.85805535841022, + "grad_norm": 0.17125140130519867, + "learning_rate": 8.015031937544358e-05, + "loss": 0.04922077655792236, + "step": 139900 + }, + { + "epoch": 19.859474804826117, + "grad_norm": 3.8708529472351074, + "learning_rate": 8.014889992902769e-05, + "loss": 0.043092742562294006, + "step": 139910 + }, + { + "epoch": 19.860894251242016, + "grad_norm": 0.012349608354270458, + "learning_rate": 8.014748048261178e-05, + "loss": 0.004922491312026977, + "step": 139920 + }, + { + "epoch": 19.862313697657914, + "grad_norm": 0.11534589529037476, + 
"learning_rate": 8.014606103619589e-05, + "loss": 0.047697001695632936, + "step": 139930 + }, + { + "epoch": 19.863733144073812, + "grad_norm": 0.23999154567718506, + "learning_rate": 8.014464158977999e-05, + "loss": 0.019572360813617705, + "step": 139940 + }, + { + "epoch": 19.86515259048971, + "grad_norm": 10.510241508483887, + "learning_rate": 8.01432221433641e-05, + "loss": 0.019799953699111937, + "step": 139950 + }, + { + "epoch": 19.866572036905605, + "grad_norm": 0.5493953227996826, + "learning_rate": 8.01418026969482e-05, + "loss": 0.0007798772305250168, + "step": 139960 + }, + { + "epoch": 19.867991483321504, + "grad_norm": 9.87894058227539, + "learning_rate": 8.014038325053229e-05, + "loss": 0.027184116840362548, + "step": 139970 + }, + { + "epoch": 19.869410929737402, + "grad_norm": 2.0167622566223145, + "learning_rate": 8.01389638041164e-05, + "loss": 0.031891757249832155, + "step": 139980 + }, + { + "epoch": 19.8708303761533, + "grad_norm": 4.695428848266602, + "learning_rate": 8.01375443577005e-05, + "loss": 0.022933872044086458, + "step": 139990 + }, + { + "epoch": 19.8722498225692, + "grad_norm": 0.11179798096418381, + "learning_rate": 8.013612491128461e-05, + "loss": 0.03894450962543487, + "step": 140000 + }, + { + "epoch": 19.8722498225692, + "eval_accuracy": 0.9813060342086857, + "eval_loss": 0.07770732045173645, + "eval_runtime": 32.8225, + "eval_samples_per_second": 479.152, + "eval_steps_per_second": 14.99, + "step": 140000 + }, + { + "epoch": 19.873669268985097, + "grad_norm": 0.02420716919004917, + "learning_rate": 8.013470546486871e-05, + "loss": 0.05574930906295776, + "step": 140010 + }, + { + "epoch": 19.875088715400995, + "grad_norm": 0.048414770513772964, + "learning_rate": 8.01332860184528e-05, + "loss": 0.005164441466331482, + "step": 140020 + }, + { + "epoch": 19.87650816181689, + "grad_norm": 0.04100466147065163, + "learning_rate": 8.01318665720369e-05, + "loss": 0.044208526611328125, + "step": 140030 + }, + { + "epoch": 
19.87792760823279, + "grad_norm": 0.20554162561893463, + "learning_rate": 8.013044712562101e-05, + "loss": 0.010324726998805999, + "step": 140040 + }, + { + "epoch": 19.879347054648687, + "grad_norm": 0.28824618458747864, + "learning_rate": 8.012902767920511e-05, + "loss": 0.00433928295969963, + "step": 140050 + }, + { + "epoch": 19.880766501064585, + "grad_norm": 0.4394190311431885, + "learning_rate": 8.012760823278922e-05, + "loss": 0.016210195422172547, + "step": 140060 + }, + { + "epoch": 19.882185947480483, + "grad_norm": 0.5106674432754517, + "learning_rate": 8.012618878637332e-05, + "loss": 0.019017013907432555, + "step": 140070 + }, + { + "epoch": 19.88360539389638, + "grad_norm": 0.030603084713220596, + "learning_rate": 8.012476933995742e-05, + "loss": 0.02650708854198456, + "step": 140080 + }, + { + "epoch": 19.88502484031228, + "grad_norm": 0.5879043936729431, + "learning_rate": 8.012334989354153e-05, + "loss": 0.01849451959133148, + "step": 140090 + }, + { + "epoch": 19.886444286728175, + "grad_norm": 0.05266867205500603, + "learning_rate": 8.012193044712562e-05, + "loss": 0.02872341573238373, + "step": 140100 + }, + { + "epoch": 19.887863733144073, + "grad_norm": 0.01276461873203516, + "learning_rate": 8.012051100070974e-05, + "loss": 0.012407783418893814, + "step": 140110 + }, + { + "epoch": 19.88928317955997, + "grad_norm": 0.1270163506269455, + "learning_rate": 8.011909155429383e-05, + "loss": 0.00402568094432354, + "step": 140120 + }, + { + "epoch": 19.89070262597587, + "grad_norm": 0.021485812962055206, + "learning_rate": 8.011767210787793e-05, + "loss": 0.01000293791294098, + "step": 140130 + }, + { + "epoch": 19.892122072391768, + "grad_norm": 0.07036632299423218, + "learning_rate": 8.011625266146203e-05, + "loss": 0.005553951486945152, + "step": 140140 + }, + { + "epoch": 19.893541518807666, + "grad_norm": 0.3364790976047516, + "learning_rate": 8.011483321504614e-05, + "loss": 0.005696587264537811, + "step": 140150 + }, + { + "epoch": 
19.894960965223564, + "grad_norm": 8.817188262939453, + "learning_rate": 8.011341376863024e-05, + "loss": 0.03514130413532257, + "step": 140160 + }, + { + "epoch": 19.89638041163946, + "grad_norm": 7.810001373291016, + "learning_rate": 8.011199432221435e-05, + "loss": 0.027446284890174866, + "step": 140170 + }, + { + "epoch": 19.897799858055357, + "grad_norm": 7.393718242645264, + "learning_rate": 8.011057487579844e-05, + "loss": 0.04557895958423615, + "step": 140180 + }, + { + "epoch": 19.899219304471256, + "grad_norm": 4.390380859375, + "learning_rate": 8.010915542938254e-05, + "loss": 0.013040535151958466, + "step": 140190 + }, + { + "epoch": 19.900638750887154, + "grad_norm": 0.17087222635746002, + "learning_rate": 8.010773598296665e-05, + "loss": 0.052542030811309814, + "step": 140200 + }, + { + "epoch": 19.902058197303052, + "grad_norm": 0.10569733381271362, + "learning_rate": 8.010631653655075e-05, + "loss": 0.00950702428817749, + "step": 140210 + }, + { + "epoch": 19.90347764371895, + "grad_norm": 0.4821464717388153, + "learning_rate": 8.010489709013486e-05, + "loss": 0.006985366344451904, + "step": 140220 + }, + { + "epoch": 19.90489709013485, + "grad_norm": 1.4368162155151367, + "learning_rate": 8.010347764371894e-05, + "loss": 0.048772335052490234, + "step": 140230 + }, + { + "epoch": 19.906316536550744, + "grad_norm": 6.197601318359375, + "learning_rate": 8.010205819730306e-05, + "loss": 0.04889421761035919, + "step": 140240 + }, + { + "epoch": 19.907735982966642, + "grad_norm": 3.209953546524048, + "learning_rate": 8.010063875088715e-05, + "loss": 0.005126167461276055, + "step": 140250 + }, + { + "epoch": 19.90915542938254, + "grad_norm": 0.1128145158290863, + "learning_rate": 8.009921930447126e-05, + "loss": 0.003893549740314484, + "step": 140260 + }, + { + "epoch": 19.91057487579844, + "grad_norm": 0.0572998970746994, + "learning_rate": 8.009779985805536e-05, + "loss": 0.022830471396446228, + "step": 140270 + }, + { + "epoch": 19.911994322214337, + 
"grad_norm": 0.1309823989868164, + "learning_rate": 8.009638041163946e-05, + "loss": 0.0027685169130563735, + "step": 140280 + }, + { + "epoch": 19.913413768630235, + "grad_norm": 0.9516026973724365, + "learning_rate": 8.009496096522357e-05, + "loss": 0.020607098937034607, + "step": 140290 + }, + { + "epoch": 19.914833215046134, + "grad_norm": 0.10789740830659866, + "learning_rate": 8.009354151880767e-05, + "loss": 0.007872572541236878, + "step": 140300 + }, + { + "epoch": 19.91625266146203, + "grad_norm": 0.21346351504325867, + "learning_rate": 8.009212207239178e-05, + "loss": 0.024496954679489136, + "step": 140310 + }, + { + "epoch": 19.917672107877927, + "grad_norm": 0.15668897330760956, + "learning_rate": 8.009070262597588e-05, + "loss": 0.006305134296417237, + "step": 140320 + }, + { + "epoch": 19.919091554293825, + "grad_norm": 14.24783706665039, + "learning_rate": 8.008928317955997e-05, + "loss": 0.03388771116733551, + "step": 140330 + }, + { + "epoch": 19.920511000709723, + "grad_norm": 0.8930047154426575, + "learning_rate": 8.008786373314407e-05, + "loss": 0.014166541397571564, + "step": 140340 + }, + { + "epoch": 19.92193044712562, + "grad_norm": 8.249771118164062, + "learning_rate": 8.008644428672818e-05, + "loss": 0.028839975595474243, + "step": 140350 + }, + { + "epoch": 19.92334989354152, + "grad_norm": 4.8222880363464355, + "learning_rate": 8.008502484031228e-05, + "loss": 0.043366655707359314, + "step": 140360 + }, + { + "epoch": 19.924769339957418, + "grad_norm": 2.358502149581909, + "learning_rate": 8.008360539389639e-05, + "loss": 0.05386812090873718, + "step": 140370 + }, + { + "epoch": 19.926188786373313, + "grad_norm": 1.2944918870925903, + "learning_rate": 8.008218594748049e-05, + "loss": 0.08023052811622619, + "step": 140380 + }, + { + "epoch": 19.92760823278921, + "grad_norm": 1.1863116025924683, + "learning_rate": 8.008076650106458e-05, + "loss": 0.07636402249336242, + "step": 140390 + }, + { + "epoch": 19.92902767920511, + "grad_norm": 
3.6277599334716797, + "learning_rate": 8.00793470546487e-05, + "loss": 0.018394359946250917, + "step": 140400 + }, + { + "epoch": 19.930447125621008, + "grad_norm": 5.336276054382324, + "learning_rate": 8.007792760823279e-05, + "loss": 0.052164393663406375, + "step": 140410 + }, + { + "epoch": 19.931866572036906, + "grad_norm": 0.47161370515823364, + "learning_rate": 8.00765081618169e-05, + "loss": 0.015585443377494812, + "step": 140420 + }, + { + "epoch": 19.933286018452804, + "grad_norm": 2.699232339859009, + "learning_rate": 8.007508871540099e-05, + "loss": 0.06586329936981201, + "step": 140430 + }, + { + "epoch": 19.934705464868703, + "grad_norm": 7.184756278991699, + "learning_rate": 8.00736692689851e-05, + "loss": 0.06342093348503113, + "step": 140440 + }, + { + "epoch": 19.936124911284598, + "grad_norm": 0.12634967267513275, + "learning_rate": 8.00722498225692e-05, + "loss": 0.008061951398849488, + "step": 140450 + }, + { + "epoch": 19.937544357700496, + "grad_norm": 6.7007269859313965, + "learning_rate": 8.00708303761533e-05, + "loss": 0.022827640175819397, + "step": 140460 + }, + { + "epoch": 19.938963804116394, + "grad_norm": 0.08524321019649506, + "learning_rate": 8.00694109297374e-05, + "loss": 0.021901366114616395, + "step": 140470 + }, + { + "epoch": 19.940383250532292, + "grad_norm": 4.822833061218262, + "learning_rate": 8.006799148332151e-05, + "loss": 0.018462111055850983, + "step": 140480 + }, + { + "epoch": 19.94180269694819, + "grad_norm": 1.6026697158813477, + "learning_rate": 8.006657203690561e-05, + "loss": 0.02011823356151581, + "step": 140490 + }, + { + "epoch": 19.94322214336409, + "grad_norm": 0.2379651665687561, + "learning_rate": 8.006515259048971e-05, + "loss": 0.015809541940689086, + "step": 140500 + }, + { + "epoch": 19.94322214336409, + "eval_accuracy": 0.9840401856679596, + "eval_loss": 0.05191691592335701, + "eval_runtime": 32.7801, + "eval_samples_per_second": 479.773, + "eval_steps_per_second": 15.009, + "step": 140500 + }, + { 
+ "epoch": 19.944641589779987, + "grad_norm": 0.455178827047348, + "learning_rate": 8.006373314407382e-05, + "loss": 0.04101063311100006, + "step": 140510 + }, + { + "epoch": 19.946061036195882, + "grad_norm": 1.712772011756897, + "learning_rate": 8.006231369765792e-05, + "loss": 0.02889895737171173, + "step": 140520 + }, + { + "epoch": 19.94748048261178, + "grad_norm": 0.1457989364862442, + "learning_rate": 8.006089425124203e-05, + "loss": 0.019174237549304963, + "step": 140530 + }, + { + "epoch": 19.94889992902768, + "grad_norm": 0.6908969879150391, + "learning_rate": 8.005947480482611e-05, + "loss": 0.015175694227218628, + "step": 140540 + }, + { + "epoch": 19.950319375443577, + "grad_norm": 0.05797145515680313, + "learning_rate": 8.005805535841022e-05, + "loss": 0.034023651480674745, + "step": 140550 + }, + { + "epoch": 19.951738821859475, + "grad_norm": 4.641683578491211, + "learning_rate": 8.005663591199432e-05, + "loss": 0.031184056401252748, + "step": 140560 + }, + { + "epoch": 19.953158268275374, + "grad_norm": 0.24377094209194183, + "learning_rate": 8.005521646557843e-05, + "loss": 0.020593273639678954, + "step": 140570 + }, + { + "epoch": 19.954577714691272, + "grad_norm": 0.2801949977874756, + "learning_rate": 8.005379701916253e-05, + "loss": 0.012578007578849793, + "step": 140580 + }, + { + "epoch": 19.955997161107167, + "grad_norm": 0.00720011442899704, + "learning_rate": 8.005237757274663e-05, + "loss": 0.05097510814666748, + "step": 140590 + }, + { + "epoch": 19.957416607523065, + "grad_norm": 3.5743913650512695, + "learning_rate": 8.005095812633074e-05, + "loss": 0.015194380283355713, + "step": 140600 + }, + { + "epoch": 19.958836053938963, + "grad_norm": 11.544880867004395, + "learning_rate": 8.004953867991483e-05, + "loss": 0.011013476550579071, + "step": 140610 + }, + { + "epoch": 19.96025550035486, + "grad_norm": 1.3924674987792969, + "learning_rate": 8.004811923349895e-05, + "loss": 0.02060462683439255, + "step": 140620 + }, + { + "epoch": 
19.96167494677076, + "grad_norm": 0.5600424408912659, + "learning_rate": 8.004669978708304e-05, + "loss": 0.0067936301231384276, + "step": 140630 + }, + { + "epoch": 19.96309439318666, + "grad_norm": 2.087996244430542, + "learning_rate": 8.004528034066714e-05, + "loss": 0.004950342327356338, + "step": 140640 + }, + { + "epoch": 19.964513839602557, + "grad_norm": 0.007394047453999519, + "learning_rate": 8.004386089425124e-05, + "loss": 0.005445841327309608, + "step": 140650 + }, + { + "epoch": 19.96593328601845, + "grad_norm": 7.113765716552734, + "learning_rate": 8.004244144783535e-05, + "loss": 0.008016441762447358, + "step": 140660 + }, + { + "epoch": 19.96735273243435, + "grad_norm": 0.10857373476028442, + "learning_rate": 8.004102200141945e-05, + "loss": 0.003368879854679108, + "step": 140670 + }, + { + "epoch": 19.968772178850248, + "grad_norm": 0.370307981967926, + "learning_rate": 8.003960255500356e-05, + "loss": 0.014521026611328125, + "step": 140680 + }, + { + "epoch": 19.970191625266146, + "grad_norm": 0.025977713987231255, + "learning_rate": 8.003818310858765e-05, + "loss": 0.014565077424049378, + "step": 140690 + }, + { + "epoch": 19.971611071682045, + "grad_norm": 0.11662080883979797, + "learning_rate": 8.003676366217175e-05, + "loss": 0.01800130307674408, + "step": 140700 + }, + { + "epoch": 19.973030518097943, + "grad_norm": 5.711389541625977, + "learning_rate": 8.003534421575586e-05, + "loss": 0.0410052627325058, + "step": 140710 + }, + { + "epoch": 19.97444996451384, + "grad_norm": 0.013483582064509392, + "learning_rate": 8.003392476933996e-05, + "loss": 0.009572294354438782, + "step": 140720 + }, + { + "epoch": 19.975869410929736, + "grad_norm": 17.010944366455078, + "learning_rate": 8.003250532292407e-05, + "loss": 0.02607950270175934, + "step": 140730 + }, + { + "epoch": 19.977288857345634, + "grad_norm": 0.10605667531490326, + "learning_rate": 8.003108587650815e-05, + "loss": 0.007246570289134979, + "step": 140740 + }, + { + "epoch": 
19.978708303761533, + "grad_norm": 7.407968997955322, + "learning_rate": 8.002966643009227e-05, + "loss": 0.011455393582582473, + "step": 140750 + }, + { + "epoch": 19.98012775017743, + "grad_norm": 0.04318075627088547, + "learning_rate": 8.002824698367636e-05, + "loss": 0.014967672526836395, + "step": 140760 + }, + { + "epoch": 19.98154719659333, + "grad_norm": 0.34878456592559814, + "learning_rate": 8.002682753726047e-05, + "loss": 0.017500746250152587, + "step": 140770 + }, + { + "epoch": 19.982966643009227, + "grad_norm": 4.813690662384033, + "learning_rate": 8.002540809084457e-05, + "loss": 0.010691556334495544, + "step": 140780 + }, + { + "epoch": 19.984386089425126, + "grad_norm": 0.8639642000198364, + "learning_rate": 8.002398864442867e-05, + "loss": 0.0032157417386770248, + "step": 140790 + }, + { + "epoch": 19.98580553584102, + "grad_norm": 0.014026980847120285, + "learning_rate": 8.002256919801278e-05, + "loss": 0.015287537872791291, + "step": 140800 + }, + { + "epoch": 19.98722498225692, + "grad_norm": 0.1736358255147934, + "learning_rate": 8.002114975159688e-05, + "loss": 0.049898722767829896, + "step": 140810 + }, + { + "epoch": 19.988644428672817, + "grad_norm": 0.01327612716704607, + "learning_rate": 8.001973030518099e-05, + "loss": 0.019483727216720582, + "step": 140820 + }, + { + "epoch": 19.990063875088715, + "grad_norm": 0.06661958247423172, + "learning_rate": 8.001831085876509e-05, + "loss": 0.006842435896396637, + "step": 140830 + }, + { + "epoch": 19.991483321504614, + "grad_norm": 0.008325074799358845, + "learning_rate": 8.00168914123492e-05, + "loss": 0.006324427574872971, + "step": 140840 + }, + { + "epoch": 19.992902767920512, + "grad_norm": 0.2477165162563324, + "learning_rate": 8.001547196593328e-05, + "loss": 0.02415194809436798, + "step": 140850 + }, + { + "epoch": 19.99432221433641, + "grad_norm": 0.03715141862630844, + "learning_rate": 8.001405251951739e-05, + "loss": 0.015509502589702606, + "step": 140860 + }, + { + "epoch": 
19.995741660752305, + "grad_norm": 0.06404056400060654, + "learning_rate": 8.001263307310149e-05, + "loss": 0.026072847843170165, + "step": 140870 + }, + { + "epoch": 19.997161107168203, + "grad_norm": 3.6626203060150146, + "learning_rate": 8.00112136266856e-05, + "loss": 0.016191501915454865, + "step": 140880 + }, + { + "epoch": 19.9985805535841, + "grad_norm": 4.256667137145996, + "learning_rate": 8.000979418026971e-05, + "loss": 0.00894407331943512, + "step": 140890 + }, + { + "epoch": 20.0, + "grad_norm": 0.0740240067243576, + "learning_rate": 8.00083747338538e-05, + "loss": 0.015609632432460784, + "step": 140900 + }, + { + "epoch": 20.0014194464159, + "grad_norm": 0.22740566730499268, + "learning_rate": 8.00069552874379e-05, + "loss": 0.041918623447418216, + "step": 140910 + }, + { + "epoch": 20.002838892831797, + "grad_norm": 0.4715220630168915, + "learning_rate": 8.0005535841022e-05, + "loss": 0.02898731529712677, + "step": 140920 + }, + { + "epoch": 20.004258339247695, + "grad_norm": 19.265684127807617, + "learning_rate": 8.000411639460611e-05, + "loss": 0.05128769874572754, + "step": 140930 + }, + { + "epoch": 20.00567778566359, + "grad_norm": 4.531553268432617, + "learning_rate": 8.000269694819021e-05, + "loss": 0.08501461148262024, + "step": 140940 + }, + { + "epoch": 20.007097232079488, + "grad_norm": 0.04022873565554619, + "learning_rate": 8.000127750177431e-05, + "loss": 0.03906202912330627, + "step": 140950 + }, + { + "epoch": 20.008516678495386, + "grad_norm": 0.02828882448375225, + "learning_rate": 7.99998580553584e-05, + "loss": 0.062254679203033444, + "step": 140960 + }, + { + "epoch": 20.009936124911285, + "grad_norm": 0.7951211333274841, + "learning_rate": 7.999843860894252e-05, + "loss": 0.012028510868549346, + "step": 140970 + }, + { + "epoch": 20.011355571327183, + "grad_norm": 0.1233096718788147, + "learning_rate": 7.999701916252663e-05, + "loss": 0.008423009514808654, + "step": 140980 + }, + { + "epoch": 20.01277501774308, + "grad_norm": 
0.4016595780849457, + "learning_rate": 7.999559971611072e-05, + "loss": 0.009395520389080047, + "step": 140990 + }, + { + "epoch": 20.01419446415898, + "grad_norm": 1.084055781364441, + "learning_rate": 7.999418026969482e-05, + "loss": 0.015512244403362274, + "step": 141000 + }, + { + "epoch": 20.01419446415898, + "eval_accuracy": 0.9862656577859732, + "eval_loss": 0.05205647274851799, + "eval_runtime": 33.1367, + "eval_samples_per_second": 474.609, + "eval_steps_per_second": 14.848, + "step": 141000 + }, + { + "epoch": 20.015613910574874, + "grad_norm": 0.015475750900804996, + "learning_rate": 7.999276082327892e-05, + "loss": 0.028347843885421754, + "step": 141010 + }, + { + "epoch": 20.017033356990773, + "grad_norm": 6.349401950836182, + "learning_rate": 7.999134137686303e-05, + "loss": 0.031047925353050232, + "step": 141020 + }, + { + "epoch": 20.01845280340667, + "grad_norm": 5.873929977416992, + "learning_rate": 7.998992193044713e-05, + "loss": 0.00811959058046341, + "step": 141030 + }, + { + "epoch": 20.01987224982257, + "grad_norm": 0.0425172820687294, + "learning_rate": 7.998850248403124e-05, + "loss": 0.004664409905672073, + "step": 141040 + }, + { + "epoch": 20.021291696238467, + "grad_norm": 0.10378196835517883, + "learning_rate": 7.998708303761532e-05, + "loss": 0.016263917088508606, + "step": 141050 + }, + { + "epoch": 20.022711142654366, + "grad_norm": 0.8236654996871948, + "learning_rate": 7.998566359119943e-05, + "loss": 0.011845842748880387, + "step": 141060 + }, + { + "epoch": 20.024130589070264, + "grad_norm": 5.560585021972656, + "learning_rate": 7.998424414478354e-05, + "loss": 0.008132990449666977, + "step": 141070 + }, + { + "epoch": 20.02555003548616, + "grad_norm": 0.8124029636383057, + "learning_rate": 7.998282469836764e-05, + "loss": 0.008311792463064193, + "step": 141080 + }, + { + "epoch": 20.026969481902057, + "grad_norm": 0.23469218611717224, + "learning_rate": 7.998140525195175e-05, + "loss": 0.001304752752184868, + "step": 141090 + 
}, + { + "epoch": 20.028388928317955, + "grad_norm": 0.8229203820228577, + "learning_rate": 7.997998580553584e-05, + "loss": 0.0024736978113651274, + "step": 141100 + }, + { + "epoch": 20.029808374733854, + "grad_norm": 0.11447346210479736, + "learning_rate": 7.997856635911995e-05, + "loss": 0.04228949248790741, + "step": 141110 + }, + { + "epoch": 20.031227821149752, + "grad_norm": 0.06373047083616257, + "learning_rate": 7.997714691270404e-05, + "loss": 0.009620143473148346, + "step": 141120 + }, + { + "epoch": 20.03264726756565, + "grad_norm": 8.882691383361816, + "learning_rate": 7.997572746628816e-05, + "loss": 0.05514065623283386, + "step": 141130 + }, + { + "epoch": 20.03406671398155, + "grad_norm": 0.018412787467241287, + "learning_rate": 7.997430801987225e-05, + "loss": 0.011524337530136108, + "step": 141140 + }, + { + "epoch": 20.035486160397443, + "grad_norm": 0.012451624497771263, + "learning_rate": 7.997288857345635e-05, + "loss": 0.00642521008849144, + "step": 141150 + }, + { + "epoch": 20.03690560681334, + "grad_norm": 2.8071608543395996, + "learning_rate": 7.997146912704046e-05, + "loss": 0.014402301609516143, + "step": 141160 + }, + { + "epoch": 20.03832505322924, + "grad_norm": 4.811960697174072, + "learning_rate": 7.997004968062456e-05, + "loss": 0.010839483886957168, + "step": 141170 + }, + { + "epoch": 20.03974449964514, + "grad_norm": 0.11822403967380524, + "learning_rate": 7.996863023420867e-05, + "loss": 0.017814382910728455, + "step": 141180 + }, + { + "epoch": 20.041163946061037, + "grad_norm": 0.0183568075299263, + "learning_rate": 7.996721078779277e-05, + "loss": 0.041589167714118955, + "step": 141190 + }, + { + "epoch": 20.042583392476935, + "grad_norm": 0.5206476449966431, + "learning_rate": 7.996579134137688e-05, + "loss": 0.026238131523132324, + "step": 141200 + }, + { + "epoch": 20.044002838892833, + "grad_norm": 1.3223717212677002, + "learning_rate": 7.996437189496096e-05, + "loss": 0.004569841176271438, + "step": 141210 + }, + { + 
"epoch": 20.045422285308728, + "grad_norm": 0.18440695106983185, + "learning_rate": 7.996295244854507e-05, + "loss": 0.013369666039943695, + "step": 141220 + }, + { + "epoch": 20.046841731724626, + "grad_norm": 0.3286435306072235, + "learning_rate": 7.996153300212917e-05, + "loss": 0.035613265633583066, + "step": 141230 + }, + { + "epoch": 20.048261178140525, + "grad_norm": 3.6914515495300293, + "learning_rate": 7.996011355571328e-05, + "loss": 0.0053454674780368805, + "step": 141240 + }, + { + "epoch": 20.049680624556423, + "grad_norm": 1.9531716108322144, + "learning_rate": 7.995869410929738e-05, + "loss": 0.02335241883993149, + "step": 141250 + }, + { + "epoch": 20.05110007097232, + "grad_norm": 1.275848388671875, + "learning_rate": 7.995727466288148e-05, + "loss": 0.0027617398649454118, + "step": 141260 + }, + { + "epoch": 20.05251951738822, + "grad_norm": 0.027349628508090973, + "learning_rate": 7.995585521646559e-05, + "loss": 0.008489987254142762, + "step": 141270 + }, + { + "epoch": 20.053938963804118, + "grad_norm": 0.4223189949989319, + "learning_rate": 7.995443577004968e-05, + "loss": 0.015506541728973389, + "step": 141280 + }, + { + "epoch": 20.055358410220013, + "grad_norm": 0.24928779900074005, + "learning_rate": 7.99530163236338e-05, + "loss": 0.0367767333984375, + "step": 141290 + }, + { + "epoch": 20.05677785663591, + "grad_norm": 0.03538127243518829, + "learning_rate": 7.995159687721789e-05, + "loss": 0.005999685451388359, + "step": 141300 + }, + { + "epoch": 20.05819730305181, + "grad_norm": 0.048333484679460526, + "learning_rate": 7.995017743080199e-05, + "loss": 0.008087261021137238, + "step": 141310 + }, + { + "epoch": 20.059616749467708, + "grad_norm": 2.40273118019104, + "learning_rate": 7.994875798438609e-05, + "loss": 0.024593907594680785, + "step": 141320 + }, + { + "epoch": 20.061036195883606, + "grad_norm": 0.033067528158426285, + "learning_rate": 7.99473385379702e-05, + "loss": 0.03219816386699677, + "step": 141330 + }, + { + "epoch": 
20.062455642299504, + "grad_norm": 8.978111267089844, + "learning_rate": 7.994606103619588e-05, + "loss": 0.048678803443908694, + "step": 141340 + }, + { + "epoch": 20.063875088715402, + "grad_norm": 5.184823513031006, + "learning_rate": 7.994464158978e-05, + "loss": 0.01843302547931671, + "step": 141350 + }, + { + "epoch": 20.065294535131297, + "grad_norm": 0.6802540421485901, + "learning_rate": 7.994322214336409e-05, + "loss": 0.006856510043144226, + "step": 141360 + }, + { + "epoch": 20.066713981547196, + "grad_norm": 6.415282726287842, + "learning_rate": 7.99418026969482e-05, + "loss": 0.03161287009716034, + "step": 141370 + }, + { + "epoch": 20.068133427963094, + "grad_norm": 0.6133005619049072, + "learning_rate": 7.994038325053229e-05, + "loss": 0.003521961346268654, + "step": 141380 + }, + { + "epoch": 20.069552874378992, + "grad_norm": 0.5762439370155334, + "learning_rate": 7.99389638041164e-05, + "loss": 0.008870533108711243, + "step": 141390 + }, + { + "epoch": 20.07097232079489, + "grad_norm": 0.41646623611450195, + "learning_rate": 7.99375443577005e-05, + "loss": 0.011471281200647354, + "step": 141400 + }, + { + "epoch": 20.07239176721079, + "grad_norm": 1.1015546321868896, + "learning_rate": 7.99361249112846e-05, + "loss": 0.015262497961521149, + "step": 141410 + }, + { + "epoch": 20.073811213626687, + "grad_norm": 0.1063433587551117, + "learning_rate": 7.99347054648687e-05, + "loss": 0.007727238535881043, + "step": 141420 + }, + { + "epoch": 20.075230660042582, + "grad_norm": 0.02339489385485649, + "learning_rate": 7.99332860184528e-05, + "loss": 0.009775744378566742, + "step": 141430 + }, + { + "epoch": 20.07665010645848, + "grad_norm": 0.04368114471435547, + "learning_rate": 7.993186657203691e-05, + "loss": 0.008957654982805253, + "step": 141440 + }, + { + "epoch": 20.07806955287438, + "grad_norm": 0.021869715303182602, + "learning_rate": 7.993044712562101e-05, + "loss": 0.036583393812179565, + "step": 141450 + }, + { + "epoch": 20.079488999290277, 
+ "grad_norm": 0.16165511310100555, + "learning_rate": 7.992902767920512e-05, + "loss": 0.013248316943645477, + "step": 141460 + }, + { + "epoch": 20.080908445706175, + "grad_norm": 0.0021154037676751614, + "learning_rate": 7.992760823278922e-05, + "loss": 0.020379316806793214, + "step": 141470 + }, + { + "epoch": 20.082327892122073, + "grad_norm": 1.236534595489502, + "learning_rate": 7.992618878637331e-05, + "loss": 0.01821013242006302, + "step": 141480 + }, + { + "epoch": 20.08374733853797, + "grad_norm": 0.05037076026201248, + "learning_rate": 7.992476933995741e-05, + "loss": 0.008942946791648865, + "step": 141490 + }, + { + "epoch": 20.085166784953866, + "grad_norm": 20.41444206237793, + "learning_rate": 7.992334989354152e-05, + "loss": 0.041226530075073244, + "step": 141500 + }, + { + "epoch": 20.085166784953866, + "eval_accuracy": 0.9863292427036306, + "eval_loss": 0.048269789665937424, + "eval_runtime": 33.2933, + "eval_samples_per_second": 472.377, + "eval_steps_per_second": 14.778, + "step": 141500 + }, + { + "epoch": 20.086586231369765, + "grad_norm": 2.1929523944854736, + "learning_rate": 7.992193044712562e-05, + "loss": 0.05169554948806763, + "step": 141510 + }, + { + "epoch": 20.088005677785663, + "grad_norm": 0.07201051712036133, + "learning_rate": 7.992051100070973e-05, + "loss": 0.005569947138428688, + "step": 141520 + }, + { + "epoch": 20.08942512420156, + "grad_norm": 0.03781836852431297, + "learning_rate": 7.991909155429383e-05, + "loss": 0.012127821147441865, + "step": 141530 + }, + { + "epoch": 20.09084457061746, + "grad_norm": 0.1598953753709793, + "learning_rate": 7.991767210787793e-05, + "loss": 0.01272830069065094, + "step": 141540 + }, + { + "epoch": 20.092264017033358, + "grad_norm": 0.372420072555542, + "learning_rate": 7.991625266146204e-05, + "loss": 0.0067143462598323825, + "step": 141550 + }, + { + "epoch": 20.093683463449256, + "grad_norm": 0.2735583484172821, + "learning_rate": 7.991483321504613e-05, + "loss": 
0.019540132582187654, + "step": 141560 + }, + { + "epoch": 20.09510290986515, + "grad_norm": 0.10384074598550797, + "learning_rate": 7.991341376863024e-05, + "loss": 0.01544135957956314, + "step": 141570 + }, + { + "epoch": 20.09652235628105, + "grad_norm": 0.02889949269592762, + "learning_rate": 7.991199432221434e-05, + "loss": 0.03289896249771118, + "step": 141580 + }, + { + "epoch": 20.097941802696948, + "grad_norm": 0.0637548416852951, + "learning_rate": 7.991057487579844e-05, + "loss": 0.013479314744472504, + "step": 141590 + }, + { + "epoch": 20.099361249112846, + "grad_norm": 0.03864375129342079, + "learning_rate": 7.990915542938254e-05, + "loss": 0.011584682017564773, + "step": 141600 + }, + { + "epoch": 20.100780695528744, + "grad_norm": 0.07262269407510757, + "learning_rate": 7.990773598296665e-05, + "loss": 0.003744155168533325, + "step": 141610 + }, + { + "epoch": 20.102200141944643, + "grad_norm": 0.9087169170379639, + "learning_rate": 7.990631653655075e-05, + "loss": 0.008905504643917084, + "step": 141620 + }, + { + "epoch": 20.10361958836054, + "grad_norm": 0.4373335838317871, + "learning_rate": 7.990489709013486e-05, + "loss": 0.0007677737623453141, + "step": 141630 + }, + { + "epoch": 20.105039034776436, + "grad_norm": 0.01339707151055336, + "learning_rate": 7.990347764371895e-05, + "loss": 0.01338742971420288, + "step": 141640 + }, + { + "epoch": 20.106458481192334, + "grad_norm": 0.09526550024747849, + "learning_rate": 7.990205819730305e-05, + "loss": 0.0082273468375206, + "step": 141650 + }, + { + "epoch": 20.107877927608232, + "grad_norm": 0.15038612484931946, + "learning_rate": 7.990063875088716e-05, + "loss": 0.016612686216831207, + "step": 141660 + }, + { + "epoch": 20.10929737402413, + "grad_norm": 0.11727338284254074, + "learning_rate": 7.989921930447126e-05, + "loss": 0.002484797686338425, + "step": 141670 + }, + { + "epoch": 20.11071682044003, + "grad_norm": 1.2446929216384888, + "learning_rate": 7.989779985805537e-05, + "loss": 
0.010968457162380218, + "step": 141680 + }, + { + "epoch": 20.112136266855927, + "grad_norm": 0.02222842164337635, + "learning_rate": 7.989638041163945e-05, + "loss": 0.04818206131458282, + "step": 141690 + }, + { + "epoch": 20.113555713271825, + "grad_norm": 0.017617546021938324, + "learning_rate": 7.989496096522356e-05, + "loss": 0.03665972352027893, + "step": 141700 + }, + { + "epoch": 20.11497515968772, + "grad_norm": 0.06581719219684601, + "learning_rate": 7.989354151880766e-05, + "loss": 0.004803193733096123, + "step": 141710 + }, + { + "epoch": 20.11639460610362, + "grad_norm": 0.05679214745759964, + "learning_rate": 7.989212207239177e-05, + "loss": 0.01601347476243973, + "step": 141720 + }, + { + "epoch": 20.117814052519517, + "grad_norm": 0.02132427506148815, + "learning_rate": 7.989070262597588e-05, + "loss": 0.01751757264137268, + "step": 141730 + }, + { + "epoch": 20.119233498935415, + "grad_norm": 0.03954348340630531, + "learning_rate": 7.988928317955997e-05, + "loss": 0.03036175072193146, + "step": 141740 + }, + { + "epoch": 20.120652945351313, + "grad_norm": 0.3142281174659729, + "learning_rate": 7.988786373314408e-05, + "loss": 0.023780320584774018, + "step": 141750 + }, + { + "epoch": 20.12207239176721, + "grad_norm": 0.015397654846310616, + "learning_rate": 7.988644428672818e-05, + "loss": 0.006399238109588623, + "step": 141760 + }, + { + "epoch": 20.12349183818311, + "grad_norm": 0.15912342071533203, + "learning_rate": 7.988502484031229e-05, + "loss": 0.007101371884346008, + "step": 141770 + }, + { + "epoch": 20.124911284599005, + "grad_norm": 0.1289181411266327, + "learning_rate": 7.988360539389638e-05, + "loss": 0.008804739266633988, + "step": 141780 + }, + { + "epoch": 20.126330731014903, + "grad_norm": 0.0552683100104332, + "learning_rate": 7.988218594748048e-05, + "loss": 0.013078097999095917, + "step": 141790 + }, + { + "epoch": 20.1277501774308, + "grad_norm": 0.024977317079901695, + "learning_rate": 7.988076650106458e-05, + "loss": 
0.05996226668357849, + "step": 141800 + }, + { + "epoch": 20.1291696238467, + "grad_norm": 0.12581050395965576, + "learning_rate": 7.987934705464869e-05, + "loss": 0.011745229363441467, + "step": 141810 + }, + { + "epoch": 20.130589070262598, + "grad_norm": 0.03531504422426224, + "learning_rate": 7.98779276082328e-05, + "loss": 0.003165086731314659, + "step": 141820 + }, + { + "epoch": 20.132008516678496, + "grad_norm": 1.4400914907455444, + "learning_rate": 7.98765081618169e-05, + "loss": 0.005033036321401596, + "step": 141830 + }, + { + "epoch": 20.133427963094395, + "grad_norm": 0.15162140130996704, + "learning_rate": 7.9875088715401e-05, + "loss": 0.02480035275220871, + "step": 141840 + }, + { + "epoch": 20.13484740951029, + "grad_norm": 0.7097915410995483, + "learning_rate": 7.987366926898509e-05, + "loss": 0.004496370628476143, + "step": 141850 + }, + { + "epoch": 20.136266855926188, + "grad_norm": 0.0464242622256279, + "learning_rate": 7.98722498225692e-05, + "loss": 0.03190518319606781, + "step": 141860 + }, + { + "epoch": 20.137686302342086, + "grad_norm": 0.0554632693529129, + "learning_rate": 7.98708303761533e-05, + "loss": 0.011931977421045303, + "step": 141870 + }, + { + "epoch": 20.139105748757984, + "grad_norm": 0.07021571695804596, + "learning_rate": 7.986941092973741e-05, + "loss": 0.005147505924105645, + "step": 141880 + }, + { + "epoch": 20.140525195173883, + "grad_norm": 0.03586012125015259, + "learning_rate": 7.986799148332151e-05, + "loss": 0.0158030703663826, + "step": 141890 + }, + { + "epoch": 20.14194464158978, + "grad_norm": 0.10487499088048935, + "learning_rate": 7.986657203690561e-05, + "loss": 0.0037853769958019257, + "step": 141900 + }, + { + "epoch": 20.14336408800568, + "grad_norm": 0.03854146599769592, + "learning_rate": 7.986515259048972e-05, + "loss": 0.007671931385993957, + "step": 141910 + }, + { + "epoch": 20.144783534421574, + "grad_norm": 0.013667808845639229, + "learning_rate": 7.986373314407382e-05, + "loss": 
0.003470684587955475, + "step": 141920 + }, + { + "epoch": 20.146202980837472, + "grad_norm": 0.34707924723625183, + "learning_rate": 7.986231369765793e-05, + "loss": 0.016699378192424775, + "step": 141930 + }, + { + "epoch": 20.14762242725337, + "grad_norm": 4.193675994873047, + "learning_rate": 7.986089425124202e-05, + "loss": 0.008417283743619918, + "step": 141940 + }, + { + "epoch": 20.14904187366927, + "grad_norm": 0.017162833362817764, + "learning_rate": 7.985947480482612e-05, + "loss": 0.0015996877104043961, + "step": 141950 + }, + { + "epoch": 20.150461320085167, + "grad_norm": 0.2931995689868927, + "learning_rate": 7.985805535841022e-05, + "loss": 0.02403400242328644, + "step": 141960 + }, + { + "epoch": 20.151880766501066, + "grad_norm": 1.2880780696868896, + "learning_rate": 7.985663591199433e-05, + "loss": 0.04142577946186066, + "step": 141970 + }, + { + "epoch": 20.153300212916964, + "grad_norm": 3.4541120529174805, + "learning_rate": 7.985521646557843e-05, + "loss": 0.014758683741092682, + "step": 141980 + }, + { + "epoch": 20.15471965933286, + "grad_norm": 0.11815626919269562, + "learning_rate": 7.985379701916254e-05, + "loss": 0.05548862814903259, + "step": 141990 + }, + { + "epoch": 20.156139105748757, + "grad_norm": 0.2253233939409256, + "learning_rate": 7.985237757274664e-05, + "loss": 0.0025531187653541564, + "step": 142000 + }, + { + "epoch": 20.156139105748757, + "eval_accuracy": 0.9875373561391237, + "eval_loss": 0.04457830637693405, + "eval_runtime": 34.0845, + "eval_samples_per_second": 461.412, + "eval_steps_per_second": 14.435, + "step": 142000 + }, + { + "epoch": 20.157558552164655, + "grad_norm": 0.11514417827129364, + "learning_rate": 7.985095812633073e-05, + "loss": 0.02372538149356842, + "step": 142010 + }, + { + "epoch": 20.158977998580554, + "grad_norm": 0.01031376700848341, + "learning_rate": 7.984953867991484e-05, + "loss": 0.00249270536005497, + "step": 142020 + }, + { + "epoch": 20.160397444996452, + "grad_norm": 
0.6879292130470276, + "learning_rate": 7.984811923349894e-05, + "loss": 0.007451292872428894, + "step": 142030 + }, + { + "epoch": 20.16181689141235, + "grad_norm": 0.016475774347782135, + "learning_rate": 7.984669978708305e-05, + "loss": 0.027611124515533447, + "step": 142040 + }, + { + "epoch": 20.16323633782825, + "grad_norm": 4.053171157836914, + "learning_rate": 7.984528034066714e-05, + "loss": 0.012854620814323425, + "step": 142050 + }, + { + "epoch": 20.164655784244143, + "grad_norm": 0.009656962938606739, + "learning_rate": 7.984386089425125e-05, + "loss": 0.000679202750325203, + "step": 142060 + }, + { + "epoch": 20.16607523066004, + "grad_norm": 0.4034946858882904, + "learning_rate": 7.984244144783534e-05, + "loss": 0.019225259125232697, + "step": 142070 + }, + { + "epoch": 20.16749467707594, + "grad_norm": 5.981703758239746, + "learning_rate": 7.984102200141945e-05, + "loss": 0.0107732355594635, + "step": 142080 + }, + { + "epoch": 20.168914123491838, + "grad_norm": 5.658140659332275, + "learning_rate": 7.983960255500355e-05, + "loss": 0.01368822604417801, + "step": 142090 + }, + { + "epoch": 20.170333569907736, + "grad_norm": 0.316007137298584, + "learning_rate": 7.983818310858765e-05, + "loss": 0.011389472335577012, + "step": 142100 + }, + { + "epoch": 20.171753016323635, + "grad_norm": 0.05075006186962128, + "learning_rate": 7.983676366217176e-05, + "loss": 0.016885870695114137, + "step": 142110 + }, + { + "epoch": 20.173172462739533, + "grad_norm": 16.16810417175293, + "learning_rate": 7.983534421575586e-05, + "loss": 0.05068206787109375, + "step": 142120 + }, + { + "epoch": 20.174591909155428, + "grad_norm": 5.7081708908081055, + "learning_rate": 7.983392476933997e-05, + "loss": 0.04219783842563629, + "step": 142130 + }, + { + "epoch": 20.176011355571326, + "grad_norm": 3.024782657623291, + "learning_rate": 7.983250532292407e-05, + "loss": 0.008670754730701447, + "step": 142140 + }, + { + "epoch": 20.177430801987224, + "grad_norm": 
2.9781620502471924, + "learning_rate": 7.983108587650816e-05, + "loss": 0.06037324070930481, + "step": 142150 + }, + { + "epoch": 20.178850248403123, + "grad_norm": 0.16223609447479248, + "learning_rate": 7.982966643009226e-05, + "loss": 0.014251476526260376, + "step": 142160 + }, + { + "epoch": 20.18026969481902, + "grad_norm": 0.027019178494811058, + "learning_rate": 7.982824698367637e-05, + "loss": 0.011684049665927888, + "step": 142170 + }, + { + "epoch": 20.18168914123492, + "grad_norm": 0.6857998371124268, + "learning_rate": 7.982682753726047e-05, + "loss": 0.002962096780538559, + "step": 142180 + }, + { + "epoch": 20.183108587650818, + "grad_norm": 0.026773851364850998, + "learning_rate": 7.982540809084458e-05, + "loss": 0.010839618742465973, + "step": 142190 + }, + { + "epoch": 20.184528034066712, + "grad_norm": 10.207509994506836, + "learning_rate": 7.982398864442868e-05, + "loss": 0.00893901064991951, + "step": 142200 + }, + { + "epoch": 20.18594748048261, + "grad_norm": 0.5077146887779236, + "learning_rate": 7.982256919801277e-05, + "loss": 0.023206639289855956, + "step": 142210 + }, + { + "epoch": 20.18736692689851, + "grad_norm": 13.906847953796387, + "learning_rate": 7.982114975159689e-05, + "loss": 0.04845001697540283, + "step": 142220 + }, + { + "epoch": 20.188786373314407, + "grad_norm": 0.22066904604434967, + "learning_rate": 7.981973030518098e-05, + "loss": 0.020987828075885773, + "step": 142230 + }, + { + "epoch": 20.190205819730306, + "grad_norm": 7.529053211212158, + "learning_rate": 7.98183108587651e-05, + "loss": 0.04300286471843719, + "step": 142240 + }, + { + "epoch": 20.191625266146204, + "grad_norm": 9.46019458770752, + "learning_rate": 7.981689141234919e-05, + "loss": 0.05241010785102844, + "step": 142250 + }, + { + "epoch": 20.193044712562102, + "grad_norm": 7.071815490722656, + "learning_rate": 7.981547196593329e-05, + "loss": 0.031628957390785216, + "step": 142260 + }, + { + "epoch": 20.194464158977997, + "grad_norm": 
0.0734858512878418, + "learning_rate": 7.981405251951739e-05, + "loss": 0.01677306890487671, + "step": 142270 + }, + { + "epoch": 20.195883605393895, + "grad_norm": 0.8266683220863342, + "learning_rate": 7.98126330731015e-05, + "loss": 0.020758605003356932, + "step": 142280 + }, + { + "epoch": 20.197303051809794, + "grad_norm": 0.038721963763237, + "learning_rate": 7.98112136266856e-05, + "loss": 0.01994621455669403, + "step": 142290 + }, + { + "epoch": 20.198722498225692, + "grad_norm": 0.268017441034317, + "learning_rate": 7.98097941802697e-05, + "loss": 0.0036127448081970217, + "step": 142300 + }, + { + "epoch": 20.20014194464159, + "grad_norm": 0.9657958745956421, + "learning_rate": 7.98083747338538e-05, + "loss": 0.030071181058883668, + "step": 142310 + }, + { + "epoch": 20.20156139105749, + "grad_norm": 0.21522824466228485, + "learning_rate": 7.98070972320795e-05, + "loss": 0.0300694078207016, + "step": 142320 + }, + { + "epoch": 20.202980837473387, + "grad_norm": 0.011424322612583637, + "learning_rate": 7.980567778566359e-05, + "loss": 0.0058794297277927395, + "step": 142330 + }, + { + "epoch": 20.20440028388928, + "grad_norm": 5.839929103851318, + "learning_rate": 7.98042583392477e-05, + "loss": 0.032745689153671265, + "step": 142340 + }, + { + "epoch": 20.20581973030518, + "grad_norm": 0.03547418490052223, + "learning_rate": 7.98028388928318e-05, + "loss": 0.01504252403974533, + "step": 142350 + }, + { + "epoch": 20.207239176721078, + "grad_norm": 8.353195190429688, + "learning_rate": 7.98014194464159e-05, + "loss": 0.01957404613494873, + "step": 142360 + }, + { + "epoch": 20.208658623136976, + "grad_norm": 5.893552303314209, + "learning_rate": 7.98e-05, + "loss": 0.02026599943637848, + "step": 142370 + }, + { + "epoch": 20.210078069552875, + "grad_norm": 0.010802337899804115, + "learning_rate": 7.97985805535841e-05, + "loss": 0.024339427053928376, + "step": 142380 + }, + { + "epoch": 20.211497515968773, + "grad_norm": 11.485355377197266, + 
"learning_rate": 7.979716110716821e-05, + "loss": 0.011984516680240632, + "step": 142390 + }, + { + "epoch": 20.21291696238467, + "grad_norm": 17.191001892089844, + "learning_rate": 7.979574166075231e-05, + "loss": 0.028717640042304992, + "step": 142400 + }, + { + "epoch": 20.214336408800566, + "grad_norm": 0.20301076769828796, + "learning_rate": 7.979432221433642e-05, + "loss": 0.0018381725996732712, + "step": 142410 + }, + { + "epoch": 20.215755855216464, + "grad_norm": 21.390623092651367, + "learning_rate": 7.979290276792052e-05, + "loss": 0.043421268463134766, + "step": 142420 + }, + { + "epoch": 20.217175301632363, + "grad_norm": 11.526211738586426, + "learning_rate": 7.979148332150461e-05, + "loss": 0.04156326353549957, + "step": 142430 + }, + { + "epoch": 20.21859474804826, + "grad_norm": 0.04185756668448448, + "learning_rate": 7.979006387508871e-05, + "loss": 0.011933413147926331, + "step": 142440 + }, + { + "epoch": 20.22001419446416, + "grad_norm": 1.5703065395355225, + "learning_rate": 7.978864442867282e-05, + "loss": 0.04834843277931213, + "step": 142450 + }, + { + "epoch": 20.221433640880058, + "grad_norm": 0.4260168969631195, + "learning_rate": 7.978722498225692e-05, + "loss": 0.02226836383342743, + "step": 142460 + }, + { + "epoch": 20.222853087295956, + "grad_norm": 0.5218490958213806, + "learning_rate": 7.978580553584103e-05, + "loss": 0.001465563103556633, + "step": 142470 + }, + { + "epoch": 20.22427253371185, + "grad_norm": 0.6611945629119873, + "learning_rate": 7.978438608942513e-05, + "loss": 0.020285823941230775, + "step": 142480 + }, + { + "epoch": 20.22569198012775, + "grad_norm": 0.007459554355591536, + "learning_rate": 7.978296664300922e-05, + "loss": 0.02087102234363556, + "step": 142490 + }, + { + "epoch": 20.227111426543647, + "grad_norm": 0.334139883518219, + "learning_rate": 7.978154719659334e-05, + "loss": 0.011999078094959259, + "step": 142500 + }, + { + "epoch": 20.227111426543647, + "eval_accuracy": 0.9874737712214663, + 
"eval_loss": 0.048032622784376144, + "eval_runtime": 32.801, + "eval_samples_per_second": 479.467, + "eval_steps_per_second": 15.0, + "step": 142500 + }, + { + "epoch": 20.228530872959546, + "grad_norm": 0.02765466645359993, + "learning_rate": 7.978012775017743e-05, + "loss": 0.017795243859291078, + "step": 142510 + }, + { + "epoch": 20.229950319375444, + "grad_norm": 12.934488296508789, + "learning_rate": 7.977870830376154e-05, + "loss": 0.027302253246307372, + "step": 142520 + }, + { + "epoch": 20.231369765791342, + "grad_norm": 0.847672700881958, + "learning_rate": 7.977728885734564e-05, + "loss": 0.05470612645149231, + "step": 142530 + }, + { + "epoch": 20.23278921220724, + "grad_norm": 14.233654975891113, + "learning_rate": 7.977586941092974e-05, + "loss": 0.034137874841690063, + "step": 142540 + }, + { + "epoch": 20.234208658623135, + "grad_norm": 1.0833734273910522, + "learning_rate": 7.977444996451384e-05, + "loss": 0.0424355298280716, + "step": 142550 + }, + { + "epoch": 20.235628105039034, + "grad_norm": 0.5786467790603638, + "learning_rate": 7.977303051809795e-05, + "loss": 0.014288076758384704, + "step": 142560 + }, + { + "epoch": 20.237047551454932, + "grad_norm": 0.015368801541626453, + "learning_rate": 7.977161107168204e-05, + "loss": 0.021183985471725463, + "step": 142570 + }, + { + "epoch": 20.23846699787083, + "grad_norm": 0.07482022047042847, + "learning_rate": 7.977019162526615e-05, + "loss": 0.024730314314365388, + "step": 142580 + }, + { + "epoch": 20.23988644428673, + "grad_norm": 0.7866510152816772, + "learning_rate": 7.976877217885025e-05, + "loss": 0.006797478348016739, + "step": 142590 + }, + { + "epoch": 20.241305890702627, + "grad_norm": 8.225578308105469, + "learning_rate": 7.976735273243435e-05, + "loss": 0.03572050929069519, + "step": 142600 + }, + { + "epoch": 20.242725337118525, + "grad_norm": 0.04579515755176544, + "learning_rate": 7.976593328601846e-05, + "loss": 0.04496398270130157, + "step": 142610 + }, + { + "epoch": 
20.24414478353442, + "grad_norm": 0.04150356724858284, + "learning_rate": 7.976451383960256e-05, + "loss": 0.02562972903251648, + "step": 142620 + }, + { + "epoch": 20.24556422995032, + "grad_norm": 9.057428359985352, + "learning_rate": 7.976309439318667e-05, + "loss": 0.031835424900054934, + "step": 142630 + }, + { + "epoch": 20.246983676366217, + "grad_norm": 5.182511806488037, + "learning_rate": 7.976167494677075e-05, + "loss": 0.013751554489135741, + "step": 142640 + }, + { + "epoch": 20.248403122782115, + "grad_norm": 0.15399454534053802, + "learning_rate": 7.976025550035486e-05, + "loss": 0.007786694914102554, + "step": 142650 + }, + { + "epoch": 20.249822569198013, + "grad_norm": 0.1412489116191864, + "learning_rate": 7.975883605393896e-05, + "loss": 0.0022139832377433776, + "step": 142660 + }, + { + "epoch": 20.25124201561391, + "grad_norm": 14.100513458251953, + "learning_rate": 7.975741660752307e-05, + "loss": 0.031208738684654236, + "step": 142670 + }, + { + "epoch": 20.25266146202981, + "grad_norm": 0.028892293572425842, + "learning_rate": 7.975599716110718e-05, + "loss": 0.05569702386856079, + "step": 142680 + }, + { + "epoch": 20.254080908445705, + "grad_norm": 0.07068059593439102, + "learning_rate": 7.975457771469127e-05, + "loss": 0.005036211758852005, + "step": 142690 + }, + { + "epoch": 20.255500354861603, + "grad_norm": 0.8706439733505249, + "learning_rate": 7.975315826827538e-05, + "loss": 0.015374001860618592, + "step": 142700 + }, + { + "epoch": 20.2569198012775, + "grad_norm": 0.022880127653479576, + "learning_rate": 7.975173882185948e-05, + "loss": 0.01999637931585312, + "step": 142710 + }, + { + "epoch": 20.2583392476934, + "grad_norm": 0.12257270514965057, + "learning_rate": 7.975031937544359e-05, + "loss": 0.0037969771772623064, + "step": 142720 + }, + { + "epoch": 20.259758694109298, + "grad_norm": 0.10455180704593658, + "learning_rate": 7.974889992902768e-05, + "loss": 0.024285706877708434, + "step": 142730 + }, + { + "epoch": 
20.261178140525196, + "grad_norm": 0.08691148459911346, + "learning_rate": 7.974748048261178e-05, + "loss": 0.002531801909208298, + "step": 142740 + }, + { + "epoch": 20.262597586941094, + "grad_norm": 0.06757752597332001, + "learning_rate": 7.974606103619588e-05, + "loss": 0.010270431637763977, + "step": 142750 + }, + { + "epoch": 20.26401703335699, + "grad_norm": 10.979951858520508, + "learning_rate": 7.974464158977999e-05, + "loss": 0.05837968587875366, + "step": 142760 + }, + { + "epoch": 20.265436479772887, + "grad_norm": 0.23595213890075684, + "learning_rate": 7.97432221433641e-05, + "loss": 0.013116481900215148, + "step": 142770 + }, + { + "epoch": 20.266855926188786, + "grad_norm": 0.2715380787849426, + "learning_rate": 7.97418026969482e-05, + "loss": 0.022164252400398255, + "step": 142780 + }, + { + "epoch": 20.268275372604684, + "grad_norm": 0.4441491961479187, + "learning_rate": 7.97403832505323e-05, + "loss": 0.006470701098442078, + "step": 142790 + }, + { + "epoch": 20.269694819020582, + "grad_norm": 12.162850379943848, + "learning_rate": 7.973896380411639e-05, + "loss": 0.05165572166442871, + "step": 142800 + }, + { + "epoch": 20.27111426543648, + "grad_norm": 3.346304178237915, + "learning_rate": 7.97375443577005e-05, + "loss": 0.01856372058391571, + "step": 142810 + }, + { + "epoch": 20.27253371185238, + "grad_norm": 0.02115229330956936, + "learning_rate": 7.97361249112846e-05, + "loss": 0.005987913906574249, + "step": 142820 + }, + { + "epoch": 20.273953158268274, + "grad_norm": 0.013563952408730984, + "learning_rate": 7.973470546486871e-05, + "loss": 0.004847363010048867, + "step": 142830 + }, + { + "epoch": 20.275372604684172, + "grad_norm": 1.303944706916809, + "learning_rate": 7.97332860184528e-05, + "loss": 0.01625673323869705, + "step": 142840 + }, + { + "epoch": 20.27679205110007, + "grad_norm": 0.06267435103654861, + "learning_rate": 7.97318665720369e-05, + "loss": 0.01115289404988289, + "step": 142850 + }, + { + "epoch": 20.27821149751597, 
+ "grad_norm": 0.06709060817956924, + "learning_rate": 7.973044712562102e-05, + "loss": 0.0017483565956354142, + "step": 142860 + }, + { + "epoch": 20.279630943931867, + "grad_norm": 0.030019039288163185, + "learning_rate": 7.972902767920511e-05, + "loss": 0.012597373127937317, + "step": 142870 + }, + { + "epoch": 20.281050390347765, + "grad_norm": 0.0831553041934967, + "learning_rate": 7.972760823278923e-05, + "loss": 0.011013035476207734, + "step": 142880 + }, + { + "epoch": 20.282469836763664, + "grad_norm": 0.031748171895742416, + "learning_rate": 7.972618878637332e-05, + "loss": 0.026995158195495604, + "step": 142890 + }, + { + "epoch": 20.28388928317956, + "grad_norm": 0.1174553707242012, + "learning_rate": 7.972476933995742e-05, + "loss": 0.00546933151781559, + "step": 142900 + }, + { + "epoch": 20.285308729595457, + "grad_norm": 0.052703723311424255, + "learning_rate": 7.972334989354152e-05, + "loss": 0.005904996022582054, + "step": 142910 + }, + { + "epoch": 20.286728176011355, + "grad_norm": 0.38965511322021484, + "learning_rate": 7.972193044712563e-05, + "loss": 0.005780385807156563, + "step": 142920 + }, + { + "epoch": 20.288147622427253, + "grad_norm": 0.5876114964485168, + "learning_rate": 7.972051100070973e-05, + "loss": 0.025393232703208923, + "step": 142930 + }, + { + "epoch": 20.28956706884315, + "grad_norm": 0.2854948341846466, + "learning_rate": 7.971909155429384e-05, + "loss": 0.008964084088802338, + "step": 142940 + }, + { + "epoch": 20.29098651525905, + "grad_norm": 10.083375930786133, + "learning_rate": 7.971767210787793e-05, + "loss": 0.043013885617256165, + "step": 142950 + }, + { + "epoch": 20.292405961674948, + "grad_norm": 0.013548131100833416, + "learning_rate": 7.971625266146203e-05, + "loss": 0.01220654621720314, + "step": 142960 + }, + { + "epoch": 20.293825408090843, + "grad_norm": 0.2204940915107727, + "learning_rate": 7.971483321504614e-05, + "loss": 0.04201371967792511, + "step": 142970 + }, + { + "epoch": 20.29524485450674, + 
"grad_norm": 0.1154663935303688, + "learning_rate": 7.971341376863024e-05, + "loss": 0.03712378144264221, + "step": 142980 + }, + { + "epoch": 20.29666430092264, + "grad_norm": 0.39607420563697815, + "learning_rate": 7.971199432221435e-05, + "loss": 0.01964118182659149, + "step": 142990 + }, + { + "epoch": 20.298083747338538, + "grad_norm": 0.13066010177135468, + "learning_rate": 7.971057487579843e-05, + "loss": 0.013802319765090942, + "step": 143000 + }, + { + "epoch": 20.298083747338538, + "eval_accuracy": 0.9876009410567813, + "eval_loss": 0.04419711232185364, + "eval_runtime": 33.5958, + "eval_samples_per_second": 468.125, + "eval_steps_per_second": 14.645, + "step": 143000 + }, + { + "epoch": 20.299503193754436, + "grad_norm": 0.0710671991109848, + "learning_rate": 7.970915542938255e-05, + "loss": 0.022224968671798705, + "step": 143010 + }, + { + "epoch": 20.300922640170334, + "grad_norm": 0.3132091462612152, + "learning_rate": 7.970773598296664e-05, + "loss": 0.0013002410531044006, + "step": 143020 + }, + { + "epoch": 20.302342086586233, + "grad_norm": 0.16595591604709625, + "learning_rate": 7.970631653655075e-05, + "loss": 0.002704678475856781, + "step": 143030 + }, + { + "epoch": 20.303761533002127, + "grad_norm": 9.94991683959961, + "learning_rate": 7.970489709013485e-05, + "loss": 0.023206396400928496, + "step": 143040 + }, + { + "epoch": 20.305180979418026, + "grad_norm": 0.06891633570194244, + "learning_rate": 7.970347764371895e-05, + "loss": 0.0032977689057588576, + "step": 143050 + }, + { + "epoch": 20.306600425833924, + "grad_norm": 0.07374939322471619, + "learning_rate": 7.970205819730306e-05, + "loss": 0.0014350403100252152, + "step": 143060 + }, + { + "epoch": 20.308019872249822, + "grad_norm": 1.4631593227386475, + "learning_rate": 7.970063875088716e-05, + "loss": 0.03319612145423889, + "step": 143070 + }, + { + "epoch": 20.30943931866572, + "grad_norm": 0.02771943248808384, + "learning_rate": 7.969921930447127e-05, + "loss": 0.01646076887845993, 
+ "step": 143080 + }, + { + "epoch": 20.31085876508162, + "grad_norm": 0.8415849804878235, + "learning_rate": 7.969779985805537e-05, + "loss": 0.02362992614507675, + "step": 143090 + }, + { + "epoch": 20.312278211497517, + "grad_norm": 0.09133688360452652, + "learning_rate": 7.969638041163946e-05, + "loss": 0.003934069722890854, + "step": 143100 + }, + { + "epoch": 20.313697657913412, + "grad_norm": 0.6612482666969299, + "learning_rate": 7.969496096522356e-05, + "loss": 0.029899373650550842, + "step": 143110 + }, + { + "epoch": 20.31511710432931, + "grad_norm": 0.15536877512931824, + "learning_rate": 7.969354151880767e-05, + "loss": 0.024406220018863677, + "step": 143120 + }, + { + "epoch": 20.31653655074521, + "grad_norm": 0.09201609343290329, + "learning_rate": 7.969212207239177e-05, + "loss": 0.0193393275141716, + "step": 143130 + }, + { + "epoch": 20.317955997161107, + "grad_norm": 0.15614308416843414, + "learning_rate": 7.969070262597588e-05, + "loss": 0.008105764538049698, + "step": 143140 + }, + { + "epoch": 20.319375443577005, + "grad_norm": 5.1248040199279785, + "learning_rate": 7.968928317955998e-05, + "loss": 0.024651895463466644, + "step": 143150 + }, + { + "epoch": 20.320794889992904, + "grad_norm": 12.470382690429688, + "learning_rate": 7.968786373314407e-05, + "loss": 0.057747375965118405, + "step": 143160 + }, + { + "epoch": 20.322214336408802, + "grad_norm": 0.02708159014582634, + "learning_rate": 7.968644428672818e-05, + "loss": 0.006167247146368027, + "step": 143170 + }, + { + "epoch": 20.323633782824697, + "grad_norm": 0.27378222346305847, + "learning_rate": 7.968502484031228e-05, + "loss": 0.0270577996969223, + "step": 143180 + }, + { + "epoch": 20.325053229240595, + "grad_norm": 0.4684883654117584, + "learning_rate": 7.968360539389639e-05, + "loss": 0.007477696239948273, + "step": 143190 + }, + { + "epoch": 20.326472675656493, + "grad_norm": 8.21023941040039, + "learning_rate": 7.968218594748048e-05, + "loss": 0.024356037378311157, + "step": 
143200 + }, + { + "epoch": 20.32789212207239, + "grad_norm": 3.9998233318328857, + "learning_rate": 7.968076650106459e-05, + "loss": 0.011114828288555145, + "step": 143210 + }, + { + "epoch": 20.32931156848829, + "grad_norm": 8.465042114257812, + "learning_rate": 7.967934705464869e-05, + "loss": 0.053187215328216554, + "step": 143220 + }, + { + "epoch": 20.330731014904188, + "grad_norm": 1.8651872873306274, + "learning_rate": 7.96779276082328e-05, + "loss": 0.008612476289272308, + "step": 143230 + }, + { + "epoch": 20.332150461320087, + "grad_norm": 0.15141363441944122, + "learning_rate": 7.96765081618169e-05, + "loss": 0.06432226300239563, + "step": 143240 + }, + { + "epoch": 20.33356990773598, + "grad_norm": 13.178853034973145, + "learning_rate": 7.9675088715401e-05, + "loss": 0.03949933350086212, + "step": 143250 + }, + { + "epoch": 20.33498935415188, + "grad_norm": 4.991683483123779, + "learning_rate": 7.96736692689851e-05, + "loss": 0.011390550434589386, + "step": 143260 + }, + { + "epoch": 20.336408800567778, + "grad_norm": 0.4497523605823517, + "learning_rate": 7.96722498225692e-05, + "loss": 0.022866718471050262, + "step": 143270 + }, + { + "epoch": 20.337828246983676, + "grad_norm": 0.2339865118265152, + "learning_rate": 7.967083037615331e-05, + "loss": 0.0036530278623104094, + "step": 143280 + }, + { + "epoch": 20.339247693399575, + "grad_norm": 0.6333643794059753, + "learning_rate": 7.966941092973741e-05, + "loss": 0.011677588522434234, + "step": 143290 + }, + { + "epoch": 20.340667139815473, + "grad_norm": 0.5095767378807068, + "learning_rate": 7.966799148332152e-05, + "loss": 0.04856514036655426, + "step": 143300 + }, + { + "epoch": 20.34208658623137, + "grad_norm": 2.0360395908355713, + "learning_rate": 7.96665720369056e-05, + "loss": 0.01791858673095703, + "step": 143310 + }, + { + "epoch": 20.343506032647266, + "grad_norm": 0.019238732755184174, + "learning_rate": 7.966515259048971e-05, + "loss": 0.019448147714138032, + "step": 143320 + }, + { + 
"epoch": 20.344925479063164, + "grad_norm": 1.314644694328308, + "learning_rate": 7.966373314407381e-05, + "loss": 0.013599833846092224, + "step": 143330 + }, + { + "epoch": 20.346344925479062, + "grad_norm": 0.051649417728185654, + "learning_rate": 7.966231369765792e-05, + "loss": 0.013281884789466857, + "step": 143340 + }, + { + "epoch": 20.34776437189496, + "grad_norm": 0.6772908568382263, + "learning_rate": 7.966089425124202e-05, + "loss": 0.011097601056098938, + "step": 143350 + }, + { + "epoch": 20.34918381831086, + "grad_norm": 1.8880468606948853, + "learning_rate": 7.965947480482612e-05, + "loss": 0.028100913763046263, + "step": 143360 + }, + { + "epoch": 20.350603264726757, + "grad_norm": 10.578641891479492, + "learning_rate": 7.965805535841023e-05, + "loss": 0.007112830132246018, + "step": 143370 + }, + { + "epoch": 20.352022711142656, + "grad_norm": 0.7287304401397705, + "learning_rate": 7.965663591199432e-05, + "loss": 0.010783711075782776, + "step": 143380 + }, + { + "epoch": 20.35344215755855, + "grad_norm": 6.590147018432617, + "learning_rate": 7.965521646557844e-05, + "loss": 0.010001247376203537, + "step": 143390 + }, + { + "epoch": 20.35486160397445, + "grad_norm": 6.245509624481201, + "learning_rate": 7.965379701916253e-05, + "loss": 0.03738590776920318, + "step": 143400 + }, + { + "epoch": 20.356281050390347, + "grad_norm": 0.07190784811973572, + "learning_rate": 7.965237757274663e-05, + "loss": 0.025264889001846313, + "step": 143410 + }, + { + "epoch": 20.357700496806245, + "grad_norm": 0.01021641492843628, + "learning_rate": 7.965095812633073e-05, + "loss": 0.012144586443901062, + "step": 143420 + }, + { + "epoch": 20.359119943222144, + "grad_norm": 0.2613297700881958, + "learning_rate": 7.964953867991484e-05, + "loss": 0.010810536146163941, + "step": 143430 + }, + { + "epoch": 20.360539389638042, + "grad_norm": 0.21913333237171173, + "learning_rate": 7.964811923349894e-05, + "loss": 0.005209186300635338, + "step": 143440 + }, + { + "epoch": 
20.36195883605394, + "grad_norm": 0.002570623531937599, + "learning_rate": 7.964669978708305e-05, + "loss": 0.027671465277671815, + "step": 143450 + }, + { + "epoch": 20.363378282469835, + "grad_norm": 0.007602290716022253, + "learning_rate": 7.964528034066714e-05, + "loss": 0.016392304003238677, + "step": 143460 + }, + { + "epoch": 20.364797728885733, + "grad_norm": 1.9662175178527832, + "learning_rate": 7.964386089425124e-05, + "loss": 0.019445177912712098, + "step": 143470 + }, + { + "epoch": 20.36621717530163, + "grad_norm": 1.7708325386047363, + "learning_rate": 7.964244144783535e-05, + "loss": 0.06833457350730895, + "step": 143480 + }, + { + "epoch": 20.36763662171753, + "grad_norm": 0.0037574139423668385, + "learning_rate": 7.964102200141945e-05, + "loss": 0.002533341571688652, + "step": 143490 + }, + { + "epoch": 20.36905606813343, + "grad_norm": 3.2528820037841797, + "learning_rate": 7.963960255500356e-05, + "loss": 0.008031685650348664, + "step": 143500 + }, + { + "epoch": 20.36905606813343, + "eval_accuracy": 0.9901443377630826, + "eval_loss": 0.03529705852270126, + "eval_runtime": 33.265, + "eval_samples_per_second": 472.779, + "eval_steps_per_second": 14.79, + "step": 143500 + }, + { + "epoch": 20.370475514549327, + "grad_norm": 1.3986564874649048, + "learning_rate": 7.963818310858764e-05, + "loss": 0.005757429823279381, + "step": 143510 + }, + { + "epoch": 20.371894960965225, + "grad_norm": 0.37375408411026, + "learning_rate": 7.963676366217176e-05, + "loss": 0.004768835753202439, + "step": 143520 + }, + { + "epoch": 20.37331440738112, + "grad_norm": 0.9124902486801147, + "learning_rate": 7.963534421575585e-05, + "loss": 0.008659066259860992, + "step": 143530 + }, + { + "epoch": 20.374733853797018, + "grad_norm": 0.4636019766330719, + "learning_rate": 7.963392476933996e-05, + "loss": 0.03807288706302643, + "step": 143540 + }, + { + "epoch": 20.376153300212916, + "grad_norm": 2.9868409633636475, + "learning_rate": 7.963250532292406e-05, + "loss": 
0.004987531527876854, + "step": 143550 + }, + { + "epoch": 20.377572746628815, + "grad_norm": 0.053384773433208466, + "learning_rate": 7.963108587650816e-05, + "loss": 0.00445505827665329, + "step": 143560 + }, + { + "epoch": 20.378992193044713, + "grad_norm": 3.059741258621216, + "learning_rate": 7.962966643009227e-05, + "loss": 0.013981804251670837, + "step": 143570 + }, + { + "epoch": 20.38041163946061, + "grad_norm": 0.02386626973748207, + "learning_rate": 7.962824698367637e-05, + "loss": 0.0010259795933961868, + "step": 143580 + }, + { + "epoch": 20.38183108587651, + "grad_norm": 0.6353252530097961, + "learning_rate": 7.962682753726048e-05, + "loss": 0.00803077220916748, + "step": 143590 + }, + { + "epoch": 20.383250532292404, + "grad_norm": 0.2397974729537964, + "learning_rate": 7.962540809084458e-05, + "loss": 0.013181793689727783, + "step": 143600 + }, + { + "epoch": 20.384669978708303, + "grad_norm": 0.4873849153518677, + "learning_rate": 7.962398864442869e-05, + "loss": 0.0353862851858139, + "step": 143610 + }, + { + "epoch": 20.3860894251242, + "grad_norm": 0.3819289207458496, + "learning_rate": 7.962256919801277e-05, + "loss": 0.013367743790149688, + "step": 143620 + }, + { + "epoch": 20.3875088715401, + "grad_norm": 0.5420555472373962, + "learning_rate": 7.962114975159688e-05, + "loss": 0.010918934643268586, + "step": 143630 + }, + { + "epoch": 20.388928317955997, + "grad_norm": 0.8265628814697266, + "learning_rate": 7.961973030518098e-05, + "loss": 0.01730220913887024, + "step": 143640 + }, + { + "epoch": 20.390347764371896, + "grad_norm": 0.17687322199344635, + "learning_rate": 7.961831085876509e-05, + "loss": 0.03857523202896118, + "step": 143650 + }, + { + "epoch": 20.391767210787794, + "grad_norm": 0.38359999656677246, + "learning_rate": 7.961689141234919e-05, + "loss": 0.013185246288776398, + "step": 143660 + }, + { + "epoch": 20.39318665720369, + "grad_norm": 0.7572456002235413, + "learning_rate": 7.961547196593328e-05, + "loss": 
0.021738988161087037, + "step": 143670 + }, + { + "epoch": 20.394606103619587, + "grad_norm": 0.933469831943512, + "learning_rate": 7.96140525195174e-05, + "loss": 0.016792310774326323, + "step": 143680 + }, + { + "epoch": 20.396025550035485, + "grad_norm": 0.48743927478790283, + "learning_rate": 7.961263307310149e-05, + "loss": 0.015995195508003233, + "step": 143690 + }, + { + "epoch": 20.397444996451384, + "grad_norm": 1.1666967868804932, + "learning_rate": 7.96112136266856e-05, + "loss": 0.006618998944759369, + "step": 143700 + }, + { + "epoch": 20.398864442867282, + "grad_norm": 0.054619308561086655, + "learning_rate": 7.96097941802697e-05, + "loss": 0.07580899000167847, + "step": 143710 + }, + { + "epoch": 20.40028388928318, + "grad_norm": 0.8266465067863464, + "learning_rate": 7.96083747338538e-05, + "loss": 0.005849643424153328, + "step": 143720 + }, + { + "epoch": 20.40170333569908, + "grad_norm": 11.851669311523438, + "learning_rate": 7.96069552874379e-05, + "loss": 0.010745181143283844, + "step": 143730 + }, + { + "epoch": 20.403122782114973, + "grad_norm": 0.21050934493541718, + "learning_rate": 7.9605535841022e-05, + "loss": 0.01756216138601303, + "step": 143740 + }, + { + "epoch": 20.40454222853087, + "grad_norm": 7.4519500732421875, + "learning_rate": 7.96041163946061e-05, + "loss": 0.048693782091140746, + "step": 143750 + }, + { + "epoch": 20.40596167494677, + "grad_norm": 0.02900480106472969, + "learning_rate": 7.960269694819021e-05, + "loss": 0.025216007232666017, + "step": 143760 + }, + { + "epoch": 20.40738112136267, + "grad_norm": 0.048225436359643936, + "learning_rate": 7.960127750177431e-05, + "loss": 0.024932587146759035, + "step": 143770 + }, + { + "epoch": 20.408800567778567, + "grad_norm": 4.381402969360352, + "learning_rate": 7.959985805535841e-05, + "loss": 0.019103142619132995, + "step": 143780 + }, + { + "epoch": 20.410220014194465, + "grad_norm": 6.140013694763184, + "learning_rate": 7.959843860894252e-05, + "loss": 
0.010969682037830353, + "step": 143790 + }, + { + "epoch": 20.411639460610363, + "grad_norm": 0.9153760075569153, + "learning_rate": 7.959701916252662e-05, + "loss": 0.032856902480125426, + "step": 143800 + }, + { + "epoch": 20.413058907026258, + "grad_norm": 13.96950626373291, + "learning_rate": 7.959559971611073e-05, + "loss": 0.05687206983566284, + "step": 143810 + }, + { + "epoch": 20.414478353442156, + "grad_norm": 0.008463529869914055, + "learning_rate": 7.959418026969481e-05, + "loss": 0.015109725296497345, + "step": 143820 + }, + { + "epoch": 20.415897799858055, + "grad_norm": 0.047428861260414124, + "learning_rate": 7.959276082327892e-05, + "loss": 0.020231030881404877, + "step": 143830 + }, + { + "epoch": 20.417317246273953, + "grad_norm": 0.282391756772995, + "learning_rate": 7.959134137686302e-05, + "loss": 0.02680096626281738, + "step": 143840 + }, + { + "epoch": 20.41873669268985, + "grad_norm": 11.744375228881836, + "learning_rate": 7.958992193044713e-05, + "loss": 0.006563518196344376, + "step": 143850 + }, + { + "epoch": 20.42015613910575, + "grad_norm": 4.867786884307861, + "learning_rate": 7.958850248403123e-05, + "loss": 0.05495727062225342, + "step": 143860 + }, + { + "epoch": 20.421575585521648, + "grad_norm": 0.4899853467941284, + "learning_rate": 7.958708303761533e-05, + "loss": 0.008291900902986527, + "step": 143870 + }, + { + "epoch": 20.422995031937543, + "grad_norm": 2.9325203895568848, + "learning_rate": 7.958566359119944e-05, + "loss": 0.01769079566001892, + "step": 143880 + }, + { + "epoch": 20.42441447835344, + "grad_norm": 2.7301363945007324, + "learning_rate": 7.958424414478353e-05, + "loss": 0.01559491604566574, + "step": 143890 + }, + { + "epoch": 20.42583392476934, + "grad_norm": 0.005212320946156979, + "learning_rate": 7.958282469836765e-05, + "loss": 0.00540030300617218, + "step": 143900 + }, + { + "epoch": 20.427253371185238, + "grad_norm": 0.1059456467628479, + "learning_rate": 7.958140525195174e-05, + "loss": 
0.030397701263427734, + "step": 143910 + }, + { + "epoch": 20.428672817601136, + "grad_norm": 1.7047607898712158, + "learning_rate": 7.957998580553584e-05, + "loss": 0.028485524654388427, + "step": 143920 + }, + { + "epoch": 20.430092264017034, + "grad_norm": 0.0941111221909523, + "learning_rate": 7.957856635911994e-05, + "loss": 0.018759424984455108, + "step": 143930 + }, + { + "epoch": 20.431511710432932, + "grad_norm": 1.2995245456695557, + "learning_rate": 7.957714691270405e-05, + "loss": 0.035907435417175296, + "step": 143940 + }, + { + "epoch": 20.432931156848827, + "grad_norm": 0.02575070969760418, + "learning_rate": 7.957572746628815e-05, + "loss": 0.029672053456306458, + "step": 143950 + }, + { + "epoch": 20.434350603264726, + "grad_norm": 0.2127975970506668, + "learning_rate": 7.957430801987226e-05, + "loss": 0.012106426805257798, + "step": 143960 + }, + { + "epoch": 20.435770049680624, + "grad_norm": 3.6614949703216553, + "learning_rate": 7.957288857345637e-05, + "loss": 0.0195004865527153, + "step": 143970 + }, + { + "epoch": 20.437189496096522, + "grad_norm": 1.0612987279891968, + "learning_rate": 7.957146912704045e-05, + "loss": 0.030395376682281493, + "step": 143980 + }, + { + "epoch": 20.43860894251242, + "grad_norm": 0.5859479904174805, + "learning_rate": 7.957004968062456e-05, + "loss": 0.039389362931251524, + "step": 143990 + }, + { + "epoch": 20.44002838892832, + "grad_norm": 2.5606186389923096, + "learning_rate": 7.956863023420866e-05, + "loss": 0.0025921862572431563, + "step": 144000 + }, + { + "epoch": 20.44002838892832, + "eval_accuracy": 0.9828320722324665, + "eval_loss": 0.061342090368270874, + "eval_runtime": 34.3117, + "eval_samples_per_second": 458.357, + "eval_steps_per_second": 14.339, + "step": 144000 + }, + { + "epoch": 20.441447835344217, + "grad_norm": 15.801836967468262, + "learning_rate": 7.956721078779277e-05, + "loss": 0.030484694242477416, + "step": 144010 + }, + { + "epoch": 20.442867281760112, + "grad_norm": 
13.656440734863281, + "learning_rate": 7.956579134137687e-05, + "loss": 0.017888091504573822, + "step": 144020 + }, + { + "epoch": 20.44428672817601, + "grad_norm": 0.3600428104400635, + "learning_rate": 7.956437189496097e-05, + "loss": 0.025508299469947815, + "step": 144030 + }, + { + "epoch": 20.44570617459191, + "grad_norm": 0.37523525953292847, + "learning_rate": 7.956295244854506e-05, + "loss": 0.018080103397369384, + "step": 144040 + }, + { + "epoch": 20.447125621007807, + "grad_norm": 1.7047333717346191, + "learning_rate": 7.956153300212917e-05, + "loss": 0.026506760716438295, + "step": 144050 + }, + { + "epoch": 20.448545067423705, + "grad_norm": 1.0215219259262085, + "learning_rate": 7.956011355571328e-05, + "loss": 0.02427029013633728, + "step": 144060 + }, + { + "epoch": 20.449964513839603, + "grad_norm": 0.22409643232822418, + "learning_rate": 7.955869410929738e-05, + "loss": 0.057889151573181155, + "step": 144070 + }, + { + "epoch": 20.4513839602555, + "grad_norm": 0.19263197481632233, + "learning_rate": 7.955727466288148e-05, + "loss": 0.02869861125946045, + "step": 144080 + }, + { + "epoch": 20.4528034066714, + "grad_norm": 0.5966265201568604, + "learning_rate": 7.955585521646558e-05, + "loss": 0.0024799294769763947, + "step": 144090 + }, + { + "epoch": 20.454222853087295, + "grad_norm": 0.09790827333927155, + "learning_rate": 7.955443577004969e-05, + "loss": 0.0106086365878582, + "step": 144100 + }, + { + "epoch": 20.455642299503193, + "grad_norm": 5.698458194732666, + "learning_rate": 7.955301632363379e-05, + "loss": 0.006442001461982727, + "step": 144110 + }, + { + "epoch": 20.45706174591909, + "grad_norm": 0.15753024816513062, + "learning_rate": 7.95515968772179e-05, + "loss": 0.013531532883644105, + "step": 144120 + }, + { + "epoch": 20.45848119233499, + "grad_norm": 0.19208881258964539, + "learning_rate": 7.955017743080198e-05, + "loss": 0.010571710765361786, + "step": 144130 + }, + { + "epoch": 20.459900638750888, + "grad_norm": 
0.400680273771286, + "learning_rate": 7.954875798438609e-05, + "loss": 0.00333230085670948, + "step": 144140 + }, + { + "epoch": 20.461320085166786, + "grad_norm": 0.12694121897220612, + "learning_rate": 7.95473385379702e-05, + "loss": 0.003826696053147316, + "step": 144150 + }, + { + "epoch": 20.462739531582685, + "grad_norm": 0.19814366102218628, + "learning_rate": 7.95459190915543e-05, + "loss": 0.05421693325042724, + "step": 144160 + }, + { + "epoch": 20.46415897799858, + "grad_norm": 1.1977726221084595, + "learning_rate": 7.954449964513841e-05, + "loss": 0.03128778040409088, + "step": 144170 + }, + { + "epoch": 20.465578424414478, + "grad_norm": 0.010190371423959732, + "learning_rate": 7.95430801987225e-05, + "loss": 0.005667523294687271, + "step": 144180 + }, + { + "epoch": 20.466997870830376, + "grad_norm": 0.5949884653091431, + "learning_rate": 7.95416607523066e-05, + "loss": 0.02206910401582718, + "step": 144190 + }, + { + "epoch": 20.468417317246274, + "grad_norm": 0.3613375127315521, + "learning_rate": 7.95402413058907e-05, + "loss": 0.002604234963655472, + "step": 144200 + }, + { + "epoch": 20.469836763662173, + "grad_norm": 0.03862883523106575, + "learning_rate": 7.953882185947481e-05, + "loss": 0.00332438163459301, + "step": 144210 + }, + { + "epoch": 20.47125621007807, + "grad_norm": 0.4510781764984131, + "learning_rate": 7.953740241305891e-05, + "loss": 0.04834697842597961, + "step": 144220 + }, + { + "epoch": 20.47267565649397, + "grad_norm": 8.12693977355957, + "learning_rate": 7.953598296664301e-05, + "loss": 0.021429724991321564, + "step": 144230 + }, + { + "epoch": 20.474095102909864, + "grad_norm": 0.24772100150585175, + "learning_rate": 7.953456352022712e-05, + "loss": 0.023217305541038513, + "step": 144240 + }, + { + "epoch": 20.475514549325762, + "grad_norm": 0.0036164431367069483, + "learning_rate": 7.953314407381122e-05, + "loss": 0.015688189864158632, + "step": 144250 + }, + { + "epoch": 20.47693399574166, + "grad_norm": 
5.6642279624938965, + "learning_rate": 7.953172462739533e-05, + "loss": 0.022077365219593047, + "step": 144260 + }, + { + "epoch": 20.47835344215756, + "grad_norm": 0.3166951835155487, + "learning_rate": 7.953030518097942e-05, + "loss": 0.010726609826087951, + "step": 144270 + }, + { + "epoch": 20.479772888573457, + "grad_norm": 0.05075109004974365, + "learning_rate": 7.952888573456352e-05, + "loss": 0.004327214881777763, + "step": 144280 + }, + { + "epoch": 20.481192334989355, + "grad_norm": 0.052158039063215256, + "learning_rate": 7.952746628814762e-05, + "loss": 0.0032896395772695542, + "step": 144290 + }, + { + "epoch": 20.482611781405254, + "grad_norm": 0.011469243094325066, + "learning_rate": 7.952604684173173e-05, + "loss": 0.002999286726117134, + "step": 144300 + }, + { + "epoch": 20.48403122782115, + "grad_norm": 1.171034812927246, + "learning_rate": 7.952462739531583e-05, + "loss": 0.02299569547176361, + "step": 144310 + }, + { + "epoch": 20.485450674237047, + "grad_norm": 14.642333984375, + "learning_rate": 7.952320794889994e-05, + "loss": 0.02856981158256531, + "step": 144320 + }, + { + "epoch": 20.486870120652945, + "grad_norm": 0.003455414902418852, + "learning_rate": 7.952178850248404e-05, + "loss": 0.006457825750112533, + "step": 144330 + }, + { + "epoch": 20.488289567068843, + "grad_norm": 1.4039028882980347, + "learning_rate": 7.952036905606813e-05, + "loss": 0.013668262958526611, + "step": 144340 + }, + { + "epoch": 20.48970901348474, + "grad_norm": 1.8155009746551514, + "learning_rate": 7.951894960965224e-05, + "loss": 0.021648672223091126, + "step": 144350 + }, + { + "epoch": 20.49112845990064, + "grad_norm": 0.5147523283958435, + "learning_rate": 7.951753016323634e-05, + "loss": 0.048282742500305176, + "step": 144360 + }, + { + "epoch": 20.49254790631654, + "grad_norm": 0.06422603875398636, + "learning_rate": 7.951611071682045e-05, + "loss": 0.0357273668050766, + "step": 144370 + }, + { + "epoch": 20.493967352732433, + "grad_norm": 
6.547656536102295, + "learning_rate": 7.951469127040455e-05, + "loss": 0.04553760290145874, + "step": 144380 + }, + { + "epoch": 20.49538679914833, + "grad_norm": 19.1754093170166, + "learning_rate": 7.951327182398865e-05, + "loss": 0.028497081995010377, + "step": 144390 + }, + { + "epoch": 20.49680624556423, + "grad_norm": 1.616147756576538, + "learning_rate": 7.951185237757274e-05, + "loss": 0.02675079107284546, + "step": 144400 + }, + { + "epoch": 20.498225691980128, + "grad_norm": 1.0593620538711548, + "learning_rate": 7.951043293115686e-05, + "loss": 0.017714273929595948, + "step": 144410 + }, + { + "epoch": 20.499645138396026, + "grad_norm": 2.143415689468384, + "learning_rate": 7.950901348474095e-05, + "loss": 0.005340157449245453, + "step": 144420 + }, + { + "epoch": 20.501064584811925, + "grad_norm": 0.31105735898017883, + "learning_rate": 7.950759403832506e-05, + "loss": 0.01819224953651428, + "step": 144430 + }, + { + "epoch": 20.502484031227823, + "grad_norm": 15.8975830078125, + "learning_rate": 7.950617459190916e-05, + "loss": 0.02267924100160599, + "step": 144440 + }, + { + "epoch": 20.503903477643718, + "grad_norm": 0.6668938398361206, + "learning_rate": 7.950475514549326e-05, + "loss": 0.024639742076396944, + "step": 144450 + }, + { + "epoch": 20.505322924059616, + "grad_norm": 0.33851656317710876, + "learning_rate": 7.950333569907737e-05, + "loss": 0.007134123146533966, + "step": 144460 + }, + { + "epoch": 20.506742370475514, + "grad_norm": 0.5393432378768921, + "learning_rate": 7.950191625266147e-05, + "loss": 0.002460606023669243, + "step": 144470 + }, + { + "epoch": 20.508161816891413, + "grad_norm": 0.0949764996767044, + "learning_rate": 7.950049680624558e-05, + "loss": 0.023367878794670106, + "step": 144480 + }, + { + "epoch": 20.50958126330731, + "grad_norm": 0.16646301746368408, + "learning_rate": 7.949907735982966e-05, + "loss": 0.013226522505283356, + "step": 144490 + }, + { + "epoch": 20.51100070972321, + "grad_norm": 7.567387104034424, 
+ "learning_rate": 7.949765791341377e-05, + "loss": 0.03316475450992584, + "step": 144500 + }, + { + "epoch": 20.51100070972321, + "eval_accuracy": 0.9884275449863292, + "eval_loss": 0.04046180099248886, + "eval_runtime": 34.0112, + "eval_samples_per_second": 462.406, + "eval_steps_per_second": 14.466, + "step": 144500 + }, + { + "epoch": 20.512420156139108, + "grad_norm": 11.487260818481445, + "learning_rate": 7.949623846699787e-05, + "loss": 0.009593375027179718, + "step": 144510 + }, + { + "epoch": 20.513839602555002, + "grad_norm": 0.2956373393535614, + "learning_rate": 7.949481902058198e-05, + "loss": 0.013197061419487, + "step": 144520 + }, + { + "epoch": 20.5152590489709, + "grad_norm": 0.6947842836380005, + "learning_rate": 7.949339957416608e-05, + "loss": 0.00269196555018425, + "step": 144530 + }, + { + "epoch": 20.5166784953868, + "grad_norm": 0.025174129754304886, + "learning_rate": 7.949198012775018e-05, + "loss": 0.0012585099786520005, + "step": 144540 + }, + { + "epoch": 20.518097941802697, + "grad_norm": 0.04389370232820511, + "learning_rate": 7.949056068133429e-05, + "loss": 0.0021505054086446763, + "step": 144550 + }, + { + "epoch": 20.519517388218595, + "grad_norm": 17.116100311279297, + "learning_rate": 7.948914123491838e-05, + "loss": 0.017475605010986328, + "step": 144560 + }, + { + "epoch": 20.520936834634494, + "grad_norm": 0.06114675849676132, + "learning_rate": 7.94877217885025e-05, + "loss": 0.021759213507175447, + "step": 144570 + }, + { + "epoch": 20.522356281050392, + "grad_norm": 0.26840707659721375, + "learning_rate": 7.948630234208659e-05, + "loss": 0.012413251399993896, + "step": 144580 + }, + { + "epoch": 20.523775727466287, + "grad_norm": 0.026500867679715157, + "learning_rate": 7.948488289567069e-05, + "loss": 0.028197860717773436, + "step": 144590 + }, + { + "epoch": 20.525195173882185, + "grad_norm": 0.005009442567825317, + "learning_rate": 7.948346344925479e-05, + "loss": 0.011039438843727111, + "step": 144600 + }, + { + 
"epoch": 20.526614620298083, + "grad_norm": 13.434609413146973, + "learning_rate": 7.94820440028389e-05, + "loss": 0.011699755489826203, + "step": 144610 + }, + { + "epoch": 20.528034066713982, + "grad_norm": 0.3532591164112091, + "learning_rate": 7.9480624556423e-05, + "loss": 0.010837162286043167, + "step": 144620 + }, + { + "epoch": 20.52945351312988, + "grad_norm": 0.33298391103744507, + "learning_rate": 7.94792051100071e-05, + "loss": 0.015803493559360504, + "step": 144630 + }, + { + "epoch": 20.53087295954578, + "grad_norm": 0.3064946234226227, + "learning_rate": 7.94777856635912e-05, + "loss": 0.008358365297317505, + "step": 144640 + }, + { + "epoch": 20.532292405961677, + "grad_norm": 0.365702748298645, + "learning_rate": 7.94763662171753e-05, + "loss": 0.0238431379199028, + "step": 144650 + }, + { + "epoch": 20.53371185237757, + "grad_norm": 0.05736573413014412, + "learning_rate": 7.947494677075941e-05, + "loss": 0.001297728344798088, + "step": 144660 + }, + { + "epoch": 20.53513129879347, + "grad_norm": 3.2892520427703857, + "learning_rate": 7.947352732434351e-05, + "loss": 0.009251207113265991, + "step": 144670 + }, + { + "epoch": 20.536550745209368, + "grad_norm": 0.2739517092704773, + "learning_rate": 7.947210787792762e-05, + "loss": 0.011083140969276428, + "step": 144680 + }, + { + "epoch": 20.537970191625266, + "grad_norm": 0.0026503384578973055, + "learning_rate": 7.947068843151172e-05, + "loss": 0.011821965128183365, + "step": 144690 + }, + { + "epoch": 20.539389638041165, + "grad_norm": 11.42822551727295, + "learning_rate": 7.946926898509582e-05, + "loss": 0.05259329080581665, + "step": 144700 + }, + { + "epoch": 20.540809084457063, + "grad_norm": 0.06336436420679092, + "learning_rate": 7.946784953867991e-05, + "loss": 0.0036733098328113555, + "step": 144710 + }, + { + "epoch": 20.54222853087296, + "grad_norm": 1.5711698532104492, + "learning_rate": 7.946643009226402e-05, + "loss": 0.020183426141738892, + "step": 144720 + }, + { + "epoch": 
20.543647977288856, + "grad_norm": 0.12674912810325623, + "learning_rate": 7.946501064584812e-05, + "loss": 0.022117115557193756, + "step": 144730 + }, + { + "epoch": 20.545067423704754, + "grad_norm": 7.3648152351379395, + "learning_rate": 7.946359119943223e-05, + "loss": 0.05172304511070251, + "step": 144740 + }, + { + "epoch": 20.546486870120653, + "grad_norm": 0.5710421800613403, + "learning_rate": 7.946217175301633e-05, + "loss": 0.004803726077079773, + "step": 144750 + }, + { + "epoch": 20.54790631653655, + "grad_norm": 0.3407939374446869, + "learning_rate": 7.946089425124203e-05, + "loss": 0.04576562941074371, + "step": 144760 + }, + { + "epoch": 20.54932576295245, + "grad_norm": 0.05488349124789238, + "learning_rate": 7.945947480482611e-05, + "loss": 0.0018111549317836762, + "step": 144770 + }, + { + "epoch": 20.550745209368348, + "grad_norm": 10.437118530273438, + "learning_rate": 7.945805535841022e-05, + "loss": 0.04636000096797943, + "step": 144780 + }, + { + "epoch": 20.552164655784246, + "grad_norm": 0.01174534484744072, + "learning_rate": 7.945663591199432e-05, + "loss": 0.012236421555280685, + "step": 144790 + }, + { + "epoch": 20.55358410220014, + "grad_norm": 0.017734697088599205, + "learning_rate": 7.945521646557843e-05, + "loss": 0.006476728618144989, + "step": 144800 + }, + { + "epoch": 20.55500354861604, + "grad_norm": 12.977668762207031, + "learning_rate": 7.945379701916253e-05, + "loss": 0.01667654812335968, + "step": 144810 + }, + { + "epoch": 20.556422995031937, + "grad_norm": 24.415279388427734, + "learning_rate": 7.945237757274663e-05, + "loss": 0.04182113707065582, + "step": 144820 + }, + { + "epoch": 20.557842441447836, + "grad_norm": 0.016579121351242065, + "learning_rate": 7.945095812633074e-05, + "loss": 0.028746408224105836, + "step": 144830 + }, + { + "epoch": 20.559261887863734, + "grad_norm": 0.32016265392303467, + "learning_rate": 7.944953867991483e-05, + "loss": 0.04389718174934387, + "step": 144840 + }, + { + "epoch": 
20.560681334279632, + "grad_norm": 2.2971858978271484, + "learning_rate": 7.944811923349894e-05, + "loss": 0.03008989095687866, + "step": 144850 + }, + { + "epoch": 20.56210078069553, + "grad_norm": 15.585360527038574, + "learning_rate": 7.944669978708304e-05, + "loss": 0.018069356679916382, + "step": 144860 + }, + { + "epoch": 20.563520227111425, + "grad_norm": 6.462601184844971, + "learning_rate": 7.944528034066714e-05, + "loss": 0.014132696390151977, + "step": 144870 + }, + { + "epoch": 20.564939673527324, + "grad_norm": 5.803984642028809, + "learning_rate": 7.944386089425124e-05, + "loss": 0.03112670183181763, + "step": 144880 + }, + { + "epoch": 20.566359119943222, + "grad_norm": 0.7162980437278748, + "learning_rate": 7.944244144783535e-05, + "loss": 0.012357431650161742, + "step": 144890 + }, + { + "epoch": 20.56777856635912, + "grad_norm": 6.502598285675049, + "learning_rate": 7.944102200141944e-05, + "loss": 0.02088506519794464, + "step": 144900 + }, + { + "epoch": 20.56919801277502, + "grad_norm": 2.168832540512085, + "learning_rate": 7.943960255500356e-05, + "loss": 0.006182302162051201, + "step": 144910 + }, + { + "epoch": 20.570617459190917, + "grad_norm": 0.8160980939865112, + "learning_rate": 7.943818310858765e-05, + "loss": 0.03159986436367035, + "step": 144920 + }, + { + "epoch": 20.572036905606815, + "grad_norm": 7.209481716156006, + "learning_rate": 7.943676366217175e-05, + "loss": 0.023209857940673827, + "step": 144930 + }, + { + "epoch": 20.57345635202271, + "grad_norm": 19.98500633239746, + "learning_rate": 7.943534421575586e-05, + "loss": 0.03503158986568451, + "step": 144940 + }, + { + "epoch": 20.574875798438608, + "grad_norm": 1.5545172691345215, + "learning_rate": 7.943392476933996e-05, + "loss": 0.005894295498728752, + "step": 144950 + }, + { + "epoch": 20.576295244854506, + "grad_norm": 0.007062634453177452, + "learning_rate": 7.943250532292407e-05, + "loss": 0.048457229137420656, + "step": 144960 + }, + { + "epoch": 20.577714691270405, 
+ "grad_norm": 0.0761469379067421, + "learning_rate": 7.943108587650817e-05, + "loss": 0.01926290839910507, + "step": 144970 + }, + { + "epoch": 20.579134137686303, + "grad_norm": 0.3928318917751312, + "learning_rate": 7.942966643009226e-05, + "loss": 0.002387086674571037, + "step": 144980 + }, + { + "epoch": 20.5805535841022, + "grad_norm": 1.768370270729065, + "learning_rate": 7.942824698367636e-05, + "loss": 0.03047075867652893, + "step": 144990 + }, + { + "epoch": 20.5819730305181, + "grad_norm": 0.02380281686782837, + "learning_rate": 7.942682753726047e-05, + "loss": 0.007473225891590119, + "step": 145000 + }, + { + "epoch": 20.5819730305181, + "eval_accuracy": 0.9804158453614803, + "eval_loss": 0.07275503128767014, + "eval_runtime": 43.1489, + "eval_samples_per_second": 364.482, + "eval_steps_per_second": 11.402, + "step": 145000 + }, + { + "epoch": 20.583392476933994, + "grad_norm": 0.06391783058643341, + "learning_rate": 7.942540809084458e-05, + "loss": 0.023229347169399263, + "step": 145010 + }, + { + "epoch": 20.584811923349893, + "grad_norm": 9.119282722473145, + "learning_rate": 7.942398864442868e-05, + "loss": 0.013885484635829925, + "step": 145020 + }, + { + "epoch": 20.58623136976579, + "grad_norm": 15.528413772583008, + "learning_rate": 7.942256919801278e-05, + "loss": 0.052690714597702026, + "step": 145030 + }, + { + "epoch": 20.58765081618169, + "grad_norm": 0.10440409183502197, + "learning_rate": 7.942114975159688e-05, + "loss": 0.034368190169334414, + "step": 145040 + }, + { + "epoch": 20.589070262597588, + "grad_norm": 3.9504480361938477, + "learning_rate": 7.941973030518099e-05, + "loss": 0.03920840620994568, + "step": 145050 + }, + { + "epoch": 20.590489709013486, + "grad_norm": 0.19848771393299103, + "learning_rate": 7.941831085876508e-05, + "loss": 0.00808703526854515, + "step": 145060 + }, + { + "epoch": 20.591909155429384, + "grad_norm": 0.538018524646759, + "learning_rate": 7.94168914123492e-05, + "loss": 0.0028334088623523713, + "step": 
145070 + }, + { + "epoch": 20.59332860184528, + "grad_norm": 9.399895668029785, + "learning_rate": 7.941547196593328e-05, + "loss": 0.026027819514274596, + "step": 145080 + }, + { + "epoch": 20.594748048261177, + "grad_norm": 13.63102912902832, + "learning_rate": 7.941405251951739e-05, + "loss": 0.02653225064277649, + "step": 145090 + }, + { + "epoch": 20.596167494677076, + "grad_norm": 0.030577469617128372, + "learning_rate": 7.94126330731015e-05, + "loss": 0.0009738571941852569, + "step": 145100 + }, + { + "epoch": 20.597586941092974, + "grad_norm": 0.3383432626724243, + "learning_rate": 7.94112136266856e-05, + "loss": 0.006517581641674042, + "step": 145110 + }, + { + "epoch": 20.599006387508872, + "grad_norm": 0.045411527156829834, + "learning_rate": 7.940979418026971e-05, + "loss": 0.02748202681541443, + "step": 145120 + }, + { + "epoch": 20.60042583392477, + "grad_norm": 0.057020217180252075, + "learning_rate": 7.940837473385379e-05, + "loss": 0.03388555943965912, + "step": 145130 + }, + { + "epoch": 20.60184528034067, + "grad_norm": 0.04674219712615013, + "learning_rate": 7.94069552874379e-05, + "loss": 0.01053178608417511, + "step": 145140 + }, + { + "epoch": 20.603264726756564, + "grad_norm": 0.1999913901090622, + "learning_rate": 7.9405535841022e-05, + "loss": 0.06322197914123535, + "step": 145150 + }, + { + "epoch": 20.604684173172462, + "grad_norm": 0.051310621201992035, + "learning_rate": 7.940411639460611e-05, + "loss": 0.013078901171684264, + "step": 145160 + }, + { + "epoch": 20.60610361958836, + "grad_norm": 0.057633381336927414, + "learning_rate": 7.940269694819021e-05, + "loss": 0.046252280473709106, + "step": 145170 + }, + { + "epoch": 20.60752306600426, + "grad_norm": 1.1152459383010864, + "learning_rate": 7.940127750177431e-05, + "loss": 0.013692560791969299, + "step": 145180 + }, + { + "epoch": 20.608942512420157, + "grad_norm": 0.6031478643417358, + "learning_rate": 7.939985805535842e-05, + "loss": 0.00519898347556591, + "step": 145190 + }, + 
{ + "epoch": 20.610361958836055, + "grad_norm": 0.12555818259716034, + "learning_rate": 7.939843860894252e-05, + "loss": 0.004649277776479721, + "step": 145200 + }, + { + "epoch": 20.611781405251953, + "grad_norm": 0.07937568426132202, + "learning_rate": 7.939701916252663e-05, + "loss": 0.01588006317615509, + "step": 145210 + }, + { + "epoch": 20.613200851667848, + "grad_norm": 0.02730713225901127, + "learning_rate": 7.939559971611072e-05, + "loss": 0.00898231491446495, + "step": 145220 + }, + { + "epoch": 20.614620298083747, + "grad_norm": 0.16898596286773682, + "learning_rate": 7.939418026969482e-05, + "loss": 0.010687188804149627, + "step": 145230 + }, + { + "epoch": 20.616039744499645, + "grad_norm": 2.987128257751465, + "learning_rate": 7.939276082327892e-05, + "loss": 0.02661990523338318, + "step": 145240 + }, + { + "epoch": 20.617459190915543, + "grad_norm": 0.052394766360521317, + "learning_rate": 7.939134137686303e-05, + "loss": 0.04456896483898163, + "step": 145250 + }, + { + "epoch": 20.61887863733144, + "grad_norm": 0.5967915654182434, + "learning_rate": 7.938992193044713e-05, + "loss": 0.004843110218644142, + "step": 145260 + }, + { + "epoch": 20.62029808374734, + "grad_norm": 0.033302005380392075, + "learning_rate": 7.938850248403124e-05, + "loss": 0.011710944026708603, + "step": 145270 + }, + { + "epoch": 20.621717530163238, + "grad_norm": 0.11341115832328796, + "learning_rate": 7.938708303761533e-05, + "loss": 0.009142975509166717, + "step": 145280 + }, + { + "epoch": 20.623136976579133, + "grad_norm": 0.08349739015102386, + "learning_rate": 7.938566359119943e-05, + "loss": 0.011193586885929108, + "step": 145290 + }, + { + "epoch": 20.62455642299503, + "grad_norm": 0.03839299455285072, + "learning_rate": 7.938424414478354e-05, + "loss": 0.003303806111216545, + "step": 145300 + }, + { + "epoch": 20.62597586941093, + "grad_norm": 0.041031140834093094, + "learning_rate": 7.938282469836764e-05, + "loss": 0.005088656768202781, + "step": 145310 + }, + { + 
"epoch": 20.627395315826828, + "grad_norm": 14.062775611877441, + "learning_rate": 7.938140525195175e-05, + "loss": 0.030301907658576967, + "step": 145320 + }, + { + "epoch": 20.628814762242726, + "grad_norm": 0.021417422220110893, + "learning_rate": 7.937998580553585e-05, + "loss": 0.043958616256713864, + "step": 145330 + }, + { + "epoch": 20.630234208658624, + "grad_norm": 0.08100397139787674, + "learning_rate": 7.937856635911995e-05, + "loss": 0.01910707950592041, + "step": 145340 + }, + { + "epoch": 20.631653655074523, + "grad_norm": 0.23383355140686035, + "learning_rate": 7.937714691270404e-05, + "loss": 0.0186216801404953, + "step": 145350 + }, + { + "epoch": 20.633073101490417, + "grad_norm": 0.22817610204219818, + "learning_rate": 7.937572746628815e-05, + "loss": 0.004916000366210938, + "step": 145360 + }, + { + "epoch": 20.634492547906316, + "grad_norm": 5.143357753753662, + "learning_rate": 7.937430801987225e-05, + "loss": 0.0041277710348367695, + "step": 145370 + }, + { + "epoch": 20.635911994322214, + "grad_norm": 1.02508544921875, + "learning_rate": 7.937288857345636e-05, + "loss": 0.02411566972732544, + "step": 145380 + }, + { + "epoch": 20.637331440738112, + "grad_norm": 0.04756942763924599, + "learning_rate": 7.937146912704046e-05, + "loss": 0.01031438484787941, + "step": 145390 + }, + { + "epoch": 20.63875088715401, + "grad_norm": 0.3243301212787628, + "learning_rate": 7.937004968062456e-05, + "loss": 0.011414211988449097, + "step": 145400 + }, + { + "epoch": 20.64017033356991, + "grad_norm": 0.07148227840662003, + "learning_rate": 7.936863023420867e-05, + "loss": 0.01592966765165329, + "step": 145410 + }, + { + "epoch": 20.641589779985807, + "grad_norm": 0.004803858697414398, + "learning_rate": 7.936721078779277e-05, + "loss": 0.002159743383526802, + "step": 145420 + }, + { + "epoch": 20.643009226401702, + "grad_norm": 0.3068103790283203, + "learning_rate": 7.936579134137688e-05, + "loss": 0.010675179958343505, + "step": 145430 + }, + { + "epoch": 
20.6444286728176, + "grad_norm": 0.023777369409799576, + "learning_rate": 7.936437189496096e-05, + "loss": 0.007066705822944641, + "step": 145440 + }, + { + "epoch": 20.6458481192335, + "grad_norm": 1.9655640125274658, + "learning_rate": 7.936295244854507e-05, + "loss": 0.018037761747837066, + "step": 145450 + }, + { + "epoch": 20.647267565649397, + "grad_norm": 8.728242874145508, + "learning_rate": 7.936153300212917e-05, + "loss": 0.012693244218826293, + "step": 145460 + }, + { + "epoch": 20.648687012065295, + "grad_norm": 1.5654006004333496, + "learning_rate": 7.936011355571328e-05, + "loss": 0.013682277500629425, + "step": 145470 + }, + { + "epoch": 20.650106458481194, + "grad_norm": 0.17269207537174225, + "learning_rate": 7.935869410929738e-05, + "loss": 0.022952693700790405, + "step": 145480 + }, + { + "epoch": 20.651525904897092, + "grad_norm": 4.186474800109863, + "learning_rate": 7.935727466288147e-05, + "loss": 0.02002708613872528, + "step": 145490 + }, + { + "epoch": 20.652945351312987, + "grad_norm": 0.6695135235786438, + "learning_rate": 7.935585521646559e-05, + "loss": 0.04722619950771332, + "step": 145500 + }, + { + "epoch": 20.652945351312987, + "eval_accuracy": 0.9896992433394799, + "eval_loss": 0.03496474400162697, + "eval_runtime": 34.2515, + "eval_samples_per_second": 459.163, + "eval_steps_per_second": 14.364, + "step": 145500 + }, + { + "epoch": 20.654364797728885, + "grad_norm": 0.21022425591945648, + "learning_rate": 7.935443577004968e-05, + "loss": 0.017057400941848756, + "step": 145510 + }, + { + "epoch": 20.655784244144783, + "grad_norm": 1.2094993591308594, + "learning_rate": 7.93530163236338e-05, + "loss": 0.04082568883895874, + "step": 145520 + }, + { + "epoch": 20.65720369056068, + "grad_norm": 0.07598985731601715, + "learning_rate": 7.935159687721789e-05, + "loss": 0.0014511864632368089, + "step": 145530 + }, + { + "epoch": 20.65862313697658, + "grad_norm": 0.40147170424461365, + "learning_rate": 7.935017743080199e-05, + "loss": 
0.0203866183757782, + "step": 145540 + }, + { + "epoch": 20.660042583392478, + "grad_norm": 0.04159606620669365, + "learning_rate": 7.934875798438609e-05, + "loss": 0.00415494367480278, + "step": 145550 + }, + { + "epoch": 20.661462029808376, + "grad_norm": 0.2567363977432251, + "learning_rate": 7.93473385379702e-05, + "loss": 0.004127321392297744, + "step": 145560 + }, + { + "epoch": 20.66288147622427, + "grad_norm": 1.3238670825958252, + "learning_rate": 7.93459190915543e-05, + "loss": 0.01999867856502533, + "step": 145570 + }, + { + "epoch": 20.66430092264017, + "grad_norm": 0.8857561945915222, + "learning_rate": 7.93444996451384e-05, + "loss": 0.003461797907948494, + "step": 145580 + }, + { + "epoch": 20.665720369056068, + "grad_norm": 0.02009095437824726, + "learning_rate": 7.93430801987225e-05, + "loss": 0.04140637814998627, + "step": 145590 + }, + { + "epoch": 20.667139815471966, + "grad_norm": 6.540339946746826, + "learning_rate": 7.93416607523066e-05, + "loss": 0.035273030400276184, + "step": 145600 + }, + { + "epoch": 20.668559261887864, + "grad_norm": 0.42791029810905457, + "learning_rate": 7.934024130589071e-05, + "loss": 0.004827521741390228, + "step": 145610 + }, + { + "epoch": 20.669978708303763, + "grad_norm": 0.9732591509819031, + "learning_rate": 7.933882185947481e-05, + "loss": 0.015463906526565551, + "step": 145620 + }, + { + "epoch": 20.67139815471966, + "grad_norm": 0.04784591868519783, + "learning_rate": 7.933740241305892e-05, + "loss": 0.00696093738079071, + "step": 145630 + }, + { + "epoch": 20.672817601135556, + "grad_norm": 0.08414292335510254, + "learning_rate": 7.9335982966643e-05, + "loss": 0.03177079856395722, + "step": 145640 + }, + { + "epoch": 20.674237047551454, + "grad_norm": 0.09434685856103897, + "learning_rate": 7.933456352022711e-05, + "loss": 0.012461913377046585, + "step": 145650 + }, + { + "epoch": 20.675656493967352, + "grad_norm": 21.005489349365234, + "learning_rate": 7.933314407381121e-05, + "loss": 
0.025967153906822204, + "step": 145660 + }, + { + "epoch": 20.67707594038325, + "grad_norm": 2.2884438037872314, + "learning_rate": 7.933172462739532e-05, + "loss": 0.014497111737728118, + "step": 145670 + }, + { + "epoch": 20.67849538679915, + "grad_norm": 0.3916836977005005, + "learning_rate": 7.933030518097942e-05, + "loss": 0.029211747646331786, + "step": 145680 + }, + { + "epoch": 20.679914833215047, + "grad_norm": 1.543169617652893, + "learning_rate": 7.932888573456353e-05, + "loss": 0.08056480884552002, + "step": 145690 + }, + { + "epoch": 20.681334279630946, + "grad_norm": 11.27370834350586, + "learning_rate": 7.932746628814763e-05, + "loss": 0.022616779804229735, + "step": 145700 + }, + { + "epoch": 20.68275372604684, + "grad_norm": 0.5747624635696411, + "learning_rate": 7.932604684173173e-05, + "loss": 0.010348500311374664, + "step": 145710 + }, + { + "epoch": 20.68417317246274, + "grad_norm": 0.09961613267660141, + "learning_rate": 7.932462739531584e-05, + "loss": 0.014371511340141297, + "step": 145720 + }, + { + "epoch": 20.685592618878637, + "grad_norm": 1.2526777982711792, + "learning_rate": 7.932320794889993e-05, + "loss": 0.017895153164863585, + "step": 145730 + }, + { + "epoch": 20.687012065294535, + "grad_norm": 0.027043156325817108, + "learning_rate": 7.932178850248404e-05, + "loss": 0.03639167845249176, + "step": 145740 + }, + { + "epoch": 20.688431511710434, + "grad_norm": 0.0645659789443016, + "learning_rate": 7.932036905606813e-05, + "loss": 0.054617387056350705, + "step": 145750 + }, + { + "epoch": 20.689850958126332, + "grad_norm": 0.044021811336278915, + "learning_rate": 7.931894960965224e-05, + "loss": 0.005638457834720612, + "step": 145760 + }, + { + "epoch": 20.69127040454223, + "grad_norm": 0.08967779576778412, + "learning_rate": 7.931753016323634e-05, + "loss": 0.03928759396076202, + "step": 145770 + }, + { + "epoch": 20.692689850958125, + "grad_norm": 4.009666919708252, + "learning_rate": 7.931611071682045e-05, + "loss": 
0.003842705860733986, + "step": 145780 + }, + { + "epoch": 20.694109297374023, + "grad_norm": 0.1355072259902954, + "learning_rate": 7.931469127040455e-05, + "loss": 0.016324999928474426, + "step": 145790 + }, + { + "epoch": 20.69552874378992, + "grad_norm": 0.63166344165802, + "learning_rate": 7.931327182398864e-05, + "loss": 0.012057775259017944, + "step": 145800 + }, + { + "epoch": 20.69694819020582, + "grad_norm": 0.08763563632965088, + "learning_rate": 7.931185237757275e-05, + "loss": 0.020572511851787566, + "step": 145810 + }, + { + "epoch": 20.698367636621718, + "grad_norm": 1.3961236476898193, + "learning_rate": 7.931043293115685e-05, + "loss": 0.025046154856681824, + "step": 145820 + }, + { + "epoch": 20.699787083037616, + "grad_norm": 0.23597484827041626, + "learning_rate": 7.930901348474096e-05, + "loss": 0.028546819090843202, + "step": 145830 + }, + { + "epoch": 20.701206529453515, + "grad_norm": 0.029362551867961884, + "learning_rate": 7.930759403832506e-05, + "loss": 0.033306199312210086, + "step": 145840 + }, + { + "epoch": 20.70262597586941, + "grad_norm": 5.759678363800049, + "learning_rate": 7.930617459190916e-05, + "loss": 0.01832512617111206, + "step": 145850 + }, + { + "epoch": 20.704045422285308, + "grad_norm": 0.11721846461296082, + "learning_rate": 7.930475514549325e-05, + "loss": 0.008271043002605439, + "step": 145860 + }, + { + "epoch": 20.705464868701206, + "grad_norm": 0.03404093161225319, + "learning_rate": 7.930333569907736e-05, + "loss": 0.00751207247376442, + "step": 145870 + }, + { + "epoch": 20.706884315117104, + "grad_norm": 0.9527551531791687, + "learning_rate": 7.930191625266146e-05, + "loss": 0.020460736751556397, + "step": 145880 + }, + { + "epoch": 20.708303761533003, + "grad_norm": 0.04391974210739136, + "learning_rate": 7.930049680624557e-05, + "loss": 0.03786468505859375, + "step": 145890 + }, + { + "epoch": 20.7097232079489, + "grad_norm": 0.14532692730426788, + "learning_rate": 7.929907735982967e-05, + "loss": 
0.015052646398544312, + "step": 145900 + }, + { + "epoch": 20.7111426543648, + "grad_norm": 0.9020943641662598, + "learning_rate": 7.929765791341377e-05, + "loss": 0.023717553913593294, + "step": 145910 + }, + { + "epoch": 20.712562100780694, + "grad_norm": 0.005768029484897852, + "learning_rate": 7.929623846699788e-05, + "loss": 0.01201062798500061, + "step": 145920 + }, + { + "epoch": 20.713981547196592, + "grad_norm": 0.6888214945793152, + "learning_rate": 7.929481902058198e-05, + "loss": 0.012487337738275529, + "step": 145930 + }, + { + "epoch": 20.71540099361249, + "grad_norm": 0.1479235738515854, + "learning_rate": 7.929339957416609e-05, + "loss": 0.03108862042427063, + "step": 145940 + }, + { + "epoch": 20.71682044002839, + "grad_norm": 0.03405297175049782, + "learning_rate": 7.929198012775017e-05, + "loss": 0.04431195259094238, + "step": 145950 + }, + { + "epoch": 20.718239886444287, + "grad_norm": 0.10078774392604828, + "learning_rate": 7.929056068133428e-05, + "loss": 0.017326073348522188, + "step": 145960 + }, + { + "epoch": 20.719659332860186, + "grad_norm": 6.214639186859131, + "learning_rate": 7.928914123491838e-05, + "loss": 0.046317586302757265, + "step": 145970 + }, + { + "epoch": 20.721078779276084, + "grad_norm": 0.32494044303894043, + "learning_rate": 7.928772178850249e-05, + "loss": 0.023650357127189638, + "step": 145980 + }, + { + "epoch": 20.72249822569198, + "grad_norm": 13.228584289550781, + "learning_rate": 7.928630234208659e-05, + "loss": 0.012420380115509033, + "step": 145990 + }, + { + "epoch": 20.723917672107877, + "grad_norm": 3.9393420219421387, + "learning_rate": 7.928488289567068e-05, + "loss": 0.007345489412546158, + "step": 146000 + }, + { + "epoch": 20.723917672107877, + "eval_accuracy": 0.9831499968207541, + "eval_loss": 0.061240628361701965, + "eval_runtime": 34.6141, + "eval_samples_per_second": 454.352, + "eval_steps_per_second": 14.214, + "step": 146000 + }, + { + "epoch": 20.725337118523775, + "grad_norm": 
0.04467616602778435, + "learning_rate": 7.92834634492548e-05, + "loss": 0.002353701740503311, + "step": 146010 + }, + { + "epoch": 20.726756564939674, + "grad_norm": 0.07547491788864136, + "learning_rate": 7.928204400283889e-05, + "loss": 0.002427778393030167, + "step": 146020 + }, + { + "epoch": 20.728176011355572, + "grad_norm": 1.7890892028808594, + "learning_rate": 7.9280624556423e-05, + "loss": 0.03683383166790009, + "step": 146030 + }, + { + "epoch": 20.72959545777147, + "grad_norm": 2.051042079925537, + "learning_rate": 7.92792051100071e-05, + "loss": 0.013651996850967407, + "step": 146040 + }, + { + "epoch": 20.73101490418737, + "grad_norm": 0.030877405777573586, + "learning_rate": 7.927778566359121e-05, + "loss": 0.03212569057941437, + "step": 146050 + }, + { + "epoch": 20.732434350603263, + "grad_norm": 8.530709266662598, + "learning_rate": 7.92763662171753e-05, + "loss": 0.017933164536952973, + "step": 146060 + }, + { + "epoch": 20.73385379701916, + "grad_norm": 5.16195821762085, + "learning_rate": 7.927494677075941e-05, + "loss": 0.01793002188205719, + "step": 146070 + }, + { + "epoch": 20.73527324343506, + "grad_norm": 0.13871033489704132, + "learning_rate": 7.92735273243435e-05, + "loss": 0.012643037736415863, + "step": 146080 + }, + { + "epoch": 20.73669268985096, + "grad_norm": 0.05193353816866875, + "learning_rate": 7.927210787792762e-05, + "loss": 0.010770949721336364, + "step": 146090 + }, + { + "epoch": 20.738112136266857, + "grad_norm": 10.728626251220703, + "learning_rate": 7.927068843151171e-05, + "loss": 0.058276236057281494, + "step": 146100 + }, + { + "epoch": 20.739531582682755, + "grad_norm": 8.37535572052002, + "learning_rate": 7.926926898509581e-05, + "loss": 0.031953093409538266, + "step": 146110 + }, + { + "epoch": 20.740951029098653, + "grad_norm": 0.041031572967767715, + "learning_rate": 7.926784953867992e-05, + "loss": 0.02155022770166397, + "step": 146120 + }, + { + "epoch": 20.742370475514548, + "grad_norm": 0.11791174113750458, 
+ "learning_rate": 7.926643009226402e-05, + "loss": 0.013758836686611176, + "step": 146130 + }, + { + "epoch": 20.743789921930446, + "grad_norm": 3.598696231842041, + "learning_rate": 7.926501064584813e-05, + "loss": 0.009702644497156142, + "step": 146140 + }, + { + "epoch": 20.745209368346345, + "grad_norm": 8.049793243408203, + "learning_rate": 7.926359119943223e-05, + "loss": 0.03341563940048218, + "step": 146150 + }, + { + "epoch": 20.746628814762243, + "grad_norm": 2.3235762119293213, + "learning_rate": 7.926217175301632e-05, + "loss": 0.023580312728881836, + "step": 146160 + }, + { + "epoch": 20.74804826117814, + "grad_norm": 2.129178285598755, + "learning_rate": 7.926075230660042e-05, + "loss": 0.01380157470703125, + "step": 146170 + }, + { + "epoch": 20.74946770759404, + "grad_norm": 6.452048301696777, + "learning_rate": 7.925933286018453e-05, + "loss": 0.03201970756053925, + "step": 146180 + }, + { + "epoch": 20.750887154009938, + "grad_norm": 1.5061073303222656, + "learning_rate": 7.925791341376863e-05, + "loss": 0.0893592119216919, + "step": 146190 + }, + { + "epoch": 20.752306600425833, + "grad_norm": 3.5834708213806152, + "learning_rate": 7.925649396735274e-05, + "loss": 0.03867372274398804, + "step": 146200 + }, + { + "epoch": 20.75372604684173, + "grad_norm": 0.40649959444999695, + "learning_rate": 7.925507452093684e-05, + "loss": 0.0099691703915596, + "step": 146210 + }, + { + "epoch": 20.75514549325763, + "grad_norm": 0.08001447468996048, + "learning_rate": 7.925365507452094e-05, + "loss": 0.024438340961933137, + "step": 146220 + }, + { + "epoch": 20.756564939673527, + "grad_norm": 0.28294801712036133, + "learning_rate": 7.925223562810505e-05, + "loss": 0.03000095784664154, + "step": 146230 + }, + { + "epoch": 20.757984386089426, + "grad_norm": 0.373675674200058, + "learning_rate": 7.925081618168914e-05, + "loss": 0.007903087139129638, + "step": 146240 + }, + { + "epoch": 20.759403832505324, + "grad_norm": 2.2916080951690674, + "learning_rate": 
7.924939673527325e-05, + "loss": 0.04177244305610657, + "step": 146250 + }, + { + "epoch": 20.760823278921222, + "grad_norm": 0.39103400707244873, + "learning_rate": 7.924797728885734e-05, + "loss": 0.03385518789291382, + "step": 146260 + }, + { + "epoch": 20.762242725337117, + "grad_norm": 0.7429710030555725, + "learning_rate": 7.924655784244145e-05, + "loss": 0.03701767921447754, + "step": 146270 + }, + { + "epoch": 20.763662171753015, + "grad_norm": 0.938109815120697, + "learning_rate": 7.924513839602555e-05, + "loss": 0.015692499279975892, + "step": 146280 + }, + { + "epoch": 20.765081618168914, + "grad_norm": 3.9352753162384033, + "learning_rate": 7.924371894960966e-05, + "loss": 0.00905241072177887, + "step": 146290 + }, + { + "epoch": 20.766501064584812, + "grad_norm": 0.03445643186569214, + "learning_rate": 7.924229950319377e-05, + "loss": 0.015830010175704956, + "step": 146300 + }, + { + "epoch": 20.76792051100071, + "grad_norm": 0.5951571464538574, + "learning_rate": 7.924088005677785e-05, + "loss": 0.022093257308006285, + "step": 146310 + }, + { + "epoch": 20.76933995741661, + "grad_norm": 1.0523383617401123, + "learning_rate": 7.923946061036196e-05, + "loss": 0.02023012936115265, + "step": 146320 + }, + { + "epoch": 20.770759403832507, + "grad_norm": 0.11734830588102341, + "learning_rate": 7.923804116394606e-05, + "loss": 0.021375299990177156, + "step": 146330 + }, + { + "epoch": 20.7721788502484, + "grad_norm": 13.957560539245605, + "learning_rate": 7.923662171753017e-05, + "loss": 0.046216410398483274, + "step": 146340 + }, + { + "epoch": 20.7735982966643, + "grad_norm": 17.897584915161133, + "learning_rate": 7.923520227111427e-05, + "loss": 0.024576073884963988, + "step": 146350 + }, + { + "epoch": 20.7750177430802, + "grad_norm": 0.12110137194395065, + "learning_rate": 7.923378282469837e-05, + "loss": 0.006710472702980042, + "step": 146360 + }, + { + "epoch": 20.776437189496097, + "grad_norm": 0.12965555489063263, + "learning_rate": 
7.923236337828246e-05, + "loss": 0.016585759818553925, + "step": 146370 + }, + { + "epoch": 20.777856635911995, + "grad_norm": 2.008803606033325, + "learning_rate": 7.923094393186657e-05, + "loss": 0.020957988500595093, + "step": 146380 + }, + { + "epoch": 20.779276082327893, + "grad_norm": 6.884129047393799, + "learning_rate": 7.922952448545069e-05, + "loss": 0.009522868692874909, + "step": 146390 + }, + { + "epoch": 20.78069552874379, + "grad_norm": 0.01557956263422966, + "learning_rate": 7.922810503903478e-05, + "loss": 0.02518046796321869, + "step": 146400 + }, + { + "epoch": 20.782114975159686, + "grad_norm": 2.624701499938965, + "learning_rate": 7.92266855926189e-05, + "loss": 0.01180010661482811, + "step": 146410 + }, + { + "epoch": 20.783534421575585, + "grad_norm": 9.77568531036377, + "learning_rate": 7.922526614620298e-05, + "loss": 0.03461946845054627, + "step": 146420 + }, + { + "epoch": 20.784953867991483, + "grad_norm": 0.02805766463279724, + "learning_rate": 7.922384669978709e-05, + "loss": 0.028170162439346315, + "step": 146430 + }, + { + "epoch": 20.78637331440738, + "grad_norm": 0.2586061358451843, + "learning_rate": 7.922242725337119e-05, + "loss": 0.0037616658955812454, + "step": 146440 + }, + { + "epoch": 20.78779276082328, + "grad_norm": 0.15174534916877747, + "learning_rate": 7.92210078069553e-05, + "loss": 0.016199553012847902, + "step": 146450 + }, + { + "epoch": 20.789212207239178, + "grad_norm": 4.853360176086426, + "learning_rate": 7.92195883605394e-05, + "loss": 0.013407911360263824, + "step": 146460 + }, + { + "epoch": 20.790631653655076, + "grad_norm": 0.007301007863134146, + "learning_rate": 7.921816891412349e-05, + "loss": 0.009662486612796783, + "step": 146470 + }, + { + "epoch": 20.79205110007097, + "grad_norm": 0.5911498069763184, + "learning_rate": 7.92167494677076e-05, + "loss": 0.004151134565472603, + "step": 146480 + }, + { + "epoch": 20.79347054648687, + "grad_norm": 5.810220241546631, + "learning_rate": 
7.92153300212917e-05, + "loss": 0.026870083808898926, + "step": 146490 + }, + { + "epoch": 20.794889992902768, + "grad_norm": 0.8876366019248962, + "learning_rate": 7.921391057487581e-05, + "loss": 0.0064541235566139225, + "step": 146500 + }, + { + "epoch": 20.794889992902768, + "eval_accuracy": 0.9862020728683156, + "eval_loss": 0.05110529065132141, + "eval_runtime": 34.6768, + "eval_samples_per_second": 453.531, + "eval_steps_per_second": 14.188, + "step": 146500 + }, + { + "epoch": 20.796309439318666, + "grad_norm": 0.02104359120130539, + "learning_rate": 7.921249112845991e-05, + "loss": 0.010167718678712846, + "step": 146510 + }, + { + "epoch": 20.797728885734564, + "grad_norm": 0.13494668900966644, + "learning_rate": 7.9211071682044e-05, + "loss": 0.026707875728607177, + "step": 146520 + }, + { + "epoch": 20.799148332150462, + "grad_norm": 0.15622884035110474, + "learning_rate": 7.92096522356281e-05, + "loss": 0.006951558589935303, + "step": 146530 + }, + { + "epoch": 20.80056777856636, + "grad_norm": 0.2711966633796692, + "learning_rate": 7.920823278921221e-05, + "loss": 0.012996454536914826, + "step": 146540 + }, + { + "epoch": 20.801987224982255, + "grad_norm": 0.3607664108276367, + "learning_rate": 7.920681334279631e-05, + "loss": 0.023054002225399016, + "step": 146550 + }, + { + "epoch": 20.803406671398154, + "grad_norm": 0.0117929857224226, + "learning_rate": 7.920539389638042e-05, + "loss": 0.015093138813972473, + "step": 146560 + }, + { + "epoch": 20.804826117814052, + "grad_norm": 0.1728912591934204, + "learning_rate": 7.920397444996452e-05, + "loss": 0.01267610639333725, + "step": 146570 + }, + { + "epoch": 20.80624556422995, + "grad_norm": 0.05420781672000885, + "learning_rate": 7.920255500354862e-05, + "loss": 0.01759723275899887, + "step": 146580 + }, + { + "epoch": 20.80766501064585, + "grad_norm": 1.4305238723754883, + "learning_rate": 7.920113555713273e-05, + "loss": 0.026539325714111328, + "step": 146590 + }, + { + "epoch": 20.809084457061747, 
+ "grad_norm": 3.561781406402588, + "learning_rate": 7.919971611071683e-05, + "loss": 0.01684565544128418, + "step": 146600 + }, + { + "epoch": 20.810503903477645, + "grad_norm": 1.0061430931091309, + "learning_rate": 7.919829666430094e-05, + "loss": 0.004090564325451851, + "step": 146610 + }, + { + "epoch": 20.81192334989354, + "grad_norm": 10.373432159423828, + "learning_rate": 7.919687721788502e-05, + "loss": 0.019665876030921937, + "step": 146620 + }, + { + "epoch": 20.81334279630944, + "grad_norm": 0.11311222612857819, + "learning_rate": 7.919545777146913e-05, + "loss": 0.06184806227684021, + "step": 146630 + }, + { + "epoch": 20.814762242725337, + "grad_norm": 0.9957504272460938, + "learning_rate": 7.919403832505323e-05, + "loss": 0.008581961691379546, + "step": 146640 + }, + { + "epoch": 20.816181689141235, + "grad_norm": 15.013198852539062, + "learning_rate": 7.919261887863734e-05, + "loss": 0.019738689064979553, + "step": 146650 + }, + { + "epoch": 20.817601135557133, + "grad_norm": 0.9187983274459839, + "learning_rate": 7.919119943222144e-05, + "loss": 0.0025506075471639633, + "step": 146660 + }, + { + "epoch": 20.81902058197303, + "grad_norm": 0.6271923780441284, + "learning_rate": 7.918977998580553e-05, + "loss": 0.028296566009521483, + "step": 146670 + }, + { + "epoch": 20.82044002838893, + "grad_norm": 0.012920745648443699, + "learning_rate": 7.918836053938965e-05, + "loss": 0.00728784054517746, + "step": 146680 + }, + { + "epoch": 20.821859474804825, + "grad_norm": 2.474107265472412, + "learning_rate": 7.918694109297374e-05, + "loss": 0.040848946571350096, + "step": 146690 + }, + { + "epoch": 20.823278921220723, + "grad_norm": 0.03071807138621807, + "learning_rate": 7.918552164655785e-05, + "loss": 0.011624724417924882, + "step": 146700 + }, + { + "epoch": 20.82469836763662, + "grad_norm": 11.327455520629883, + "learning_rate": 7.918410220014195e-05, + "loss": 0.052508091926574706, + "step": 146710 + }, + { + "epoch": 20.82611781405252, + 
"grad_norm": 7.31023645401001, + "learning_rate": 7.918268275372605e-05, + "loss": 0.01595485508441925, + "step": 146720 + }, + { + "epoch": 20.827537260468418, + "grad_norm": 0.44911283254623413, + "learning_rate": 7.918126330731015e-05, + "loss": 0.001621703803539276, + "step": 146730 + }, + { + "epoch": 20.828956706884316, + "grad_norm": 0.690433919429779, + "learning_rate": 7.917984386089426e-05, + "loss": 0.009605031460523605, + "step": 146740 + }, + { + "epoch": 20.830376153300215, + "grad_norm": 0.373314768075943, + "learning_rate": 7.917842441447835e-05, + "loss": 0.002264280617237091, + "step": 146750 + }, + { + "epoch": 20.83179559971611, + "grad_norm": 0.04561639204621315, + "learning_rate": 7.917700496806246e-05, + "loss": 0.01368083357810974, + "step": 146760 + }, + { + "epoch": 20.833215046132008, + "grad_norm": 0.4038635492324829, + "learning_rate": 7.917558552164656e-05, + "loss": 0.022652235627174378, + "step": 146770 + }, + { + "epoch": 20.834634492547906, + "grad_norm": 4.013926982879639, + "learning_rate": 7.917416607523066e-05, + "loss": 0.040308129787445066, + "step": 146780 + }, + { + "epoch": 20.836053938963804, + "grad_norm": 0.009684121236205101, + "learning_rate": 7.917274662881477e-05, + "loss": 0.007236669957637787, + "step": 146790 + }, + { + "epoch": 20.837473385379703, + "grad_norm": 0.37154409289360046, + "learning_rate": 7.917132718239887e-05, + "loss": 0.002590896561741829, + "step": 146800 + }, + { + "epoch": 20.8388928317956, + "grad_norm": 0.23550155758857727, + "learning_rate": 7.916990773598298e-05, + "loss": 0.006059055775403976, + "step": 146810 + }, + { + "epoch": 20.8403122782115, + "grad_norm": 0.15419526398181915, + "learning_rate": 7.916848828956708e-05, + "loss": 0.010148958116769791, + "step": 146820 + }, + { + "epoch": 20.841731724627394, + "grad_norm": 0.3938317596912384, + "learning_rate": 7.916706884315117e-05, + "loss": 0.007333367317914963, + "step": 146830 + }, + { + "epoch": 20.843151171043292, + "grad_norm": 
9.145828247070312, + "learning_rate": 7.916564939673527e-05, + "loss": 0.01460103690624237, + "step": 146840 + }, + { + "epoch": 20.84457061745919, + "grad_norm": 12.217960357666016, + "learning_rate": 7.916422995031938e-05, + "loss": 0.03970673084259033, + "step": 146850 + }, + { + "epoch": 20.84599006387509, + "grad_norm": 7.862465858459473, + "learning_rate": 7.916281050390348e-05, + "loss": 0.012903441488742829, + "step": 146860 + }, + { + "epoch": 20.847409510290987, + "grad_norm": 1.4578545093536377, + "learning_rate": 7.916139105748759e-05, + "loss": 0.017339283227920534, + "step": 146870 + }, + { + "epoch": 20.848828956706885, + "grad_norm": 0.0015449131606146693, + "learning_rate": 7.915997161107169e-05, + "loss": 0.016219761967658997, + "step": 146880 + }, + { + "epoch": 20.850248403122784, + "grad_norm": 0.6998650431632996, + "learning_rate": 7.915855216465578e-05, + "loss": 0.010852450132369995, + "step": 146890 + }, + { + "epoch": 20.85166784953868, + "grad_norm": 0.0549323745071888, + "learning_rate": 7.91571327182399e-05, + "loss": 0.051076120138168334, + "step": 146900 + }, + { + "epoch": 20.853087295954577, + "grad_norm": 14.543206214904785, + "learning_rate": 7.9155713271824e-05, + "loss": 0.03457919955253601, + "step": 146910 + }, + { + "epoch": 20.854506742370475, + "grad_norm": 1.7455620765686035, + "learning_rate": 7.91542938254081e-05, + "loss": 0.004024988040328026, + "step": 146920 + }, + { + "epoch": 20.855926188786373, + "grad_norm": 0.6541800498962402, + "learning_rate": 7.915287437899219e-05, + "loss": 0.038422593474388124, + "step": 146930 + }, + { + "epoch": 20.85734563520227, + "grad_norm": 0.0649171844124794, + "learning_rate": 7.91514549325763e-05, + "loss": 0.04115557968616486, + "step": 146940 + }, + { + "epoch": 20.85876508161817, + "grad_norm": 2.2254154682159424, + "learning_rate": 7.91500354861604e-05, + "loss": 0.015144921839237213, + "step": 146950 + }, + { + "epoch": 20.86018452803407, + "grad_norm": 2.6537952423095703, + 
"learning_rate": 7.914861603974451e-05, + "loss": 0.09188648462295532, + "step": 146960 + }, + { + "epoch": 20.861603974449963, + "grad_norm": 7.721174240112305, + "learning_rate": 7.91471965933286e-05, + "loss": 0.05469951629638672, + "step": 146970 + }, + { + "epoch": 20.86302342086586, + "grad_norm": 9.921685218811035, + "learning_rate": 7.91457771469127e-05, + "loss": 0.042270541191101074, + "step": 146980 + }, + { + "epoch": 20.86444286728176, + "grad_norm": 0.1609053611755371, + "learning_rate": 7.914435770049681e-05, + "loss": 0.033688592910766604, + "step": 146990 + }, + { + "epoch": 20.865862313697658, + "grad_norm": 0.05122964829206467, + "learning_rate": 7.914293825408091e-05, + "loss": 0.016573894023895263, + "step": 147000 + }, + { + "epoch": 20.865862313697658, + "eval_accuracy": 0.98995358301011, + "eval_loss": 0.0332157239317894, + "eval_runtime": 34.7743, + "eval_samples_per_second": 452.259, + "eval_steps_per_second": 14.148, + "step": 147000 + }, + { + "epoch": 20.867281760113556, + "grad_norm": 0.06537210196256638, + "learning_rate": 7.914151880766502e-05, + "loss": 0.001004798337817192, + "step": 147010 + }, + { + "epoch": 20.868701206529455, + "grad_norm": 1.1541846990585327, + "learning_rate": 7.914009936124912e-05, + "loss": 0.0018973808735609054, + "step": 147020 + }, + { + "epoch": 20.870120652945353, + "grad_norm": 0.2158127725124359, + "learning_rate": 7.913867991483322e-05, + "loss": 0.006645660102367401, + "step": 147030 + }, + { + "epoch": 20.871540099361248, + "grad_norm": 0.08106806129217148, + "learning_rate": 7.913726046841731e-05, + "loss": 0.02808656692504883, + "step": 147040 + }, + { + "epoch": 20.872959545777146, + "grad_norm": 0.567354679107666, + "learning_rate": 7.913584102200142e-05, + "loss": 0.010174166411161423, + "step": 147050 + }, + { + "epoch": 20.874378992193044, + "grad_norm": 0.9629103541374207, + "learning_rate": 7.913442157558552e-05, + "loss": 0.011855004727840424, + "step": 147060 + }, + { + "epoch": 
20.875798438608943, + "grad_norm": 0.3463072180747986, + "learning_rate": 7.913300212916963e-05, + "loss": 0.004977930709719658, + "step": 147070 + }, + { + "epoch": 20.87721788502484, + "grad_norm": 1.4517196416854858, + "learning_rate": 7.913158268275373e-05, + "loss": 0.025546705722808837, + "step": 147080 + }, + { + "epoch": 20.87863733144074, + "grad_norm": 8.010258674621582, + "learning_rate": 7.913016323633783e-05, + "loss": 0.04137132465839386, + "step": 147090 + }, + { + "epoch": 20.880056777856637, + "grad_norm": 0.08049314469099045, + "learning_rate": 7.912874378992194e-05, + "loss": 0.0032156050205230714, + "step": 147100 + }, + { + "epoch": 20.881476224272532, + "grad_norm": 0.08725004643201828, + "learning_rate": 7.912732434350604e-05, + "loss": 0.011750607937574386, + "step": 147110 + }, + { + "epoch": 20.88289567068843, + "grad_norm": 0.059840764850378036, + "learning_rate": 7.912590489709015e-05, + "loss": 0.08233913779258728, + "step": 147120 + }, + { + "epoch": 20.88431511710433, + "grad_norm": 0.019664814695715904, + "learning_rate": 7.912448545067424e-05, + "loss": 0.005721261352300644, + "step": 147130 + }, + { + "epoch": 20.885734563520227, + "grad_norm": 5.9343461990356445, + "learning_rate": 7.912306600425834e-05, + "loss": 0.025959882140159606, + "step": 147140 + }, + { + "epoch": 20.887154009936125, + "grad_norm": 19.820329666137695, + "learning_rate": 7.912164655784244e-05, + "loss": 0.014830124378204346, + "step": 147150 + }, + { + "epoch": 20.888573456352024, + "grad_norm": 0.029194775968790054, + "learning_rate": 7.912036905606814e-05, + "loss": 0.024605154991149902, + "step": 147160 + }, + { + "epoch": 20.889992902767922, + "grad_norm": 2.7611563205718994, + "learning_rate": 7.911894960965223e-05, + "loss": 0.03203037977218628, + "step": 147170 + }, + { + "epoch": 20.891412349183817, + "grad_norm": 3.295513868331909, + "learning_rate": 7.911753016323635e-05, + "loss": 0.013078141212463378, + "step": 147180 + }, + { + "epoch": 
20.892831795599715, + "grad_norm": 0.13321231305599213, + "learning_rate": 7.911611071682044e-05, + "loss": 0.03065497279167175, + "step": 147190 + }, + { + "epoch": 20.894251242015613, + "grad_norm": 6.127897262573242, + "learning_rate": 7.911469127040455e-05, + "loss": 0.018430909514427184, + "step": 147200 + }, + { + "epoch": 20.89567068843151, + "grad_norm": 0.07745135575532913, + "learning_rate": 7.911327182398864e-05, + "loss": 0.007900258898735047, + "step": 147210 + }, + { + "epoch": 20.89709013484741, + "grad_norm": 7.25475549697876, + "learning_rate": 7.911185237757275e-05, + "loss": 0.015114283561706543, + "step": 147220 + }, + { + "epoch": 20.89850958126331, + "grad_norm": 8.938583374023438, + "learning_rate": 7.911043293115685e-05, + "loss": 0.008583293855190277, + "step": 147230 + }, + { + "epoch": 20.899929027679207, + "grad_norm": 1.1823887825012207, + "learning_rate": 7.910901348474096e-05, + "loss": 0.00854203775525093, + "step": 147240 + }, + { + "epoch": 20.9013484740951, + "grad_norm": 0.05185272544622421, + "learning_rate": 7.910759403832507e-05, + "loss": 0.04141756296157837, + "step": 147250 + }, + { + "epoch": 20.902767920511, + "grad_norm": 0.06545621156692505, + "learning_rate": 7.910617459190915e-05, + "loss": 0.05727676153182983, + "step": 147260 + }, + { + "epoch": 20.904187366926898, + "grad_norm": 0.1190522238612175, + "learning_rate": 7.910475514549326e-05, + "loss": 0.016204726696014405, + "step": 147270 + }, + { + "epoch": 20.905606813342796, + "grad_norm": 0.022433854639530182, + "learning_rate": 7.910333569907736e-05, + "loss": 0.014434719085693359, + "step": 147280 + }, + { + "epoch": 20.907026259758695, + "grad_norm": 0.2797408998012543, + "learning_rate": 7.910191625266147e-05, + "loss": 0.003265124186873436, + "step": 147290 + }, + { + "epoch": 20.908445706174593, + "grad_norm": 0.14977967739105225, + "learning_rate": 7.910049680624557e-05, + "loss": 0.03801598250865936, + "step": 147300 + }, + { + "epoch": 
20.90986515259049, + "grad_norm": 0.6391460299491882, + "learning_rate": 7.909907735982967e-05, + "loss": 0.032560572028160095, + "step": 147310 + }, + { + "epoch": 20.911284599006386, + "grad_norm": 3.5316262245178223, + "learning_rate": 7.909765791341376e-05, + "loss": 0.015275755524635315, + "step": 147320 + }, + { + "epoch": 20.912704045422284, + "grad_norm": 0.3526131510734558, + "learning_rate": 7.909623846699787e-05, + "loss": 0.01211082860827446, + "step": 147330 + }, + { + "epoch": 20.914123491838183, + "grad_norm": 1.025555968284607, + "learning_rate": 7.909481902058198e-05, + "loss": 0.027760547399520875, + "step": 147340 + }, + { + "epoch": 20.91554293825408, + "grad_norm": 0.5729354619979858, + "learning_rate": 7.909339957416608e-05, + "loss": 0.05583299398422241, + "step": 147350 + }, + { + "epoch": 20.91696238466998, + "grad_norm": 0.009218241088092327, + "learning_rate": 7.909198012775018e-05, + "loss": 0.004211675003170967, + "step": 147360 + }, + { + "epoch": 20.918381831085878, + "grad_norm": 0.9458274245262146, + "learning_rate": 7.909056068133428e-05, + "loss": 0.05659654140472412, + "step": 147370 + }, + { + "epoch": 20.919801277501776, + "grad_norm": 0.42036503553390503, + "learning_rate": 7.908914123491839e-05, + "loss": 0.03894461989402771, + "step": 147380 + }, + { + "epoch": 20.92122072391767, + "grad_norm": 10.208904266357422, + "learning_rate": 7.908772178850249e-05, + "loss": 0.016311009228229523, + "step": 147390 + }, + { + "epoch": 20.92264017033357, + "grad_norm": 0.01734880730509758, + "learning_rate": 7.90863023420866e-05, + "loss": 0.031048858165740968, + "step": 147400 + }, + { + "epoch": 20.924059616749467, + "grad_norm": 0.019519660621881485, + "learning_rate": 7.90848828956707e-05, + "loss": 0.028166115283966064, + "step": 147410 + }, + { + "epoch": 20.925479063165366, + "grad_norm": 1.4088938236236572, + "learning_rate": 7.908346344925479e-05, + "loss": 0.01879177838563919, + "step": 147420 + }, + { + "epoch": 
20.926898509581264, + "grad_norm": 0.06329909712076187, + "learning_rate": 7.90820440028389e-05, + "loss": 0.036617633700370786, + "step": 147430 + }, + { + "epoch": 20.928317955997162, + "grad_norm": 0.5240737199783325, + "learning_rate": 7.9080624556423e-05, + "loss": 0.009767904877662659, + "step": 147440 + }, + { + "epoch": 20.92973740241306, + "grad_norm": 5.202698230743408, + "learning_rate": 7.907920511000711e-05, + "loss": 0.009432019293308258, + "step": 147450 + }, + { + "epoch": 20.931156848828955, + "grad_norm": 11.060153007507324, + "learning_rate": 7.907778566359121e-05, + "loss": 0.013427031040191651, + "step": 147460 + }, + { + "epoch": 20.932576295244854, + "grad_norm": 0.8927894830703735, + "learning_rate": 7.90763662171753e-05, + "loss": 0.015002116560935974, + "step": 147470 + }, + { + "epoch": 20.933995741660752, + "grad_norm": 0.030140064656734467, + "learning_rate": 7.90749467707594e-05, + "loss": 0.01904297322034836, + "step": 147480 + }, + { + "epoch": 20.93541518807665, + "grad_norm": 11.515298843383789, + "learning_rate": 7.907352732434351e-05, + "loss": 0.014468705654144287, + "step": 147490 + }, + { + "epoch": 20.93683463449255, + "grad_norm": 1.8394439220428467, + "learning_rate": 7.907210787792761e-05, + "loss": 0.01816985011100769, + "step": 147500 + }, + { + "epoch": 20.93683463449255, + "eval_accuracy": 0.9853118840211101, + "eval_loss": 0.059090834110975266, + "eval_runtime": 33.5044, + "eval_samples_per_second": 469.401, + "eval_steps_per_second": 14.685, + "step": 147500 + }, + { + "epoch": 20.938254080908447, + "grad_norm": 1.7218375205993652, + "learning_rate": 7.907068843151172e-05, + "loss": 0.007790523767471314, + "step": 147510 + }, + { + "epoch": 20.939673527324345, + "grad_norm": 2.2282238006591797, + "learning_rate": 7.906926898509582e-05, + "loss": 0.024518656730651855, + "step": 147520 + }, + { + "epoch": 20.94109297374024, + "grad_norm": 7.747946262359619, + "learning_rate": 7.906784953867992e-05, + "loss": 
0.02995598316192627, + "step": 147530 + }, + { + "epoch": 20.942512420156138, + "grad_norm": 4.2246413230896, + "learning_rate": 7.906643009226403e-05, + "loss": 0.034627553820610044, + "step": 147540 + }, + { + "epoch": 20.943931866572036, + "grad_norm": 0.25856253504753113, + "learning_rate": 7.906501064584812e-05, + "loss": 0.00449640341103077, + "step": 147550 + }, + { + "epoch": 20.945351312987935, + "grad_norm": 4.605138778686523, + "learning_rate": 7.906359119943224e-05, + "loss": 0.013264468312263489, + "step": 147560 + }, + { + "epoch": 20.946770759403833, + "grad_norm": 14.829985618591309, + "learning_rate": 7.906217175301632e-05, + "loss": 0.03572210669517517, + "step": 147570 + }, + { + "epoch": 20.94819020581973, + "grad_norm": 0.2998245656490326, + "learning_rate": 7.906075230660043e-05, + "loss": 0.002378993108868599, + "step": 147580 + }, + { + "epoch": 20.94960965223563, + "grad_norm": 0.9222350120544434, + "learning_rate": 7.905933286018453e-05, + "loss": 0.01790400892496109, + "step": 147590 + }, + { + "epoch": 20.951029098651524, + "grad_norm": 9.298759460449219, + "learning_rate": 7.905791341376864e-05, + "loss": 0.031370556354522704, + "step": 147600 + }, + { + "epoch": 20.952448545067423, + "grad_norm": 0.08165039867162704, + "learning_rate": 7.905649396735274e-05, + "loss": 0.05433051586151123, + "step": 147610 + }, + { + "epoch": 20.95386799148332, + "grad_norm": 0.053742699325084686, + "learning_rate": 7.905507452093683e-05, + "loss": 0.017582088708877563, + "step": 147620 + }, + { + "epoch": 20.95528743789922, + "grad_norm": 2.7409090995788574, + "learning_rate": 7.905365507452094e-05, + "loss": 0.02233976274728775, + "step": 147630 + }, + { + "epoch": 20.956706884315118, + "grad_norm": 6.455620765686035, + "learning_rate": 7.905223562810504e-05, + "loss": 0.0433304637670517, + "step": 147640 + }, + { + "epoch": 20.958126330731016, + "grad_norm": 0.05922337621450424, + "learning_rate": 7.905081618168915e-05, + "loss": 0.04422985017299652, 
+ "step": 147650 + }, + { + "epoch": 20.959545777146914, + "grad_norm": 1.0155706405639648, + "learning_rate": 7.904939673527325e-05, + "loss": 0.020816732943058015, + "step": 147660 + }, + { + "epoch": 20.96096522356281, + "grad_norm": 2.480271577835083, + "learning_rate": 7.904797728885735e-05, + "loss": 0.016534870862960814, + "step": 147670 + }, + { + "epoch": 20.962384669978707, + "grad_norm": 4.882796764373779, + "learning_rate": 7.904655784244144e-05, + "loss": 0.013173118233680725, + "step": 147680 + }, + { + "epoch": 20.963804116394606, + "grad_norm": 0.062019314616918564, + "learning_rate": 7.904513839602556e-05, + "loss": 0.00510212555527687, + "step": 147690 + }, + { + "epoch": 20.965223562810504, + "grad_norm": 0.006183877121657133, + "learning_rate": 7.904371894960965e-05, + "loss": 0.006848765164613723, + "step": 147700 + }, + { + "epoch": 20.966643009226402, + "grad_norm": 0.11883995682001114, + "learning_rate": 7.904229950319376e-05, + "loss": 0.015217235684394837, + "step": 147710 + }, + { + "epoch": 20.9680624556423, + "grad_norm": 0.04546439275145531, + "learning_rate": 7.904088005677786e-05, + "loss": 0.009336093068122863, + "step": 147720 + }, + { + "epoch": 20.9694819020582, + "grad_norm": 0.21021588146686554, + "learning_rate": 7.903946061036196e-05, + "loss": 0.010315261781215668, + "step": 147730 + }, + { + "epoch": 20.970901348474094, + "grad_norm": 0.004868995398283005, + "learning_rate": 7.903804116394607e-05, + "loss": 0.0026836566627025603, + "step": 147740 + }, + { + "epoch": 20.972320794889992, + "grad_norm": 0.28464367985725403, + "learning_rate": 7.903662171753017e-05, + "loss": 0.012949483096599579, + "step": 147750 + }, + { + "epoch": 20.97374024130589, + "grad_norm": 1.2863636016845703, + "learning_rate": 7.903520227111428e-05, + "loss": 0.07239128351211548, + "step": 147760 + }, + { + "epoch": 20.97515968772179, + "grad_norm": 10.511181831359863, + "learning_rate": 7.903378282469838e-05, + "loss": 0.025117868185043336, + 
"step": 147770 + }, + { + "epoch": 20.976579134137687, + "grad_norm": 1.9466150999069214, + "learning_rate": 7.903236337828247e-05, + "loss": 0.03874390423297882, + "step": 147780 + }, + { + "epoch": 20.977998580553585, + "grad_norm": 0.046162527054548264, + "learning_rate": 7.903094393186657e-05, + "loss": 0.01782917380332947, + "step": 147790 + }, + { + "epoch": 20.979418026969483, + "grad_norm": 1.2228208780288696, + "learning_rate": 7.902952448545068e-05, + "loss": 0.0012259628623723985, + "step": 147800 + }, + { + "epoch": 20.980837473385378, + "grad_norm": 0.046159371733665466, + "learning_rate": 7.902810503903478e-05, + "loss": 0.017555412650108338, + "step": 147810 + }, + { + "epoch": 20.982256919801276, + "grad_norm": 1.7239919900894165, + "learning_rate": 7.902668559261889e-05, + "loss": 0.03226572871208191, + "step": 147820 + }, + { + "epoch": 20.983676366217175, + "grad_norm": 0.0749114602804184, + "learning_rate": 7.902526614620299e-05, + "loss": 0.015035668015480041, + "step": 147830 + }, + { + "epoch": 20.985095812633073, + "grad_norm": 0.026181036606431007, + "learning_rate": 7.902384669978708e-05, + "loss": 0.036445245146751404, + "step": 147840 + }, + { + "epoch": 20.98651525904897, + "grad_norm": 0.051933784037828445, + "learning_rate": 7.90224272533712e-05, + "loss": 0.02933332324028015, + "step": 147850 + }, + { + "epoch": 20.98793470546487, + "grad_norm": 0.31368857622146606, + "learning_rate": 7.902100780695529e-05, + "loss": 0.006153375655412674, + "step": 147860 + }, + { + "epoch": 20.989354151880768, + "grad_norm": 0.5830459594726562, + "learning_rate": 7.90195883605394e-05, + "loss": 0.03201732337474823, + "step": 147870 + }, + { + "epoch": 20.990773598296663, + "grad_norm": 0.06816734373569489, + "learning_rate": 7.901816891412349e-05, + "loss": 0.04411167204380036, + "step": 147880 + }, + { + "epoch": 20.99219304471256, + "grad_norm": 0.040966372936964035, + "learning_rate": 7.90167494677076e-05, + "loss": 0.021518656611442567, + 
"step": 147890 + }, + { + "epoch": 20.99361249112846, + "grad_norm": 0.6190553307533264, + "learning_rate": 7.90153300212917e-05, + "loss": 0.04294320344924927, + "step": 147900 + }, + { + "epoch": 20.995031937544358, + "grad_norm": 2.001371145248413, + "learning_rate": 7.90139105748758e-05, + "loss": 0.03126533329486847, + "step": 147910 + }, + { + "epoch": 20.996451383960256, + "grad_norm": 0.9343059659004211, + "learning_rate": 7.90124911284599e-05, + "loss": 0.010781645774841309, + "step": 147920 + }, + { + "epoch": 20.997870830376154, + "grad_norm": 0.01081905048340559, + "learning_rate": 7.9011071682044e-05, + "loss": 0.0056711096316576, + "step": 147930 + }, + { + "epoch": 20.999290276792053, + "grad_norm": 12.201112747192383, + "learning_rate": 7.900965223562811e-05, + "loss": 0.014659737050533295, + "step": 147940 + }, + { + "epoch": 21.000709723207947, + "grad_norm": 0.039594896137714386, + "learning_rate": 7.900823278921221e-05, + "loss": 0.03759959638118744, + "step": 147950 + }, + { + "epoch": 21.002129169623846, + "grad_norm": 0.10926660895347595, + "learning_rate": 7.900681334279632e-05, + "loss": 0.032229763269424436, + "step": 147960 + }, + { + "epoch": 21.003548616039744, + "grad_norm": 0.014060246758162975, + "learning_rate": 7.900539389638042e-05, + "loss": 0.009088961035013199, + "step": 147970 + }, + { + "epoch": 21.004968062455642, + "grad_norm": 9.610164642333984, + "learning_rate": 7.900397444996452e-05, + "loss": 0.020833241939544677, + "step": 147980 + }, + { + "epoch": 21.00638750887154, + "grad_norm": 0.2901909351348877, + "learning_rate": 7.900255500354861e-05, + "loss": 0.013191723823547363, + "step": 147990 + }, + { + "epoch": 21.00780695528744, + "grad_norm": 7.599514961242676, + "learning_rate": 7.900113555713272e-05, + "loss": 0.02051192969083786, + "step": 148000 + }, + { + "epoch": 21.00780695528744, + "eval_accuracy": 0.9854390538564253, + "eval_loss": 0.05769599974155426, + "eval_runtime": 34.1016, + "eval_samples_per_second": 
461.18, + "eval_steps_per_second": 14.427, + "step": 148000 + }, + { + "epoch": 21.009226401703337, + "grad_norm": 0.030993621796369553, + "learning_rate": 7.899971611071682e-05, + "loss": 0.015471708774566651, + "step": 148010 + }, + { + "epoch": 21.010645848119232, + "grad_norm": 0.24977083504199982, + "learning_rate": 7.899829666430093e-05, + "loss": 0.005109232664108276, + "step": 148020 + }, + { + "epoch": 21.01206529453513, + "grad_norm": 0.01610984094440937, + "learning_rate": 7.899687721788503e-05, + "loss": 0.015183395147323609, + "step": 148030 + }, + { + "epoch": 21.01348474095103, + "grad_norm": 0.02092795819044113, + "learning_rate": 7.899545777146913e-05, + "loss": 0.0008370682597160339, + "step": 148040 + }, + { + "epoch": 21.014904187366927, + "grad_norm": 0.11478916555643082, + "learning_rate": 7.899403832505324e-05, + "loss": 0.03654801547527313, + "step": 148050 + }, + { + "epoch": 21.016323633782825, + "grad_norm": 0.007695644628256559, + "learning_rate": 7.899261887863733e-05, + "loss": 0.006345228850841522, + "step": 148060 + }, + { + "epoch": 21.017743080198724, + "grad_norm": 0.05666486918926239, + "learning_rate": 7.899119943222145e-05, + "loss": 0.005860751867294312, + "step": 148070 + }, + { + "epoch": 21.019162526614622, + "grad_norm": 1.7690719366073608, + "learning_rate": 7.898977998580553e-05, + "loss": 0.03130360245704651, + "step": 148080 + }, + { + "epoch": 21.020581973030517, + "grad_norm": 0.11971930414438248, + "learning_rate": 7.898836053938964e-05, + "loss": 0.0030382439494132996, + "step": 148090 + }, + { + "epoch": 21.022001419446415, + "grad_norm": 0.17994071543216705, + "learning_rate": 7.898694109297374e-05, + "loss": 0.033177369832992555, + "step": 148100 + }, + { + "epoch": 21.023420865862313, + "grad_norm": 0.09168960154056549, + "learning_rate": 7.898552164655785e-05, + "loss": 0.006556940078735351, + "step": 148110 + }, + { + "epoch": 21.02484031227821, + "grad_norm": 3.8017678260803223, + "learning_rate": 
7.898410220014195e-05, + "loss": 0.003216473013162613, + "step": 148120 + }, + { + "epoch": 21.02625975869411, + "grad_norm": 0.021076340228319168, + "learning_rate": 7.898268275372606e-05, + "loss": 0.022803887724876404, + "step": 148130 + }, + { + "epoch": 21.027679205110008, + "grad_norm": 3.38307785987854, + "learning_rate": 7.898126330731015e-05, + "loss": 0.013693174719810486, + "step": 148140 + }, + { + "epoch": 21.029098651525906, + "grad_norm": 0.21132506430149078, + "learning_rate": 7.897984386089425e-05, + "loss": 0.006197235733270645, + "step": 148150 + }, + { + "epoch": 21.0305180979418, + "grad_norm": 14.318134307861328, + "learning_rate": 7.897842441447836e-05, + "loss": 0.029898801445961, + "step": 148160 + }, + { + "epoch": 21.0319375443577, + "grad_norm": 0.02023470774292946, + "learning_rate": 7.897700496806246e-05, + "loss": 0.0028037030249834062, + "step": 148170 + }, + { + "epoch": 21.033356990773598, + "grad_norm": 0.1024431511759758, + "learning_rate": 7.897558552164657e-05, + "loss": 0.01755046546459198, + "step": 148180 + }, + { + "epoch": 21.034776437189496, + "grad_norm": 0.012744315899908543, + "learning_rate": 7.897416607523065e-05, + "loss": 0.0270268052816391, + "step": 148190 + }, + { + "epoch": 21.036195883605394, + "grad_norm": 0.2124946266412735, + "learning_rate": 7.897274662881477e-05, + "loss": 0.00761546641588211, + "step": 148200 + }, + { + "epoch": 21.037615330021293, + "grad_norm": 0.8210309743881226, + "learning_rate": 7.897132718239886e-05, + "loss": 0.023723949491977692, + "step": 148210 + }, + { + "epoch": 21.03903477643719, + "grad_norm": 0.03501635789871216, + "learning_rate": 7.896990773598297e-05, + "loss": 0.002953563630580902, + "step": 148220 + }, + { + "epoch": 21.040454222853086, + "grad_norm": 4.635706901550293, + "learning_rate": 7.896848828956707e-05, + "loss": 0.0172238752245903, + "step": 148230 + }, + { + "epoch": 21.041873669268984, + "grad_norm": 9.512527465820312, + "learning_rate": 
7.896706884315117e-05, + "loss": 0.011320335417985916, + "step": 148240 + }, + { + "epoch": 21.043293115684882, + "grad_norm": 0.4590822458267212, + "learning_rate": 7.896564939673528e-05, + "loss": 0.03066089451313019, + "step": 148250 + }, + { + "epoch": 21.04471256210078, + "grad_norm": 4.4357500076293945, + "learning_rate": 7.896422995031938e-05, + "loss": 0.03336559534072876, + "step": 148260 + }, + { + "epoch": 21.04613200851668, + "grad_norm": 0.4467734396457672, + "learning_rate": 7.896281050390349e-05, + "loss": 0.07908724546432495, + "step": 148270 + }, + { + "epoch": 21.047551454932577, + "grad_norm": 0.4528430104255676, + "learning_rate": 7.896139105748759e-05, + "loss": 0.020352552831172942, + "step": 148280 + }, + { + "epoch": 21.048970901348476, + "grad_norm": 0.019472533836960793, + "learning_rate": 7.895997161107168e-05, + "loss": 0.014004814624786376, + "step": 148290 + }, + { + "epoch": 21.05039034776437, + "grad_norm": 0.07784029096364975, + "learning_rate": 7.895855216465578e-05, + "loss": 0.020544235408306123, + "step": 148300 + }, + { + "epoch": 21.05180979418027, + "grad_norm": 13.455442428588867, + "learning_rate": 7.895713271823989e-05, + "loss": 0.05464913845062256, + "step": 148310 + }, + { + "epoch": 21.053229240596167, + "grad_norm": 0.027525078505277634, + "learning_rate": 7.895571327182399e-05, + "loss": 0.0052706852555274965, + "step": 148320 + }, + { + "epoch": 21.054648687012065, + "grad_norm": 4.2665252685546875, + "learning_rate": 7.89542938254081e-05, + "loss": 0.014684043824672699, + "step": 148330 + }, + { + "epoch": 21.056068133427964, + "grad_norm": 0.350553035736084, + "learning_rate": 7.89528743789922e-05, + "loss": 0.002646828070282936, + "step": 148340 + }, + { + "epoch": 21.057487579843862, + "grad_norm": 0.02084886096417904, + "learning_rate": 7.89514549325763e-05, + "loss": 0.01621060222387314, + "step": 148350 + }, + { + "epoch": 21.05890702625976, + "grad_norm": 0.3889669179916382, + "learning_rate": 
7.89500354861604e-05, + "loss": 0.006524135917425155, + "step": 148360 + }, + { + "epoch": 21.060326472675655, + "grad_norm": 5.557522296905518, + "learning_rate": 7.89486160397445e-05, + "loss": 0.011207732558250427, + "step": 148370 + }, + { + "epoch": 21.061745919091553, + "grad_norm": 0.02720538340508938, + "learning_rate": 7.894719659332861e-05, + "loss": 0.009898757189512252, + "step": 148380 + }, + { + "epoch": 21.06316536550745, + "grad_norm": 0.6669163703918457, + "learning_rate": 7.89457771469127e-05, + "loss": 0.0027324333786964417, + "step": 148390 + }, + { + "epoch": 21.06458481192335, + "grad_norm": 5.931302547454834, + "learning_rate": 7.894435770049681e-05, + "loss": 0.026851081848144533, + "step": 148400 + }, + { + "epoch": 21.066004258339248, + "grad_norm": 0.9109027981758118, + "learning_rate": 7.89429382540809e-05, + "loss": 0.049683880805969236, + "step": 148410 + }, + { + "epoch": 21.067423704755146, + "grad_norm": 0.2809244692325592, + "learning_rate": 7.894151880766502e-05, + "loss": 0.01648867130279541, + "step": 148420 + }, + { + "epoch": 21.068843151171045, + "grad_norm": 0.09820913523435593, + "learning_rate": 7.894009936124911e-05, + "loss": 0.008877874910831451, + "step": 148430 + }, + { + "epoch": 21.07026259758694, + "grad_norm": 0.11730563640594482, + "learning_rate": 7.893867991483321e-05, + "loss": 0.010576790571212769, + "step": 148440 + }, + { + "epoch": 21.071682044002838, + "grad_norm": 0.02079988457262516, + "learning_rate": 7.893726046841732e-05, + "loss": 0.012141837924718856, + "step": 148450 + }, + { + "epoch": 21.073101490418736, + "grad_norm": 0.641864538192749, + "learning_rate": 7.893584102200142e-05, + "loss": 0.007063349336385727, + "step": 148460 + }, + { + "epoch": 21.074520936834634, + "grad_norm": 0.0663255825638771, + "learning_rate": 7.893442157558553e-05, + "loss": 0.013601230084896087, + "step": 148470 + }, + { + "epoch": 21.075940383250533, + "grad_norm": 0.2728342115879059, + "learning_rate": 
7.893300212916963e-05, + "loss": 0.002510496228933334, + "step": 148480 + }, + { + "epoch": 21.07735982966643, + "grad_norm": 0.0070982640609145164, + "learning_rate": 7.893158268275374e-05, + "loss": 0.049581718444824216, + "step": 148490 + }, + { + "epoch": 21.07877927608233, + "grad_norm": 0.045075900852680206, + "learning_rate": 7.893016323633782e-05, + "loss": 0.032151824235916136, + "step": 148500 + }, + { + "epoch": 21.07877927608233, + "eval_accuracy": 0.9852482991034527, + "eval_loss": 0.054044269025325775, + "eval_runtime": 32.9291, + "eval_samples_per_second": 477.601, + "eval_steps_per_second": 14.941, + "step": 148500 + } + ], + "logging_steps": 10, + "max_steps": 704500, + "num_input_tokens_seen": 0, + "num_train_epochs": 100, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}